# Baseline Model - Random Forest with kinematic features only

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import time
from infostop import Infostop
import pyproj
import sklearn
import pickle
import torch
sys.path.append('./src')
from data_utils import *
clear_output(wait=False)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### Data cleansing - Cut-off parameters

In [3]:
seq_cutoff_speed = 45
seq_cutoff_time = 60
filter_seq = 5

### Setup machine learning baselines
https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

### Set Optimal parameters for Random Forest Classfier

In [4]:
names = [
    "Random Forest"
    #, "Decision Tree"
    #, "Nearest Neighbors"
    #, "Neural Net", "AdaBoost"
    #, "Naive Bayes"
]

classifiers = [
    RandomForestClassifier(n_estimators = 500
                           , criterion = 'gini'
                           , class_weight='balanced'
                           , max_depth= 8
                           , max_features= 'auto'
                           , random_state=0
                           , n_jobs=-1
                           , oob_score=True
                           , verbose=1)
    #, DecisionTreeClassifier(max_depth=5, class_weight="balanced")
    #, KNeighborsClassifier(3)
    #, MLPClassifier(alpha=1, max_iter=1000)
    #, AdaBoostClassifier()
    #, GaussianNB()
]

### Load same training and test set of the Artificial Neural Networks¶

In [5]:
class TensorDataset(torch.utils.data.Dataset):

        def __init__(self, df, filter_seq=filter_seq):
            self.seq = np.stack([np.roll(df[['delta_d', 'bearing']].values, i, axis = 0) for i in range(filter_seq, -1, -1)], axis = 1)
            self.seq = self.seq[df['segment_ix'] >= filter_seq]

            self.labels = df[df['segment_ix'] >= filter_seq]['label'].values        
            self.user_id = df[df['segment_ix'] >= filter_seq]['user'].values
            tod = df[df['segment_ix'] >= filter_seq]['tod'].values
            self.tod_one_hot = np.eye(5)[tod]

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, key):
            return self.seq[key], self.tod_one_hot[key], self.labels[key]

### Shuffle the sequence of users¶

In [6]:
classification_test_performance = []
training_time = []
prediction_time = []
number_list=np.arange(0,12).tolist()
random.Random(17).shuffle(number_list)
print(number_list)
#number_list = [10, 4, 6, 8, 1, 9, 3, 7, 5, 0, 2, 11] #already randomized

[10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8]


### Perform the 12-fold cross validation, rotating each user at the test set, while the others are on the training set

In [7]:
for k in range(0,12):
    # Train-test split
    print(f'Train partition: {number_list[0:k]+number_list[k+1:]}')
    print(f'Test partition: {number_list[k]}')
    user_train = train = number_list[0:k]+number_list[k+1:]
    user_test = test = [number_list[k]]
    
    
    # Collect features of each user
    print(f'Fold {k}, Train {train}, Test {test}')
    data_train = pd.concat([create_data_frame(*load_user_data(user,load_web_mercator = True, load_GPS = True), segmentation=True, seq_cutoff_time = seq_cutoff_time, seq_cutoff_speed = seq_cutoff_speed) for user in user_train]).reset_index(drop=True)
    data_test = pd.concat([create_data_frame(*load_user_data(user,load_web_mercator = True, load_GPS = True), segmentation=True, seq_cutoff_time = seq_cutoff_time, seq_cutoff_speed = seq_cutoff_speed) for user in user_test]).reset_index(drop=True)
    data_train = data_train[data_train['segment_ix'] >= filter_seq]
    data_test = data_test[data_test['segment_ix'] >= filter_seq]
    
    #Prepare the sequences of features for training set
    TimeSeries = TensorDataset(pd.concat([data_train]).reset_index(drop=True))
    XY_tr = np.array([np.concatenate(((TS[0]).reshape(-1),TS[1],TS[2].reshape(-1)), axis=0) for TS in TimeSeries]).reshape(-1,18)
    
    #Shuffle training set to ensure a iid when training
    np.random.shuffle(XY_tr)
    X_tr = XY_tr[:,0:XY_tr.shape[1]-1]
    X_tr = StandardScaler().fit_transform(X_tr)
    Y_tr = XY_tr[:,-1]
    
    #Prepare the sequences of features for test set
    TimeSeries = TensorDataset(pd.concat([data_test]).reset_index(drop=True))
    X_te = np.array([np.concatenate(((TS[0]).reshape(-1),TS[1]), axis=0) for TS in TimeSeries]).reshape(-1,17)
    X_te = StandardScaler().fit_transform(X_te)
    Y_te = TimeSeries[:][2]
    
    
    # iterate over classifiers (only RF in this case)
    for name, clf in zip(names, classifiers):
        startTraining = time.time()
        clf.fit(X_tr, Y_tr)
        endTraining = time.time()
        Y_pred = clf.predict(X_te)
        endPrediction = time.time()
        print('REPORT: '+name)
        cr=classification_report(Y_te, Y_pred, target_names=['Motion','Stop'],output_dict=True)
        classification_test_performance.append(cr)
        training_time.append(endTraining-startTraining)
        prediction_time.append(endPrediction-endTraining)
        print(f'Training lasted {training_time[-1]}')
        print(f'Prediction lasted {prediction_time[-1]}')
        print(classification_report(Y_te, Y_pred, target_names=['Motion','Stop'],output_dict=False))
    

F1_macro_AVG = []
for cr in classification_test_performance:
    F1_macro_AVG.append(cr['macro avg']['f1-score'])
print(f'MEAN = {np.mean(F1_macro_AVG)}, STDEV = {np.std(F1_macro_AVG)}')

Train partition: [3, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8]
Test partition: 10
Fold 0, Train [3, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8], Test [10]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.5min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.2s finished


REPORT: Random Forest
Training lasted 209.87211418151855
Prediction lasted 0.2108592987060547
              precision    recall  f1-score   support

      Motion       0.90      0.52      0.66      1150
        Stop       0.41      0.86      0.56       451

    accuracy                           0.61      1601
   macro avg       0.66      0.69      0.61      1601
weighted avg       0.76      0.61      0.63      1601

Train partition: [10, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8]
Test partition: 3
Fold 1, Train [10, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8], Test [3]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.2min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.3s finished


REPORT: Random Forest
Training lasted 187.33097863197327
Prediction lasted 0.3110077381134033
              precision    recall  f1-score   support

      Motion       0.88      0.68      0.77     21110
        Stop       0.89      0.97      0.93     58167

    accuracy                           0.89     79277
   macro avg       0.89      0.82      0.85     79277
weighted avg       0.89      0.89      0.88     79277

Train partition: [10, 3, 2, 7, 11, 1, 9, 5, 4, 6, 8]
Test partition: 0
Fold 2, Train [10, 3, 2, 7, 11, 1, 9, 5, 4, 6, 8], Test [0]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   41.7s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.2min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.3s finished


REPORT: Random Forest
Training lasted 185.9017276763916
Prediction lasted 0.31560611724853516
              precision    recall  f1-score   support

      Motion       0.91      0.72      0.81     20196
        Stop       0.92      0.98      0.95     64268

    accuracy                           0.92     84464
   macro avg       0.92      0.85      0.88     84464
weighted avg       0.92      0.92      0.91     84464

Train partition: [10, 3, 0, 7, 11, 1, 9, 5, 4, 6, 8]
Test partition: 2
Fold 3, Train [10, 3, 0, 7, 11, 1, 9, 5, 4, 6, 8], Test [2]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.1min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    1.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    2.1s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    2.5s finished


REPORT: Random Forest
Training lasted 182.9311261177063
Prediction lasted 2.532219409942627
              precision    recall  f1-score   support

      Motion       0.75      0.84      0.80     43225
        Stop       0.90      0.83      0.87     72339

    accuracy                           0.84    115564
   macro avg       0.83      0.84      0.83    115564
weighted avg       0.84      0.84      0.84    115564

Train partition: [10, 3, 0, 2, 11, 1, 9, 5, 4, 6, 8]
Test partition: 7
Fold 4, Train [10, 3, 0, 2, 11, 1, 9, 5, 4, 6, 8], Test [7]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   36.8s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.3min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.2s finished


REPORT: Random Forest
Training lasted 180.24498867988586
Prediction lasted 0.20838427543640137
              precision    recall  f1-score   support

      Motion       0.82      0.31      0.45      7897
        Stop       0.24      0.77      0.37      2295

    accuracy                           0.41     10192
   macro avg       0.53      0.54      0.41     10192
weighted avg       0.69      0.41      0.43     10192

Train partition: [10, 3, 0, 2, 7, 1, 9, 5, 4, 6, 8]
Test partition: 11
Fold 5, Train [10, 3, 0, 2, 7, 1, 9, 5, 4, 6, 8], Test [11]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.8min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.2s finished


REPORT: Random Forest
Training lasted 150.08737659454346
Prediction lasted 0.20920872688293457
              precision    recall  f1-score   support

      Motion       0.89      0.50      0.64     11675
        Stop       0.73      0.96      0.83     16340

    accuracy                           0.77     28015
   macro avg       0.81      0.73      0.73     28015
weighted avg       0.80      0.77      0.75     28015

Train partition: [10, 3, 0, 2, 7, 11, 9, 5, 4, 6, 8]
Test partition: 1
Fold 6, Train [10, 3, 0, 2, 7, 11, 9, 5, 4, 6, 8], Test [1]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   48.1s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.5s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    1.4s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    1.7s finished


REPORT: Random Forest
Training lasted 69.13757634162903
Prediction lasted 1.8385474681854248
              precision    recall  f1-score   support

      Motion       0.91      0.73      0.81    182626
        Stop       0.90      0.97      0.93    462039

    accuracy                           0.90    644665
   macro avg       0.90      0.85      0.87    644665
weighted avg       0.90      0.90      0.90    644665

Train partition: [10, 3, 0, 2, 7, 11, 1, 5, 4, 6, 8]
Test partition: 9
Fold 7, Train [10, 3, 0, 2, 7, 11, 1, 5, 4, 6, 8], Test [9]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.9min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.0s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.1s finished


REPORT: Random Forest
Training lasted 154.49902319908142
Prediction lasted 0.21431183815002441
              precision    recall  f1-score   support

      Motion       0.23      0.07      0.11        42
        Stop       0.34      0.67      0.45        30

    accuracy                           0.32        72
   macro avg       0.28      0.37      0.28        72
weighted avg       0.28      0.32      0.25        72

Train partition: [10, 3, 0, 2, 7, 11, 1, 9, 4, 6, 8]
Test partition: 5
Fold 8, Train [10, 3, 0, 2, 7, 11, 1, 9, 4, 6, 8], Test [5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.2min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.3s finished


REPORT: Random Forest
Training lasted 181.63563179969788
Prediction lasted 0.3323478698730469
              precision    recall  f1-score   support

      Motion       0.71      0.64      0.67      4259
        Stop       0.89      0.92      0.90     13069

    accuracy                           0.85     17328
   macro avg       0.80      0.78      0.79     17328
weighted avg       0.84      0.85      0.84     17328

Train partition: [10, 3, 0, 2, 7, 11, 1, 9, 5, 6, 8]
Test partition: 4
Fold 9, Train [10, 3, 0, 2, 7, 11, 1, 9, 5, 6, 8], Test [4]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.7min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.4s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    1.2s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    1.6s finished


REPORT: Random Forest
Training lasted 144.2327880859375
Prediction lasted 1.6469242572784424
              precision    recall  f1-score   support

      Motion       0.63      0.85      0.72     77537
        Stop       0.96      0.88      0.92    340189

    accuracy                           0.88    417726
   macro avg       0.79      0.87      0.82    417726
weighted avg       0.90      0.88      0.88    417726

Train partition: [10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 8]
Test partition: 6
Fold 10, Train [10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 8], Test [6]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.3min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.3s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.3s finished


REPORT: Random Forest
Training lasted 199.61279678344727
Prediction lasted 0.33044886589050293
              precision    recall  f1-score   support

      Motion       0.90      0.61      0.73     12477
        Stop       0.19      0.55      0.28      1983

    accuracy                           0.60     14460
   macro avg       0.54      0.58      0.50     14460
weighted avg       0.80      0.60      0.67     14460

Train partition: [10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 6]
Test partition: 8
Fold 11, Train [10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 6], Test [8]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.3min finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    0.1s


REPORT: Random Forest
Training lasted 195.97861075401306
Prediction lasted 0.32500290870666504
              precision    recall  f1-score   support

      Motion       0.96      0.41      0.58     12727
        Stop       0.21      0.90      0.34      2211

    accuracy                           0.48     14938
   macro avg       0.59      0.66      0.46     14938
weighted avg       0.85      0.48      0.54     14938

MEAN = 0.6683957612182679, STDEV = 0.19935398504694668


[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    0.2s
[Parallel(n_jobs=32)]: Done 500 out of 500 | elapsed:    0.3s finished


In [8]:
np.save('classification_test_performanceRF12fold.npy',classification_test_performance)
F1_macro_AVG = []
F1_weighted_AVG = []
for cr in classification_test_performance:
    F1_macro_AVG.append(cr['macro avg']['f1-score'])
    F1_weighted_AVG.append(cr['weighted avg']['f1-score'])
supports = [cf['macro avg']['support'] for cf in classification_test_performance]
print(f'MEAN = {np.mean(F1_macro_AVG)}, WEIGHTED AVG = {np.sum([F1_macro_AVG[i]*supports[i] for i in range(0,len(F1_macro_AVG))])/(np.sum(supports))}, STDEV = {np.std(F1_macro_AVG)}')
print(f'MEAN = {np.mean(F1_weighted_AVG)}, WEIGHTED AVG = {np.sum([F1_weighted_AVG[i]*supports[i] for i in range(0,len(F1_weighted_AVG))])/(np.sum(supports))}, STDEV = {np.std(F1_weighted_AVG)}')

MEAN = 0.6683957612182679, WEIGHTED AVG = 0.8360022577493109, STDEV = 0.19935398504694668
MEAN = 0.7108795148065797, WEIGHTED AVG = 0.8757207685679421, STDEV = 0.2038428688062497


In [9]:
results = [i for i in zip(F1_macro_AVG, F1_weighted_AVG, supports)]

In [10]:
np.save('resRF12fold.npy',results)

In [11]:
[i for i in F1_macro_AVG]

[0.606728660279999,
 0.8473783107634704,
 0.876956702969576,
 0.8302161621011173,
 0.41003138494548014,
 0.7322218799790635,
 0.8695320918977578,
 0.27926455566905006,
 0.7864096590271031,
 0.8208406078895909,
 0.5025549258519956,
 0.45861419324501196]

In [12]:
np.sum(training_time)

2041.4647388458252

In [13]:
np.sum(prediction_time)

8.474868774414062