# Baseline Model - Random Forest with geo-spatial dummy variables and kinematic features

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import time
from infostop import Infostop
import pyproj
import sklearn
import pickle
import torch
sys.path.append('./src')
from data_utils import *
clear_output(wait=False)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### Data cleansing - Cut-off parameters

In [3]:
seq_cutoff_speed = 45
seq_cutoff_time = 60
filter_seq = 5

### Setup machine learning baselines
https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [4]:
names = [
    "Random Forest"
    #, "Decision Tree"
    #, "Nearest Neighbors"
    #, "Neural Net", "AdaBoost"
    #, "Naive Bayes"
]

classifiers = [
    RandomForestClassifier(n_estimators=100
                           , criterion = 'entropy'
                           , class_weight='balanced'
                           , max_depth= 8
                           , max_features= 'auto'
                           , random_state=0
                           , n_jobs=-1
                           , oob_score=True
                           , verbose=1)
    #, DecisionTreeClassifier(max_depth=5, class_weight="balanced")
    #, KNeighborsClassifier(3)
    #, MLPClassifier(alpha=1, max_iter=1000)
    #, AdaBoostClassifier()
    #, GaussianNB()
]

### Load same training and test set of the Artificial Neural Networks

In [5]:
class TensorDataset(torch.utils.data.Dataset):

        def __init__(self, df, filter_seq=filter_seq):
            self.seq = np.stack([np.roll(df[['delta_d', 'bearing', 'f_highway_motorway','f_traffic_signals','f_bus_stops','f_landuse_meadow','f_landuse_residential','f_landuse_industrial','f_landuse_commercial','f_shop','f_railways','f_railways_station','f_subway']].values, i, axis = 0) for i in range(filter_seq, -1, -1)], axis = 1)
            self.seq = self.seq[df['segment_ix'] >= filter_seq]

            self.labels = df[df['segment_ix'] >= filter_seq]['label'].values        
            self.user_id = df[df['segment_ix'] >= filter_seq]['user'].values
            tod = df[df['segment_ix'] >= filter_seq]['tod'].values
            self.tod_one_hot = np.eye(5)[tod]

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, key):
            return self.seq[key], self.tod_one_hot[key], self.labels[key]

### Shuffle the sequence of users

In [6]:
classification_test_performance = []
training_time = []
prediction_time = []
number_list=np.arange(0,12).tolist()
random.Random(17).shuffle(number_list)
print(number_list)
#number_list = [10, 4, 6, 8, 1, 9, 3, 7, 5, 0, 2, 11] #already randomized

[10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8]


### Perform the 12-fold cross validation, rotating each user at the test set, while the others are on the training set

In [7]:
for k in range(0,12):
    # Train-test split
    print(f'Train partition: {number_list[0:k]+number_list[k+1:]}')
    print(f'Test partition: {number_list[k]}')
    user_train = train = number_list[0:k]+number_list[k+1:]
    user_test = test = [number_list[k]]
    
    # Collect features of each user
    print(f'Fold {k}, Train {train}, Test {test}')
    data_train = pd.concat([create_data_frame(*load_user_data(user,load_web_mercator = True, load_GPS = True, load_Dummies=True), segmentation=True, seq_cutoff_time = seq_cutoff_time, seq_cutoff_speed = seq_cutoff_speed) for user in user_train]).reset_index(drop=True)
    data_test = pd.concat([create_data_frame(*load_user_data(user,load_web_mercator = True, load_GPS = True, load_Dummies=True), segmentation=True, seq_cutoff_time = seq_cutoff_time, seq_cutoff_speed = seq_cutoff_speed) for user in user_test]).reset_index(drop=True)
    data_train = data_train[data_train['segment_ix'] >= filter_seq]
    data_test = data_test[data_test['segment_ix'] >= filter_seq]
    
    #Prepare the sequences of features for training set
    TimeSeries = TensorDataset(pd.concat([data_train]).reset_index(drop=True))
    XY_tr = np.array([np.concatenate(((TS[0]).reshape(-1),TS[1],TS[2].reshape(-1)), axis=0) for TS in TimeSeries]).reshape(-1,84)
    
    #Shuffle training set to ensure a iid when training
    np.random.shuffle(XY_tr)
    X_tr = XY_tr[:,0:XY_tr.shape[1]-1]
    X_tr = StandardScaler().fit_transform(X_tr)
    Y_tr = XY_tr[:,-1]
    
    #Prepare the sequences of features for test set
    TimeSeries = TensorDataset(pd.concat([data_test]).reset_index(drop=True))
    X_te = np.array([np.concatenate(((TS[0]).reshape(-1),TS[1]), axis=0) for TS in TimeSeries]).reshape(-1,83)
    X_te = StandardScaler().fit_transform(X_te)
    Y_te = TimeSeries[:][2]
    
    
    # iterate over classifiers (only RF in this case)
    for name, clf in zip(names, classifiers):
        startTraining = time.time()
        clf.fit(X_tr, Y_tr)
        endTraining = time.time()
        Y_pred = clf.predict(X_te)
        endPrediction = time.time()
        print('REPORT: '+name)
        cr=classification_report(Y_te, Y_pred, target_names=['Motion','Stop'],output_dict=True)
        classification_test_performance.append(cr)
        training_time.append(endTraining-startTraining)
        prediction_time.append(endPrediction-endTraining)
        print(f'Training lasted {training_time[-1]}')
        print(f'Prediction lasted {prediction_time[-1]}')
        print(classification_report(Y_te, Y_pred, target_names=['Motion','Stop'],output_dict=False))
    

F1_macro_AVG = []
for cr in classification_test_performance:
    F1_macro_AVG.append(cr['macro avg']['f1-score'])
print(f'MEAN = {np.mean(F1_macro_AVG)}, STDEV = {np.std(F1_macro_AVG)}')

Train partition: [3, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8]
Test partition: 10
Fold 0, Train [3, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8], Test [10]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   36.2s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.5s finished


REPORT: Random Forest
Training lasted 74.09675312042236
Prediction lasted 0.5803370475769043
              precision    recall  f1-score   support

      Motion       0.96      0.54      0.69      1150
        Stop       0.44      0.94      0.60       451

    accuracy                           0.65      1601
   macro avg       0.70      0.74      0.64      1601
weighted avg       0.81      0.65      0.66      1601

Train partition: [10, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8]
Test partition: 3
Fold 1, Train [10, 0, 2, 7, 11, 1, 9, 5, 4, 6, 8], Test [3]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   44.8s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.2s finished


REPORT: Random Forest
Training lasted 60.3629195690155
Prediction lasted 0.2421116828918457
              precision    recall  f1-score   support

      Motion       0.89      0.68      0.77     21110
        Stop       0.89      0.97      0.93     58167

    accuracy                           0.89     79277
   macro avg       0.89      0.82      0.85     79277
weighted avg       0.89      0.89      0.89     79277

Train partition: [10, 3, 2, 7, 11, 1, 9, 5, 4, 6, 8]
Test partition: 0
Fold 2, Train [10, 3, 2, 7, 11, 1, 9, 5, 4, 6, 8], Test [0]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   24.3s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.1s finished


REPORT: Random Forest
Training lasted 37.411975145339966
Prediction lasted 0.11748170852661133
              precision    recall  f1-score   support

      Motion       0.91      0.73      0.81     20196
        Stop       0.92      0.98      0.95     64268

    accuracy                           0.92     84464
   macro avg       0.91      0.85      0.88     84464
weighted avg       0.92      0.92      0.91     84464

Train partition: [10, 3, 0, 7, 11, 1, 9, 5, 4, 6, 8]
Test partition: 2
Fold 3, Train [10, 3, 0, 7, 11, 1, 9, 5, 4, 6, 8], Test [2]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   44.8s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.8s finished


REPORT: Random Forest
Training lasted 80.90433835983276
Prediction lasted 0.8917820453643799
              precision    recall  f1-score   support

      Motion       0.75      0.85      0.80     43225
        Stop       0.90      0.83      0.87     72339

    accuracy                           0.84    115564
   macro avg       0.83      0.84      0.83    115564
weighted avg       0.85      0.84      0.84    115564

Train partition: [10, 3, 0, 2, 11, 1, 9, 5, 4, 6, 8]
Test partition: 7
Fold 4, Train [10, 3, 0, 2, 11, 1, 9, 5, 4, 6, 8], Test [7]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   26.2s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.0s finished


REPORT: Random Forest
Training lasted 42.226372957229614
Prediction lasted 0.11041069030761719
              precision    recall  f1-score   support

      Motion       0.79      0.31      0.45      7897
        Stop       0.23      0.71      0.35      2295

    accuracy                           0.40     10192
   macro avg       0.51      0.51      0.40     10192
weighted avg       0.66      0.40      0.43     10192

Train partition: [10, 3, 0, 2, 7, 1, 9, 5, 4, 6, 8]
Test partition: 11
Fold 5, Train [10, 3, 0, 2, 7, 1, 9, 5, 4, 6, 8], Test [11]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   33.4s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.6s finished


REPORT: Random Forest
Training lasted 70.74363398551941
Prediction lasted 0.6856362819671631
              precision    recall  f1-score   support

      Motion       0.88      0.52      0.65     11675
        Stop       0.73      0.95      0.83     16340

    accuracy                           0.77     28015
   macro avg       0.81      0.73      0.74     28015
weighted avg       0.79      0.77      0.75     28015

Train partition: [10, 3, 0, 2, 7, 11, 9, 5, 4, 6, 8]
Test partition: 1
Fold 6, Train [10, 3, 0, 2, 7, 11, 9, 5, 4, 6, 8], Test [1]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   23.7s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.6s finished


REPORT: Random Forest
Training lasted 40.880995750427246
Prediction lasted 0.7924036979675293
              precision    recall  f1-score   support

      Motion       0.89      0.76      0.82    182626
        Stop       0.91      0.96      0.94    462039

    accuracy                           0.91    644665
   macro avg       0.90      0.86      0.88    644665
weighted avg       0.90      0.91      0.90    644665

Train partition: [10, 3, 0, 2, 7, 11, 1, 5, 4, 6, 8]
Test partition: 9
Fold 7, Train [10, 3, 0, 2, 7, 11, 1, 5, 4, 6, 8], Test [9]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   26.5s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.0s finished


REPORT: Random Forest
Training lasted 42.52719330787659
Prediction lasted 0.10700416564941406
              precision    recall  f1-score   support

      Motion       0.44      0.19      0.27        42
        Stop       0.37      0.67      0.48        30

    accuracy                           0.39        72
   macro avg       0.41      0.43      0.37        72
weighted avg       0.41      0.39      0.35        72

Train partition: [10, 3, 0, 2, 7, 11, 1, 9, 4, 6, 8]
Test partition: 5
Fold 8, Train [10, 3, 0, 2, 7, 11, 1, 9, 4, 6, 8], Test [5]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   49.0s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.6s finished


REPORT: Random Forest
Training lasted 86.53653955459595
Prediction lasted 0.6803915500640869
              precision    recall  f1-score   support

      Motion       0.70      0.65      0.67      4259
        Stop       0.89      0.91      0.90     13069

    accuracy                           0.85     17328
   macro avg       0.79      0.78      0.79     17328
weighted avg       0.84      0.85      0.84     17328

Train partition: [10, 3, 0, 2, 7, 11, 1, 9, 5, 6, 8]
Test partition: 4
Fold 9, Train [10, 3, 0, 2, 7, 11, 1, 9, 5, 6, 8], Test [4]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   18.8s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.4s finished


REPORT: Random Forest
Training lasted 28.16252851486206
Prediction lasted 0.46804285049438477
              precision    recall  f1-score   support

      Motion       0.62      0.85      0.72     77537
        Stop       0.96      0.88      0.92    340189

    accuracy                           0.87    417726
   macro avg       0.79      0.87      0.82    417726
weighted avg       0.90      0.87      0.88    417726

Train partition: [10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 8]
Test partition: 6
Fold 10, Train [10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 8], Test [6]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   25.5s finished
[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.4s finished


REPORT: Random Forest
Training lasted 52.37347102165222
Prediction lasted 0.41849851608276367
              precision    recall  f1-score   support

      Motion       0.87      0.63      0.73     12477
        Stop       0.15      0.41      0.22      1983

    accuracy                           0.60     14460
   macro avg       0.51      0.52      0.47     14460
weighted avg       0.77      0.60      0.66     14460

Train partition: [10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 6]
Test partition: 8
Fold 11, Train [10, 3, 0, 2, 7, 11, 1, 9, 5, 4, 6], Test [8]


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   49.5s finished


REPORT: Random Forest
Training lasted 80.559321641922
Prediction lasted 0.10925030708312988
              precision    recall  f1-score   support

      Motion       0.95      0.45      0.61     12727
        Stop       0.21      0.86      0.34      2211

    accuracy                           0.51     14938
   macro avg       0.58      0.66      0.47     14938
weighted avg       0.84      0.51      0.57     14938

MEAN = 0.6787218124788866, STDEV = 0.1882611444096775


[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 100 out of 100 | elapsed:    0.0s finished


In [8]:
np.save('classification_test_RF12foldDummies.npy',classification_test_performance)
F1_macro_AVG = []
F1_weighted_AVG = []
for cr in classification_test_performance:
    F1_macro_AVG.append(cr['macro avg']['f1-score'])
    F1_weighted_AVG.append(cr['weighted avg']['f1-score'])
supports = [cf['macro avg']['support'] for cf in classification_test_performance]
print(f'MEAN = {np.mean(F1_macro_AVG)}, WEIGHTED AVG = {np.sum([F1_macro_AVG[i]*supports[i] for i in range(0,len(F1_macro_AVG))])/(np.sum(supports))}, STDEV = {np.std(F1_macro_AVG)}')
#print(f'MEAN = {np.mean(F1_weighted_AVG)}, WEIGHTED AVG = {np.sum([F1_weighted_AVG[i]*supports[i] for i in range(0,len(F1_weighted_AVG))])/(np.sum(supports))}, STDEV = {np.std(F1_weighted_AVG)}')

MEAN = 0.6787218124788866, WEIGHTED AVG = 0.8397143375335308, STDEV = 0.1882611444096775


In [9]:
results = [i for i in zip(F1_macro_AVG, F1_weighted_AVG, supports)]

In [10]:
np.save('resRF12foldDummies.npy',results)

In [11]:
[i for i in F1_macro_AVG]

[0.6431044142597795,
 0.8501833732989095,
 0.8771180308462088,
 0.8329310911612731,
 0.39795151294638786,
 0.7403114450411816,
 0.8785576934047628,
 0.37142857142857144,
 0.7860072461914216,
 0.8183128823335528,
 0.4739980530750306,
 0.47475743575955903]

In [12]:
np.sum(training_time)

696.7860429286957

In [13]:
np.sum(prediction_time)

5.20335054397583