# Baseline Model

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import time
from infostop import Infostop
import pyproj
import sklearn
import pickle
import torch
sys.path.append('./src')
from data_utils import *
clear_output(wait=False)
import random

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### Data clearnsing parameters

In [3]:
seq_cutoff_speed = 45
seq_cutoff_time = 60
filter_seq = 5

### Parameters to search for best performance of the random forest classifier

In [4]:
param_grid = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
rfc=RandomForestClassifier(random_state=42)

### Setup machine learning baselines
https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

### Load training and test set which are rearranged in order to present a segments of 5 points with, distance between points, bearing rate and dummy variables for the geo-spatial context

In [5]:
class TensorDataset(torch.utils.data.Dataset):

        def __init__(self, df, filter_seq=filter_seq):
            self.seq = np.stack([np.roll(df[['delta_d', 'bearing', 'f_highway_motorway','f_traffic_signals','f_bus_stops','f_landuse_meadow','f_landuse_residential','f_landuse_industrial','f_landuse_commercial','f_shop','f_railways','f_railways_station','f_subway']].values, i, axis = 0) for i in range(filter_seq, -1, -1)], axis = 1)
            self.seq = self.seq[df['segment_ix'] >= filter_seq]

            self.labels = df[df['segment_ix'] >= filter_seq]['label'].values        
            self.user_id = df[df['segment_ix'] >= filter_seq]['user'].values
            tod = df[df['segment_ix'] >= filter_seq]['tod'].values
            self.tod_one_hot = np.eye(5)[tod]

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, key):
            return self.seq[key], self.tod_one_hot[key], self.labels[key]

### Collect train and test split

In [6]:
user_train, user_val, user_test = train, val, test = train_test_data_split()
user_train = user_train+user_val
train=train+val

data_train = pd.concat([create_data_frame(*load_user_data(user
                                                          ,load_web_mercator = True
                                                          , load_GPS = True
                                                          , load_Dummies=True)
                                          , segmentation=True
                                          , seq_cutoff_time = seq_cutoff_time
                                          , seq_cutoff_speed = seq_cutoff_speed) for user in user_train]).reset_index(drop=True)

data_test = pd.concat([create_data_frame(*load_user_data(user
                                                         ,load_web_mercator = True
                                                         , load_GPS = True
                                                         , load_Dummies=True)
                                         , segmentation=True
                                         , seq_cutoff_time = seq_cutoff_time
                                         , seq_cutoff_speed = seq_cutoff_speed) for user in user_test]).reset_index(drop=True)

data_train = data_train[data_train['segment_ix'] >= filter_seq]
data_test = data_test[data_test['segment_ix'] >= filter_seq]

### Create train set

In [7]:
TimeSeries = TensorDataset(pd.concat([data_train]).reset_index(drop=True))
XY_tr = np.array([np.concatenate(((TS[0]).reshape(-1),TS[1],TS[2].reshape(-1)), axis=0) for TS in TimeSeries]).reshape(-1,84)
np.random.shuffle(XY_tr)

In [8]:
X_tr = XY_tr[:,0:XY_tr.shape[1]-1]
#X_tr = StandardScaler().fit_transform(X_tr)
Y_tr = XY_tr[:,-1]

### Start grid search

In [9]:
startGridSearch = time.time()
#https://www.kaggle.com/sociopath00/random-forest-using-gridsearchcv
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, n_jobs=-1, verbose=1)
CV_rfc.fit(X_tr, Y_tr)
endGridSearch = time.time()

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 74.5min
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed: 207.0min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed: 248.6min finished


### Check best performance of various parameters configurations

In [10]:
print("Best parameters set found on development set:")
print()
print(CV_rfc.best_params_)
print()
print("Grid scores on development set:")
print()
means = CV_rfc.cv_results_['mean_test_score']
stds = CV_rfc.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, CV_rfc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 100}

Grid scores on development set:

0.898 (+/-0.001) for {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 100}
0.899 (+/-0.001) for {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
0.898 (+/-0.001) for {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 500}
0.898 (+/-0.001) for {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 100}
0.899 (+/-0.001) for {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 200}
0.898 (+/-0.001) for {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 500}
0.898 (+/-0.001) for {'criterion': 'gini', 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 100}
0.898 (+/-0.001) for {'criterion': 'gini', 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 200}
0

### Estimate performance on test-set

In [14]:
TimeSeries = TensorDataset(pd.concat([data_test]).reset_index(drop=True))
X_te = np.array([np.concatenate(((TS[0]).reshape(-1),TS[1]), axis=0) for TS in TimeSeries]).reshape(-1,83)
#X_te = StandardScaler().fit_transform(X_te)
Y_te = TimeSeries[:][2]

startPrediction = time.time()
y_true, y_pred = Y_te, CV_rfc.predict(X_te)
endPrediction = time.time()

cr=classification_report(Y_te, y_pred, target_names=['Motion','Stop'],output_dict=True)

print(classification_report(Y_te, y_pred, target_names=['Motion','Stop']))

#classification_test_performance.append(cr)
#print(f'Gridsearch and Training lasted {endGridSearch-startGridSearch}')
#print(f'Prediction lasted {endPrediction-startPrediction}')
    

#F1_macro_AVG = []
#for cr in classification_test_performance:
#    F1_macro_AVG.append(cr['macro avg']['f1-score'])
#print(f'MEAN = {np.mean(F1_macro_AVG)}, STDEV = {np.std(F1_macro_AVG)}')

              precision    recall  f1-score   support

      Motion       0.80      0.79      0.80     42456
        Stop       0.93      0.93      0.93    122886

    accuracy                           0.90    165342
   macro avg       0.86      0.86      0.86    165342
weighted avg       0.90      0.90      0.90    165342



In [16]:
print(endGridSearch-startGridSearch)
print(endPrediction-startPrediction)

15171.945814847946
2.8301570415496826
