# Baseline Model

In [29]:
%load_ext autoreload
%autoreload 2

import sys
import time
from infostop import Infostop
import pyproj
import sklearn
import pickle
import torch
sys.path.append('./src')
from data_utils import *
clear_output(wait=False)

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### 4-fold Infostop

In [108]:
seq_cutoff_speed = 45
seq_cutoff_time = 60
filter_seq = 5
classification_test_performance = []
for k in range(1,5):
    if k == 1:
        user_train, user_val, user_test = train, val, test = train_test_data_split()
        user_train = user_train+user_val
        train=train+val
    else:
        rotate = user_test
        user_test = test = user_train[0:len(rotate)]
        train = train+rotate
        user_train = train[len(rotate)-12:]
        train = user_train
    data_train = pd.concat([create_data_frame(*load_user_data(user,load_web_mercator = True, load_GPS = True), segmentation=True, seq_cutoff_time = seq_cutoff_time, seq_cutoff_speed = seq_cutoff_speed) for user in user_train]).reset_index(drop=True)
    data_test = pd.concat([create_data_frame(*load_user_data(user,load_web_mercator = True, load_GPS = True), segmentation=True, seq_cutoff_time = seq_cutoff_time, seq_cutoff_speed = seq_cutoff_speed) for user in user_test]).reset_index(drop=True)
    data_train = data_train[data_train['segment_ix'] >= filter_seq]
    data_test = data_test[data_test['segment_ix'] >= filter_seq]
    model = Infostop()
    labels = model.fit_predict(data_test[['lon', 'lat']].values)
    pred = np.zeros_like(labels)
    pred[labels >= 0] = 1
    cr=classification_report(data_test['label'], pred, target_names = ['Motion','Stop'],output_dict=True)
    classification_test_performance.append(cr)
F1_macro_AVG = []
for cr in classification_test_performance:
    F1_macro_AVG.append(cr['macro avg']['f1-score'])
print(f'MEAN = {np.mean(F1_macro_AVG)}, STDEV = {np.std(F1_macro_AVG)}')

MEAN = 0.67053601208359, STDEV = 0.11644495482974151


### Setup machine learning baselines
https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [109]:
names = [
    "Random Forest"
    #, "Decision Tree"
    #, "Nearest Neighbors"
    #, "Neural Net", "AdaBoost"
    #, "Naive Bayes"
]

classifiers = [
    RandomForestClassifier(n_estimators=100, random_state=0, class_weight="balanced")
    #, DecisionTreeClassifier(max_depth=5, class_weight="balanced")
    #, KNeighborsClassifier(3)
    #, MLPClassifier(alpha=1, max_iter=1000)
    #, AdaBoostClassifier()
    #, GaussianNB()
]

### Load training and test set which are rearranged in order to present a segments of 5 points with distance between points and bearing rate

In [110]:
class TensorDataset(torch.utils.data.Dataset):

        def __init__(self, df, filter_seq=filter_seq):
            self.seq = np.stack([np.roll(df[['delta_d', 'bearing']].values, i, axis = 0) for i in range(filter_seq, -1, -1)], axis = 1)
            self.seq = self.seq[df['segment_ix'] >= filter_seq]

            self.labels = df[df['segment_ix'] >= filter_seq]['label'].values        
            self.user_id = df[df['segment_ix'] >= filter_seq]['user'].values
            tod = df[df['segment_ix'] >= filter_seq]['tod'].values
            self.tod_one_hot = np.eye(5)[tod]

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, key):
            return self.seq[key], self.tod_one_hot[key], self.labels[key]

### 4-fold Random Forest

In [122]:
seq_cutoff_speed = 45
seq_cutoff_time = 60
filter_seq = 5
classification_test_performance = []
for k in range(1,5):
    if k == 1:
        user_train, user_val, user_test = train, val, test = train_test_data_split()
        user_train = user_train+user_val
        train=train+val
    else:
        rotate = user_test
        user_test = test = user_train[0:len(rotate)]
        train = train+rotate
        user_train = train[len(rotate)-12:]
        train = user_train
    data_train = pd.concat([create_data_frame(*load_user_data(user,load_web_mercator = True, load_GPS = True), segmentation=True, seq_cutoff_time = seq_cutoff_time, seq_cutoff_speed = seq_cutoff_speed) for user in user_train]).reset_index(drop=True)
    data_test = pd.concat([create_data_frame(*load_user_data(user,load_web_mercator = True, load_GPS = True), segmentation=True, seq_cutoff_time = seq_cutoff_time, seq_cutoff_speed = seq_cutoff_speed) for user in user_test]).reset_index(drop=True)
    data_train = data_train[data_train['segment_ix'] >= filter_seq]
    data_test = data_test[data_test['segment_ix'] >= filter_seq]
    TimeSeries = TensorDataset(pd.concat([data_train]).reset_index(drop=True))
    X_tr = np.array([np.concatenate(((TS[0]).reshape(-1),TS[1]), axis=0) for TS in TimeSeries]).reshape(-1,17)
    X_tr = StandardScaler().fit_transform(X_tr)
    Y_tr = TimeSeries[:][2]
    
    TimeSeries = TensorDataset(pd.concat([data_test]).reset_index(drop=True))
    X_te = np.array([np.concatenate(((TS[0]).reshape(-1),TS[1]), axis=0) for TS in TimeSeries]).reshape(-1,17)
    X_te = StandardScaler().fit_transform(X_te)
    Y_te = TimeSeries[:][2]
    
    
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        startTraining = time.time()
        clf.fit(X_tr, Y_tr)
        endTraining = time.time()
        Y_pred = clf.predict(X_te)
        endPrediction = time.time()
        print('REPORT: '+name)
        cr=classification_report(Y_te, Y_pred, target_names=['Motion','Stop'],output_dict=True)
        classification_test_performance.append(cr)
        print(f'Training lasted {endTraining-startTraining}')
        print(f'Prediction lasted {endPrediction-endTraining}')
    

F1_macro_AVG = []
for cr in classification_test_performance:
    F1_macro_AVG.append(cr['macro avg']['f1-score'])
print(f'MEAN = {np.mean(F1_macro_AVG)}, STDEV = {np.std(F1_macro_AVG)}')

REPORT: Random Forest
Training lasted 540.3672578334808
Prediction lasted 1.6743803024291992
REPORT: Random Forest
Training lasted 441.91489481925964
Prediction lasted 5.761707544326782
REPORT: Random Forest
Training lasted 234.29623460769653
Prediction lasted 7.5285561084747314
REPORT: Random Forest
Training lasted 527.4638383388519
Prediction lasted 2.5795631408691406
MEAN = 0.7340969788236402, STDEV = 0.1927068997498603


In [123]:
F1_macro_AVG

[0.8627808039885863, 0.4009358450978183, 0.8299151930060944, 0.842756073202062]