In [None]:
from Scripts.plot_confusion_matrix import plot_confusion_matrix

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Random seed for the notebook.
# NOTE(review): no visible cell reads `seed` -- the train/test split passes
# random_state=42 and the estimators pass random_state=0. Confirm whether those
# calls were meant to use this value.
seed = 20

In [None]:
# Load the Geolife features CSV.
# The path is relative to the project root (the directory that also contains the
# `Scripts` package imported in the first cell), so the notebook no longer depends
# on one user's home-directory layout.
DATA_PATH = 'Final_CSV/Geolife_Features.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# (rows, columns) of the table as loaded, before any filtering.
df.shape

In [None]:
# Preview the first 10 rows to inspect column names and value formats.
df.head(10)

## Preprocessing

In [None]:
# Keep only the labelled rows. The explicit copy makes `df` an independent frame,
# so the label edit below does not write into a slice of the loaded table
# (which would emit SettingWithCopyWarning).
df = df.loc[df['Label-state'] != 'Unlabelled'].copy()
df.loc[df['Mode of Transport'] == 'taxi', 'Mode of Transport'] = 'car'  # group taxis and cars

# Drop every column whose name contains 'Unnamed' in a single call instead of
# mutating the frame column-by-column inside a loop.
# NOTE(review): presumably these are index columns written by an earlier to_csv -- confirm.
unnamed_columns = [column for column in df.columns if 'Unnamed' in str(column)]
df = df.drop(columns=unnamed_columns)

len(df)  # We omit almost half of the data, by removing the unlabelled data

In [None]:
# Add the transport mode of the preceding row as a predictor variable.
# NOTE(review): shift() follows row order only; this assumes consecutive rows are
# consecutive trips of the same user -- confirm, otherwise the first trip of a user
# inherits the last mode of the previous user.
df = df.assign(**{'Previous Mode': df['Mode of Transport'].shift()})

# The first row has no predecessor, so shift() leaves NaN there; remove it.
# The copy keeps the assignment below from writing into a slice.
df = df.dropna(subset=['Previous Mode']).copy()

# Group taxis and cars in the predictor too. The write target is 'Previous Mode':
# writing to 'Mode of Transport' here would overwrite the target label of every
# row that merely follows a taxi trip.
df.loc[df['Previous Mode'] == 'taxi', 'Previous Mode'] = 'car'

# To check whether we removed the NaN value:
print(df['Previous Mode'].unique())
print(len(df))

In [None]:
# Frequency of each transport mode among the remaining rows.
Counter(df['Mode of Transport'])

In [None]:
# Remove transportation modes that have a low frequency, using one boolean mask
# instead of three successive filters.
rare_modes = ['airplane', 'run', 'motorcycle']
df = df.loc[~df['Mode of Transport'].isin(rare_modes)]

In [None]:
# Integer encoding for the Previous Transportation Mode variable.
# The encoded column is attached with assign(), which returns a new frame, so no
# value is written into the filtered frame produced by the preceding cell
# (plain column assignment there emits SettingWithCopyWarning).
label_encoder = LabelEncoder()
previous_mode_codes = label_encoder.fit_transform(df['Previous Mode'].astype(str))
df = df.assign(**{'Previous Mode': previous_mode_codes})

In [None]:
# Columns left out of the predictor matrix; this includes the target
# ('Mode of Transport') itself.
feature_drop = [
    'Mode of Transport',
    'Path',
    'Label-state',
    'Point Count',
    'Duration',
    'Path-Crow Ratio',
    'Covered Area',
    'Area/Length',
    'Hurst Exponent',
    'Length',
    'user_id',
    'n_clusters',
]

# Names of the columns that remain as predictors, shown as the cell output.
features = df.drop(columns=feature_drop).columns.tolist()
features

In [None]:
# Target labels, then the predictor matrix built from every column that is not
# listed in feature_drop.
Y = df['Mode of Transport']
predictor_frame = df.drop(columns=feature_drop)
X = np.array(predictor_frame)

In [None]:
# Class distribution of the target after the rare modes were removed.
Counter(Y)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

## Run Models

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Shallow random forest (trees limited to depth 2) with a fixed random_state.
forest_params = {'max_depth': 2, 'random_state': 0}
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the random forest fitted in the previous cell.
# NOTE(review): `clf` and `y_pred` are reassigned in the Gradient Boosting and
# XGBoost sections, so when the notebook runs top to bottom the confusion-matrix
# cell only sees the last model's predictions.
y_pred = clf.predict(X_test)

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with depth-2 trees and a fixed random_state.
boosting_params = {'max_depth': 2, 'random_state': 0}
clf = GradientBoostingClassifier(**boosting_params)
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the gradient boosting model fitted in the previous cell.
# NOTE(review): this replaces the random forest predictions stored in `y_pred`,
# and is itself replaced in the XGBoost section.
y_pred = clf.predict(X_test)

### XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
clf = XGBClassifier()
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

## Plot Confusion Matrix

In [None]:
# Evaluate whatever predictions are currently stored in y_pred against y_test.
# Use the sorted union of true and predicted classes: this is the ordering
# confusion_matrix applies by default, and taking it explicitly guarantees the
# tick labels match the matrix even if a class is predicted but absent from y_test.
labels = np.union1d(y_test, y_pred)

# Compute confusion matrix. `confusion_matrix` is reached through the
# `sklearn.metrics` module imported at the top of the notebook; the bare name
# was never imported and raised NameError on a fresh kernel.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=labels)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=labels,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=labels, normalize=True,
                      title='Normalized confusion matrix')

plt.show()