In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Random seed passed to train_test_split and KFold further down so the
# data splits are reproducible between runs.
seed = 20

In [None]:
# Load the feature table; the path is relative to the notebook's working directory.
df = pd.read_csv('Final_CSV/Geolife_Features.csv')

In [None]:
# Preview the first ten rows of the loaded table.
df.head(10)

### Data Preprocessing

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed.
# NOTE(review): no project-local modules are imported in the cells visible
# here, so this may be unnecessary -- confirm before removing.
%load_ext autoreload
%autoreload 2

In [None]:
# Keep only the labelled rows. The explicit .copy() makes the result an
# independent frame, so the .loc assignment below cannot trigger pandas'
# SettingWithCopyWarning or act on a view of the unfiltered data.
df = df.loc[df['Label-state'] != 'Unlabelled'].copy()
df.loc[df['Mode of Transport'] == 'taxi', 'Mode of Transport'] = 'car'  # group taxis and cars

# Drop every column whose name contains 'Unnamed' in a single call instead of
# mutating the frame in place while looping over its columns.
unnamed_columns = [column for column in df.columns if 'Unnamed' in column]
df = df.drop(columns=unnamed_columns)

len(df)  # We omit almost half of the data by removing the unlabelled data

In [None]:
# Add the previous row's transport mode as a predictor variable.
# NOTE(review): shift() works on plain row order, so the "previous" mode of a
# row may come from a different user or trajectory -- confirm this is intended.
df['Previous Mode'] = df['Mode of Transport'].shift()

# shift() leaves a NaN in the first row; remove it. The .copy() keeps the
# assignment below from acting on a slice of the earlier frame.
df = df.dropna(subset=['Previous Mode']).copy()

# Group taxis and cars in the new column too. The assignment must target
# 'Previous Mode': writing into 'Mode of Transport' here would relabel the
# *current* mode of every row that merely follows a taxi trip.
df.loc[df['Previous Mode'] == 'taxi', 'Previous Mode'] = 'car'

# Check that the NaN value was removed:
print(df['Previous Mode'].unique())
print(len(df))

In [None]:
from collections import Counter

# Frequency of every transport mode; used to spot classes with too few samples.
mode_counts = Counter(df['Mode of Transport'])
mode_counts

In [None]:
# Remove transportation modes that have a lower frequency than 3.
# The list is hard-coded rather than derived from the counts.
# NOTE(review): only 'Mode of Transport' is filtered; 'Previous Mode' may
# still contain these values -- confirm that is acceptable.
rare_modes = ['airplane', 'run', 'motorcycle']
df = df.loc[~df['Mode of Transport'].isin(rare_modes)]

In [None]:
# Integer encoding for the Transportation Mode variables.
# Each column is encoded from its OWN values. Encoding 'Mode of Transport'
# from 'Previous Mode' would turn the target into a relabelled copy of the
# 'Previous Mode' predictor and leak the label into the features.
# NOTE(review): the two columns get independent code tables, so the same
# integer can denote different modes in each column.
# NOTE(review): this cell is not idempotent -- re-running it encodes the
# already-encoded integers as strings; re-run from the data-loading cell.
previous_mode_encoder = LabelEncoder()
df['Previous Mode'] = previous_mode_encoder.fit_transform(df['Previous Mode'].astype(str))

label_encoder = LabelEncoder()
df['Mode of Transport'] = label_encoder.fit_transform(df['Mode of Transport'].astype(str))

In [None]:
# Build the integer-encoded target vector from the 'Mode of Transport' column.

target_column = df['Mode of Transport']
modes = np.array(target_column)

# Fit and apply the encoder in two explicit steps.
label_encoder = LabelEncoder()
label_encoder.fit(modes)
integer_encoded = label_encoder.transform(modes)

# Shows the values that were fed to the encoder (not the encoded result).
print(modes)

### Correlation matrix for feature selection

In [None]:
# Columns left out of the correlation analysis.
non_feature_columns = [
    'Path', 'Label-state', 'user_id', 'n_clusters',
    'Mode of Transport', 'main_cluster', 'Previous Mode',
]
df_corr = df.drop(columns=non_feature_columns)

In [None]:
import seaborn as sns

# Pairwise correlations of the remaining columns, drawn as a heatmap with the
# column names on both axes.
corr = df_corr.corr()
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

In [None]:
# Same correlation matrix as a colour-graded table with two decimals.
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format() with a format string gives the same two-decimal rendering
# on both old and new pandas versions.
corr = df_corr.corr()
corr.style.background_gradient().format("{:.2f}")

### Feature selection

In [None]:
# Columns that are never used as model inputs.
base_drop = ['Mode of Transport','Path','Label-state', 'Point Count','Duration','Path-Crow Ratio',
             'Covered Area','Area/Length','Hurst Exponent','Length', 'user_id','n_clusters']

# With clustering:    feature_drop = base_drop
# Without clustering: additionally drop 'main_cluster' (current setting)
feature_drop = base_drop + ['main_cluster']

# Names of the columns that remain as predictors, in frame order.
features = df.drop(columns=feature_drop).columns.tolist()
features

### Data construction

In [None]:
# Predictor matrix: every column not listed in feature_drop.
feature_frame = df.drop(columns=feature_drop)
X = np.array(feature_frame)

# Target vector: the integer-encoded transport mode.
Y = integer_encoded

In [None]:
# Hold out 30% of the rows for testing; the split is seeded for reproducibility.
# train_test_split comes from sklearn.model_selection and is imported with the
# other scikit-learn names in the first cell (it was previously never imported,
# so this cell raised NameError on a fresh kernel).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=seed,
)

# Models

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Shuffled 10-fold splitter shared by all cross-validation cells below; the
# fixed random_state makes every model see the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

### Random Forest

In [None]:
# Random forest fitted on the complete data set. This fitted instance supplies
# the feature_importances_ printed in the next cell; cross_val_score there
# fits its own clones of the estimator.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, Y)

In [None]:
# Cross-validated scores on the shared `kfold` splitter, printed as
# "mean% (std%)" per metric.
# NOTE(review): for a single-label multiclass target, micro-averaged
# precision, recall and F1 all equal accuracy, so the four rows are expected
# to be identical -- consider macro averaging if per-class quality matters.
for scoring_name in ['accuracy', 'f1_micro', 'recall_micro', 'precision_micro']:
    fold_scores = cross_val_score(clf, X, Y, cv=kfold, scoring=scoring_name)
    mean_pct = fold_scores.mean() * 100
    std_pct = fold_scores.std() * 100
    print(scoring_name, ": %.2f%% (%.2f%%)" % (mean_pct, std_pct))

# Feature importances of the already-fitted `clf`, one row per predictor.
importance_table = pd.DataFrame({'Feature Importance': clf.feature_importances_, 'Feature': features})
print(importance_table.loc[:, ('Feature', 'Feature Importance')])

In [None]:
# Train a fresh random forest on the training split only (fit returns the
# estimator itself), then predict the held-out rows.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Hold-out metrics for the predictions in y_pred (micro-averaged where an
# averaging mode applies).
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='micro')
precision = precision_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

# Report in the order: accuracy, recall, precision, F1.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
    ("F1", f1),
):
    print(metric_name, metric_value)

### Scikit Learn Gradient Boost

In [None]:
# Gradient boosting fitted on the complete data set. This fitted instance
# supplies the feature_importances_ printed in the next cell; cross_val_score
# there fits its own clones of the estimator.
clf = GradientBoostingClassifier(max_depth=2, random_state=0)
clf.fit(X, Y)

In [None]:
# Cross-validated scores on the shared `kfold` splitter, printed as
# "mean% (std%)" per metric.
# NOTE(review): for a single-label multiclass target, micro-averaged
# precision, recall and F1 all equal accuracy, so the four rows are expected
# to be identical -- consider macro averaging if per-class quality matters.
for scoring_name in ['accuracy', 'f1_micro', 'recall_micro', 'precision_micro']:
    fold_scores = cross_val_score(clf, X, Y, cv=kfold, scoring=scoring_name)
    mean_pct = fold_scores.mean() * 100
    std_pct = fold_scores.std() * 100
    print(scoring_name, ": %.2f%% (%.2f%%)" % (mean_pct, std_pct))

# Feature importances of the already-fitted `clf`, one row per predictor.
importance_table = pd.DataFrame({'Feature Importance': clf.feature_importances_, 'Feature': features})
print(importance_table.loc[:, ('Feature', 'Feature Importance')])

In [None]:
# Train a fresh gradient-boosting model on the training split only (fit
# returns the estimator itself), then predict the held-out rows.
clf = GradientBoostingClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Hold-out metrics for the predictions in y_pred (micro-averaged where an
# averaging mode applies).
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='micro')
precision = precision_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

# Report in the order: accuracy, recall, precision, F1.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
    ("F1", f1),
):
    print(metric_name, metric_value)

### XGBoost

In [None]:
# XGBoost classifier with default hyper-parameters, fitted on the complete
# data set. This fitted instance supplies the feature_importances_ printed in
# the next cell; cross_val_score there fits its own clones of the estimator.
clf = XGBClassifier()
clf.fit(X, Y)

In [None]:
# Cross-validated scores on the shared `kfold` splitter, printed as
# "mean% (std%)" per metric.
# NOTE(review): for a single-label multiclass target, micro-averaged
# precision, recall and F1 all equal accuracy, so the four rows are expected
# to be identical -- consider macro averaging if per-class quality matters.
for scoring_name in ['accuracy', 'f1_micro', 'recall_micro', 'precision_micro']:
    fold_scores = cross_val_score(clf, X, Y, cv=kfold, scoring=scoring_name)
    mean_pct = fold_scores.mean() * 100
    std_pct = fold_scores.std() * 100
    print(scoring_name, ": %.2f%% (%.2f%%)" % (mean_pct, std_pct))

# Feature importances of the already-fitted `clf`, one row per predictor.
importance_table = pd.DataFrame({'Feature Importance': clf.feature_importances_, 'Feature': features})
print(importance_table.loc[:, ('Feature', 'Feature Importance')])

In [None]:
# Train a fresh default XGBoost classifier on the training split only (fit
# returns the estimator itself), then predict the held-out rows.
clf = XGBClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Hold-out metrics for the predictions in y_pred (micro-averaged where an
# averaging mode applies).
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='micro')
precision = precision_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

# Report in the order: accuracy, recall, precision, F1.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
    ("F1", f1),
):
    print(metric_name, metric_value)