In [None]:
# Mount Google Drive inside the Colab VM so the course files are reachable.
from google.colab import drive
import sys

drive.mount('/content/drive', force_remount=True)

# Name of the Drive folder that holds the script and the dataset.
FOLDERNAME = 'SMM636/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Put that folder on the import path so that Python files stored in it
# can be loaded by the interpreter of the Colab VM.
sys.path.append(
    '/content/drive/My Drive/{}'.format(FOLDERNAME)
)

In [None]:
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# load data
# The CSV path is built from FOLDERNAME (set in the Drive-mount cell) so that
# changing the folder name there is enough; os.path.join copes with the folder
# name being written with or without a trailing slash.
import os

titanic = pd.read_csv(
    os.path.join('/content/drive/My Drive', FOLDERNAME, 'train_titanic.csv')
)
# feature matrix of the full data set (before the train/test split):
# the Pclass and Parch columns
X = titanic.loc[:, ['Pclass', 'Parch']]
# class label: the Survived column
y = titanic.Survived

In [3]:
# hold out 30% of the rows as a test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=105,
)

# **Tune *k* for *k*NN by cross-validation**

In [None]:
# candidate values of k to try
tuned_parameters = [{"n_neighbors": [1,3,5,7,9]}]

# exhaustive grid search scored by 5-fold cross-validated accuracy
# (grid search guide: 'https://scikit-learn.org/stable/modules/grid_search.html#exhaustive-grid-search')
# scoring can instead be f1, precision, recall, roc_auc and many other metrics, depending on the task and data
# (scoring guide: 'https://scikit-learn.org/stable/modules/model_evaluation.html')
knnCV = GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid=tuned_parameters,
    scoring='accuracy',
    cv=5,
)
knnCV.fit(X_train, y_train)

# each header is followed by one blank line
print("Best parameters set found on development set:", end="\n\n")
print(knnCV.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

# one line per candidate: mean CV accuracy, its standard deviation over the folds, and the setting
means = knnCV.cv_results_["mean_test_score"]
stds = knnCV.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, knnCV.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std, params))

In [None]:
# predict the test set labels with the tuned kNN classifier
y_pred = knnCV.predict(X_test)
y_true = y_test
print(classification_report(y_true, y_pred))
# overall test accuracy: share of predictions that match the true label
sum(y_pred == y_true) / len(y_true)

# **Tune gamma and C for SVM with RBF kernel by cross-validation**

In [6]:
from sklearn.svm import SVC

In [None]:
# candidate gamma and C values for the RBF-kernel SVM
tuned_parameters_rbf = [
    {
        "kernel": ["rbf"],
        "gamma": [1, 1e-1, 1e-2, 1e-3, 1e-4],
        "C": [1, 10, 100, 1000],
    }
]

# exhaustive grid search scored by 5-fold cross-validated accuracy
svmClassifier = GridSearchCV(
    estimator=SVC(),
    param_grid=tuned_parameters_rbf,
    scoring='accuracy',
    cv=5,
)
svmClassifier.fit(X_train, y_train)

# each header is followed by one blank line
print("Best parameters set found on development set:", end="\n\n")
print(svmClassifier.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

# one line per candidate: mean CV accuracy, its standard deviation over the folds, and the setting
means = svmClassifier.cv_results_["mean_test_score"]
stds = svmClassifier.cv_results_["std_test_score"]
for mean, std, params in zip(means, stds, svmClassifier.cv_results_["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std, params))


In [None]:
# predict the test set labels with the tuned SVM
y_pred_svm = svmClassifier.predict(X_test)
# Only y_true is bound here. The kNN predictions stored in y_pred are left
# untouched, so the SVM results cannot silently replace them.
y_true = y_test
print(classification_report(y_true, y_pred_svm))
# overall test accuracy: share of predictions that match the true label
sum(y_pred_svm == y_true) / len(y_true)

# **Decision tree and random forest**

In this part, we will learn how to fit a decision tree and a random forest in Python. To keep this part straightforward, I am not going to show the parameter tuning process. If you want to tune the parameters, you can follow steps similar to those in the previous sections.

**Here we are going to evaluate the performance of the classifier by cross-validation, so cross-validation here is for model evaluation rather than parameter tuning.**

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd

In [None]:
# decision tree with a fixed complexity parameter (ccp_alpha);
# the value is only illustrative and could be tuned by cross-validation as for knn and svm
dt = DecisionTreeClassifier(
    ccp_alpha=0.001,
    random_state=0,
)
# 5-fold cross-validated accuracy, used here to evaluate the model
scores_dt = cross_val_score(
    dt,
    X,
    y,
    cv=5,
    scoring='accuracy',
)
print(scores_dt)
scores_dt.mean()

In [None]:
# boxplot of the per-fold accuracies of the decision tree
scores_df = pd.DataFrame({'Accuracy of decision tree': scores_dt})
boxplot = scores_df.boxplot()

In [None]:
import matplotlib.pyplot as plt
from sklearn import tree
# visualise the decision tree fitted on the full data set
dt_vis = dt.fit(X, y)  # fit() returns the estimator itself, so dt_vis is dt
# take the feature labels from the training columns so that they always
# match the columns (and their order) the tree was fitted on
fn = list(X.columns)
# class labels shown in the nodes
# NOTE(review): assumes Survived is coded 0 = not survived, 1 = survived — confirm
cn = ['Not Survived', 'Survived']
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
# draw on the axes created above rather than on the implicit current axes
tree.plot_tree(dt_vis,
               feature_names=fn,
               class_names=cn,
               filled=True,
               ax=axes);


In [None]:
# random forest and its out-of-bag (OOB) score:
# n_estimators is the number of trees,
# max_features controls how many randomly selected features are considered at each split,
# bootstrap and oob_score together provide the OOB evaluation
rf = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
    bootstrap=True,
    oob_score=True,
    random_state=0,
)
rf.fit(X, y)
print(rf.oob_score_)
# 5-fold cross-validated accuracy, used here to evaluate the model
scores_rf = cross_val_score(
    rf,
    X,
    y,
    cv=5,
    scoring='accuracy',
)
print(scores_rf)
scores_rf.mean()

In [None]:
# side-by-side boxplots of the per-fold accuracies of the decision tree and the random forest
scores_df = pd.DataFrame({"Decision tree": scores_dt, "Random forest": scores_rf})
boxplot = scores_df.boxplot()

In [None]:
# get variable importance, e.g. mean decrease in gini index
rf_var = rf.fit(X, y)  # fit() returns the estimator itself, so rf_var is rf
importances = rf_var.feature_importances_
# spread of each feature's importance across the individual trees of the forest;
# the loop variable is deliberately not called `tree`, so it cannot be confused
# with the sklearn `tree` module imported for plotting the decision tree
std = np.std([est.feature_importances_ for est in rf_var.estimators_], axis=0)
print(importances)
print(std)
print()

# label the importances with the columns the forest was fitted on, so the
# labels always match the feature order
forest_importances = pd.Series(importances, index=X.columns)
print(forest_importances)

# bar chart of the importances with the across-tree standard deviation as error bars
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()