In [None]:
# Colab-only setup: skip this cell when you are not running on Google Colab.
from google.colab import drive
import sys

# Name of the folder in your Drive that holds the script and the dataset.
FOLDERNAME = 'SMM636/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Attach Google Drive to the Colab VM (remounting even if already mounted).
drive.mount('/content/drive', force_remount=True)

# Put the Drive folder on the module search path so that the Python
# interpreter of the Colab VM can import python files stored in it.
sys.path.append(f'/content/drive/My Drive/{FOLDERNAME}')

# **Titanic - Machine Learning from Disaster**

* The task is to predict survival on the Titanic, a challenge provided by Kaggle: 'https://www.kaggle.com/c/titanic/overview'. The training set and test set are already split for you.

* There are ten variables in this dataset: 

  **One Label**: *Survival*: 0=No, 1=Yes

  **Nine Features**:
  *pclass* (ticket class); *sex*; *age*; *sibsp* (# of siblings / spouses aboard); *parch* (# of parents / children aboard); *ticket* (ticket number); *fare* (passenger fare); *cabin* (cabin number); *embarked* (port of embarkation, C=Cherbourg, Q=Queenstown, S=Southampton)

* This exercise is to get familiar with using Python for classification, so we use just two features of the dataset for illustration purposes.

# **Logistic regression via `sklearn`**

In [None]:
import pandas as pd

# Location of the training data on the mounted Google Drive.
train_csv_path = "/content/drive/My Drive/SMM636/train_titanic.csv"
train = pd.read_csv(train_csv_path)
# Preview the first rows of the training data
# (train.info() would give a per-column summary instead).
train.head()

In [None]:
# Feature matrix of the training set: two columns, selected by label.
# DataFrame.loc accesses a group of rows and columns by label(s) or a boolean array, see
# 'https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html'
feature_columns = ['Pclass', 'Parch']
features_train = train.loc[:, feature_columns]
print(features_train.head())
features_train.shape

In [None]:
# Labels of the training set: the 'Survived' column.
labels_train = train['Survived']
labels_train.shape

In [None]:
# Build a logistic regression model with scikit-learn.
from sklearn.linear_model import LogisticRegression

# LogisticRegression() adds a penalty term by default; penalty=None fits the
# plain, unpenalised model.
# NOTE: the string 'none' was deprecated in scikit-learn 1.2 and removed in 1.4,
# where it raises an error; None is the supported spelling (scikit-learn >= 1.2).
lr = LogisticRegression(penalty=None)
# Train the model on the training feature matrix and labels.
lr.fit(features_train, labels_train)
# Or combine the previous two steps in one line; fit() returns the fitted
# estimator, so the results are the same.
lr = LogisticRegression(penalty=None).fit(features_train, labels_train)

In [None]:
# Inspect the fitted model: estimated coefficients first, then the intercept.
coefficients, intercept = lr.coef_, lr.intercept_
print(coefficients, intercept)

In [None]:
# Read the test set from the mounted Google Drive.
test_csv_path = "/content/drive/My Drive/SMM636/test_titanic.csv"
test = pd.read_csv(test_csv_path)
# Feature matrix of the test set (same two columns as for training).
features_test = test.loc[:, ['Pclass', 'Parch']]
# Note: the test set comes without labels, so we can produce predictions
# for it but cannot measure how well the classifier performs on it.

In [None]:
# Predicted survival labels for the test set from the logistic regression model.
pred = lr.predict(features_test)
# Show the first nine predictions.
pred[:9]

# **Logistic regression via `statsmodels`**

In [None]:
# For a statistical model we usually want easy access to the estimated
# coefficients, their p-values and other statistical quantities, much as R
# provides. The statsmodels library is recommended for that:
# 'https://www.statsmodels.org/dev/examples/notebooks/generated/glm.html'
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so append a constant
# column to the feature matrix (prepend=False puts it last).
features_train_new = sm.add_constant(features_train, prepend=False)
print(features_train_new.head())

# Logistic regression expressed as a GLM with the binomial family.
binomial_glm = sm.GLM(labels_train, features_train_new, family=sm.families.Binomial())
lrs = binomial_glm.fit()
print(lrs.summary())

In [None]:
# Predict for the test set; it needs the same constant column as the
# training matrix.
features_test_new = sm.add_constant(features_test, prepend=False)
scores_new = lrs.predict(features_test_new)
# These are scores (posterior probabilities), not predicted labels.
scores_new[:9]

In [None]:
# Transform scores to labels with a 0.5 cut-off: <0.5 --> 0; >=0.5 --> 1.
# The comparison builds a new Series, so scores_new keeps the original
# probabilities. (A plain `predict_new = scores_new` only creates a second name
# for the same object, and thresholding it in place would overwrite the scores.)
predict_new = (scores_new >= 0.5).astype(int)
print(predict_new[0:9])

# ***k*NN via `sklearn`**

In this part, we are going to see how to get training/test splits with the `train_test_split` function. Thus we are going to use the training set only.

In [None]:
import numpy as np
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# From here on the matrices are called X and y to avoid confusion with the
# earlier train/test files; this data is split into a training and a test set.
X, y = features_train, labels_train
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
split_parts = train_test_split(X, y, test_size=0.30, random_state=105)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

In [None]:
# First nine training labels.
print(y_train.head(9))
# How many training instances are there for each class?
# Number of instances with label 1:
print((y_train == 1).sum())
# Number of instances with label 0:
int((y_train == 0).sum())

In [None]:
# Build a kNN classifier on the training split: k = 5 neighbours, each
# neighbour weighted equally.
n_neighbours = 5
KNNClassifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbours, weights="uniform")
KNNClassifier.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred = KNNClassifier.predict(X_test)
# Predicted probabilities for each class (first nine test instances).
print(KNNClassifier.predict_proba(X_test)[0:9,])
# Have a look at the predicted labels.
print(y_pred[0:9])
# Accuracy computed by hand: the fraction of test instances predicted correctly.
print((y_pred == y_test).mean())
# The same accuracy via scikit-learn. The documented argument order is
# (y_true, y_pred); accuracy is symmetric, but other metrics (e.g. precision,
# recall) are not, so keep the true labels first.
print(accuracy_score(y_test, y_pred))