In [86]:
#Based on this article: https://towardsdatascience.com/predicting-the-survival-of-titanic-passengers-30870ccc7e8

#ToDo:

#Code functional
#Plot features
#Feature engineering (e.g. cabin)
#Feature selection
#Outliers & noisy features
#Precision_recall curve - ROC curve

#Reformat the result file according to the sample file and submit to Kaggle
#Upload the final version to GitHub


In [87]:
#Load libraries

import numpy as np 
import pandas as pd 

import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score


#Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")


In [88]:
#Train data

train_df.describe()
train_df.info()
train_df.head()
print('\n\n')

##################################################################################################################
#Train data: Look at the input features & output feature relation

#sns.barplot(x='pclass', y='survived', data=train_df)

##################################################################################################################
#Train data: Add passengerId column

train_df['passengerId'] = range(len(train_df))

##################################################################################################################
#Train data: Handling missing values: age, embarked, cabin

mean_age = train_df['age'].mean()
train_df['age'].fillna(mean_age, inplace=True)

#mode_age = train_df['age'].mode()
#train_df['age'].fillna(mode_age, inplace=True)
#train_df.fillna(value = {'embarked':'S'})
common_value = 'S'
train_df['embarked'].fillna(common_value, inplace=True)

train_df.drop(columns=['cabin'], inplace=True)

##################################################################################################################
#Train data: Remove none-informative columns

train_df.drop(columns=['name','ticket','passengerId'], inplace=True)

##################################################################################################################
#Train data: Convert categorical columns

train_df['sex'].replace(['male','female'], [0,1], inplace=True)
train_df['embarked'].replace(['C','S','Q'], [0,1,2], inplace=True)
train_df['embarked'].astype(int)

##################################################################################################################
#Train data: Scale and normalize: age, fare

train_df['age'] = ( train_df['age'] - train_df['age'].mean() ) / train_df['age'].std()
train_df['fare'] = ( train_df['fare'] - train_df['fare'].mean() ) / train_df['fare'].std()

##################################################################################################################
train_df.info()
train_df.head(891)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   name      891 non-null    object 
 3   sex       891 non-null    object 
 4   age       714 non-null    float64
 5   sibsp     891 non-null    int64  
 6   parch     891 non-null    int64  
 7   ticket    891 non-null    object 
 8   fare      891 non-null    float64
 9   cabin     204 non-null    object 
 10  embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    int64  
 3   age       891 non-

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,0,-5.921480e-01,1,0,-0.502163,1
1,1,1,1,6.384304e-01,1,0,0.786404,0
2,1,3,1,-2.845034e-01,0,0,-0.488580,1
3,1,1,1,4.076970e-01,1,0,0.420494,1
4,0,3,0,4.076970e-01,0,0,-0.486064,1
...,...,...,...,...,...,...,...,...
886,0,2,0,-2.075923e-01,0,0,-0.386454,1
887,1,1,1,-8.228815e-01,0,0,-0.044356,1
888,0,3,1,4.371893e-15,1,2,-0.176164,1
889,1,1,0,-2.845034e-01,0,0,-0.044356,0


In [89]:
#Test data

train_df.describe()
test_df.info()
test_df.head()
print('\n\n')

##################################################################################################################
#Test data: Handling missing values: age, embarked, cabin, fare

mean_age = test_df['age'].mean()
test_df['age'].fillna(mean_age, inplace=True)

common_value = 'S'
test_df['embarked'].fillna(common_value, inplace=True)

test_df.drop(columns=['cabin'], inplace=True)

mean_fare = test_df['fare'].mean()
test_df['fare'].fillna(mean_fare, inplace=True)
##################################################################################################################
#Test data: Remove none-informative columns

test_df.drop(columns=['name','ticket'], inplace=True)

##################################################################################################################
#Test data: Convert categorical columns

test_df['sex'].replace(['male','female'], [0,1], inplace=True)
test_df['embarked'].replace(['C','S','Q'], [0,1,2], inplace=True)
test_df['embarked'].astype(int)

##################################################################################################################
#Test data: Scale and normalize: age, fare

test_df['age'] = ( test_df['age'] - test_df['age'].mean() ) / test_df['age'].std()
test_df['fare'] = ( test_df['fare'] - test_df['fare'].mean() ) / test_df['fare'].std()

##################################################################################################################
test_df.info()
test_df.head(418)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    418 non-null    int64  
 1   name      418 non-null    object 
 2   sex       418 non-null    object 
 3   age       332 non-null    float64
 4   sibsp     418 non-null    int64  
 5   parch     418 non-null    int64  
 6   ticket    418 non-null    object 
 7   fare      417 non-null    float64
 8   cabin     91 non-null     object 
 9   embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 32.8+ KB



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    418 non-null    int64  
 1   sex       418 non-null    int64  
 2   age       418 non-null    float64
 3   sibsp     418 non-null    int64  
 4   parch     418 non-

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,3,0,3.345917e-01,0,0,-0.497811,2
1,3,1,1.323944e+00,1,0,-0.512660,1
2,2,0,2.511166e+00,0,0,-0.464532,2
3,3,0,-2.590195e-01,0,0,-0.482888,1
4,3,1,-6.547602e-01,1,1,-0.417971,1
...,...,...,...,...,...,...,...
413,3,0,-2.530716e-15,0,0,-0.493856,1
414,1,1,6.907583e-01,0,0,1.312180,0
415,3,0,6.511842e-01,0,0,-0.508183,1
416,3,0,-2.530716e-15,0,0,-0.493856,1


In [90]:
#Model training and testing using the train data

X_train = train_df.drop('survived', axis=1)
Y_train = train_df['survived']
X_test =  test_df

##################################################################################################################
#LR

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_logreg = round(logreg.score(X_train, Y_train) * 100, 2)
print('LR Acc: ', acc_logreg)

##################################################################################################################
#DT

decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, Y_train)  
Y_pred = decision_tree.predict(X_test)  
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
print('DT Acc: ', acc_decision_tree)

##################################################################################################################
#NN

perceptron = Perceptron(max_iter=10)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
print('NN Acc: ', acc_perceptron)

##################################################################################################################
#RF
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print('RF Acc: ', acc_random_forest)


LR Acc:  80.02
DT Acc:  98.2
NN Acc:  75.98
RF Acc:  98.2


In [91]:
#Model training and testing using kfold cv

X_train = train_df.drop('survived', axis=1)
Y_train = train_df['survived']
X_test =  test_df

##################################################################################################################
#LR

logreg = LogisticRegression()
scores = cross_val_score(logreg, X_train, Y_train, cv=10, scoring='accuracy')
print('LR kfold Scores:', scores)
print('LR kfold Mean:', scores.mean())
print('LR kfold Standard Deviation:', scores.std())
print('\n\n')

##################################################################################################################
#DT

decision_tree = DecisionTreeClassifier() 
scores = cross_val_score(decision_tree, X_train, Y_train, cv=10, scoring='accuracy')
print('DT kfold Scores:', scores)
print('DT kfold Mean:', scores.mean())
print('DT kfold Standard Deviation:', scores.std())
print('\n\n')

##################################################################################################################
#NN

perceptron = Perceptron(max_iter=100)
scores = cross_val_score(perceptron, X_train, Y_train, cv=10, scoring='accuracy')
print('NN kfold Scores:', scores)
print('NN kfold Mean:', scores.mean())
print('NN kfold Standard Deviation:', scores.std())
print('\n\n')

##################################################################################################################
#RF
random_forest = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(random_forest, X_train, Y_train, cv=10, scoring='accuracy')
print('RF kfold Scores:', scores)
print('RF kfold Mean:', scores.mean())
print('RF kfold Standard Deviation:', scores.std())


LR kfold Scores: [0.78888889 0.76404494 0.74157303 0.83146067 0.78651685 0.76404494
 0.78651685 0.78651685 0.79775281 0.82022472]
LR kfold Mean: 0.7867540574282147
LR kfold Standard Deviation: 0.02513443493710398



DT kfold Scores: [0.71111111 0.82022472 0.70786517 0.76404494 0.82022472 0.76404494
 0.82022472 0.7752809  0.85393258 0.82022472]
DT kfold Mean: 0.7857178526841448
DT kfold Standard Deviation: 0.04700182039420602



NN kfold Scores: [0.68888889 0.68539326 0.70786517 0.83146067 0.41573034 0.40449438
 0.78651685 0.52808989 0.7752809  0.75280899]
NN kfold Mean: 0.6576529338327092
NN kfold Standard Deviation: 0.1460735809704356



RF kfold Scores: [0.73333333 0.82022472 0.76404494 0.82022472 0.88764045 0.84269663
 0.82022472 0.75280899 0.85393258 0.85393258]
RF kfold Mean: 0.8149063670411986
RF kfold Standard Deviation: 0.04721294300425295


In [92]:
#Feature importance by RF

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

print(X_train.columns, random_forest.feature_importances_)


Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object') [0.08822101 0.26591958 0.25867446 0.04682682 0.03854428 0.26793055
 0.0338833 ]


In [93]:
#RF hyperparam tunning

#param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10, 25, 50, 70], "min_samples_split" : [2, 4, 10, 12, 16, 18, 25, 35], "n_estimators": [100, 400, 700, 1000, 1500]}
#rf = RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
#clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)
#clf.fit(X_train, Y_train)
#clf.bestparams


#Train a RF with the best parameters

random_forest = RandomForestClassifier(criterion = "gini", min_samples_leaf = 1, min_samples_split = 10, n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
print("oob score:", round(random_forest.oob_score_, 4)*100, "%")


oob score: 82.83 %


In [94]:
#Confusion Matrix

predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
print('Confusion Matrix:', confusion_matrix(Y_train, predictions))

print('Precision:', precision_score(Y_train, predictions))
print('Recall:', recall_score(Y_train, predictions))
print('f_score', f1_score(Y_train, predictions))


Confusion Matrix: [[492  57]
 [ 98 244]]
Precision: 0.8106312292358804
Recall: 0.7134502923976608
f_score 0.7589424572317263


In [95]:
#Add passengerId & Y_prediction columns to the test data and save as CSV

test_df['passengerId'] = range(891 , 891+len(test_df))
test_df['survived'] = Y_prediction
test_df.info()
test_df.to_csv('result.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pclass       418 non-null    int64  
 1   sex          418 non-null    int64  
 2   age          418 non-null    float64
 3   sibsp        418 non-null    int64  
 4   parch        418 non-null    int64  
 5   fare         418 non-null    float64
 6   embarked     418 non-null    int64  
 7   passengerId  418 non-null    int64  
 8   survived     418 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 29.5 KB
