# Goal: Predict survival on the Titanic


## Importing dataset

Find a pattern that lets us estimate whether a passenger would survive the Titanic disaster or not.

In [1]:
import pandas as pd

In [2]:
# Read the labelled training split and the unlabelled test split in one pass,
# then preview the first rows of the training data.
train_data, test_data = map(pd.read_csv, ('./Data/train.csv', './Data/test.csv'))
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Preview the first five rows of the test split
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Dimensions of the training set as a (rows, columns) tuple
train_data.shape

(891, 12)

## Exploring dataset

In [5]:
# Count the missing (NaN) entries in every column of the training set
train_data.isna().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Distinct passenger-class values present in the training set
train_data.Pclass.unique()

array([3, 1, 2])

In [7]:

def report_survival(mask, label):
    """Print the survival percentage of one passenger group and return its labels.

    Args:
        mask: Boolean Series aligned with ``train_data`` that selects the group.
        label: Group name inserted into the printed sentence.

    Returns:
        The ``Survived`` column restricted to the selected passengers.
    """
    # One .loc call with both the row mask and the column label avoids chained indexing.
    survived = train_data.loc[mask, 'Survived']
    # Survived holds 0/1, so its sum is the number of survivors in the group.
    # An empty group has no defined rate; report NaN instead of dividing by zero.
    pct = 100 * survived.sum() / len(survived) if len(survived) else float('nan')
    print(f"\n{pct}% of {label} survived")
    return survived


# Survival rate by sex
female_survived = report_survival(train_data.Sex == 'female', 'women')
male_survived = report_survival(train_data.Sex == 'male', 'men')

# Survival rate by passenger class
class1_survived = report_survival(train_data.Pclass == 1, 'class 1')
class2_survived = report_survival(train_data.Pclass == 2, 'class 2')
class3_survived = report_survival(train_data.Pclass == 3, 'class 3')


74.20382165605096% of women survived

18.890814558058924% of men survived

62.96296296296296% of class 1 survived

47.28260869565217% of class 2 survived

24.236252545824847% of class 3 survived


## Modeling: **Random Forest Classification**

This model is built from several decision "trees", each of which considers a passenger's data individually and votes on whether that passenger survived. The random forest then makes a democratic decision based on the majority of the trees' votes.

In [8]:
# Prepare dataset
# Target: the binary Survived column of the training set
output = train_data['Survived']
# Selected features
features = ['Sex', 'Pclass', 'SibSp', 'Parch']
# One-hot encoding converts the categorical column (Sex) into numeric indicator columns
X_train = pd.get_dummies(train_data[features])
# Encode the test set separately, then force it onto the training columns:
# a category missing from one split would otherwise give the model a feature
# matrix with a different column set or order than it was trained on.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X_train.columns, fill_value=0)

X_train.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male
0,3,1,0,0,1
1,1,1,0,1,0
2,3,0,0,1,0
3,1,1,0,1,0
4,3,0,0,0,1


In [9]:
# Sanity check: the feature matrix and the target should have the same number of rows
print(X_train.shape, output.shape, sep="\n")

(891, 5)
(891,)


In [10]:
# Model training
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Model hyperparameters: 100 trees, each limited to a maximum depth of 5 levels
n_estimators = 100
max_depth = 5
# random_state is fixed so that repeated runs build the same forest
model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=1)

# 5-fold cross-validation on the full training set.
# The pandas objects are converted to plain NumPy arrays before being handed to scikit-learn.
scores = cross_val_score(model, X_train.to_numpy(), output.to_numpy(), cv=5)
print("CV scores:  ", scores)
print("Average of CV scores:  ", scores.mean())

CV scores:   [0.76536313 0.80337079 0.80898876 0.79213483 0.8258427 ]
Average of CV scores:   0.7991400414286611


In [11]:
# Hold out 20% of the labelled data as a validation set (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X_train, output, test_size=0.2, random_state=1)
X_train_split, X_CV, Y_train_split, Y_CV = split_parts

# Fit the forest on the remaining training portion only
model.fit(X_train_split, Y_train_split)

In [12]:
# Show how much each input column contributed to the fitted forest's predictions
feature_names = X_train.columns
importances = model.feature_importances_
# Pair every feature name with its importance score in a two-column table
importance_df = pd.DataFrame({'Features': feature_names, 'Importance': importances})
print(importance_df)

     Features  Importance
0      Pclass    0.219567
1       SibSp    0.084220
2       Parch    0.088350
3  Sex_female    0.342557
4    Sex_male    0.265306


In [13]:
# Evaluation on the held-out validation split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
pred_CV = model.predict(X_CV)
F1_score = f1_score(Y_CV, pred_CV)
# Keep the result under its own name: assigning it to `accuracy_score` would
# replace the imported function with a float, so any later call to
# accuracy_score(...) in the session would fail.
accuracy = accuracy_score(Y_CV, pred_CV)
print(f"Model accuracy based on F1-score metric is {F1_score}")
print(f"Model accuracy based on accuracy_score metric is {accuracy}")


Model accuracy based on F1-score metric is 0.6814814814814814
Model accuracy based on accuracy_score metric is 0.7597765363128491


In [14]:
# Predict survival for every passenger in the test set
prediction = model.predict(X_test)
# Pair each passenger id with its predicted label and write the table to disk
# without the DataFrame index column
submission_columns = {'PassengerId': test_data['PassengerId'], 'Survived': prediction}
test_prediction = pd.DataFrame(submission_columns)
test_prediction.to_csv('test_prediction.csv', index=False)