# The purpose of this project is to predict the survival of each passenger on the titanic based on the data given. 

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

In [2]:
raw_test_df = pd.read_csv('test.csv')
raw_train_df = pd.read_csv('train.csv')

train_df = raw_train_df.copy()
test_df = raw_test_df.copy()
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
# one hot encoding
def encode_categories(df):
    # One hot encode Embarked
    df = df.join(pd.get_dummies(df.pop('Embarked')))
    
    return df

In [4]:
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','C','Q','S']

# Drop Nan Values from the Age and Fare Columns
train_df = train_df.fillna(train_df.mean())


test_df = test_df.fillna(test_df.mean())

test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.50000,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00000,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.00000,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.00000,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00000,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,30.27259,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.00000,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.50000,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,30.27259,0,0,359309,8.0500,,S


In [5]:
# We need to encode the sex, cabin, and embarket columns

train_df['Sex'] = train_df['Sex'].map(lambda x: 1 if x == 'male' else 0)
train_df['Cabin'] = train_df['Cabin'].map(lambda x: 0 if pd.isna(x) else 1)
train_df = encode_categories(train_df)


test_df['Sex'] = test_df['Sex'].map(lambda x: 1 if x == 'male' else 0)
test_df['Cabin'] = test_df['Cabin'].map(lambda x: 0 if pd.isna(x) else 1)
test_df = encode_categories(test_df)

In [6]:
# Survived is our target column
train_df_target = train_df['Survived']
# Features are all columns except for the target column, name, and IDs
train_df_features = train_df.drop(['Survived','Name','PassengerId','Ticket'],axis = 1)


# Hyper Parameter selection using GridSearch CV

In [7]:
# Trees 
n_estimators = [10,50,100,1000]
# Features to consider
max_features = ['auto', 'sqrt', 'log2']
# Levels in a tree
max_depth = [2,10]
# Samples required to split
min_samples_split = [2,4]
# Samples required at each leaf
min_samples_leaf = [1,2]
# Bootstrap or not
bootstrap = [True, False]

In [8]:
# Create a param grid
param_grid = {'n_estimators':n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf,
              'bootstrap':bootstrap}
param_grid

{'n_estimators': [10, 50, 100, 1000],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [2, 10],
 'min_samples_split': [2, 4],
 'min_samples_leaf': [1, 2],
 'bootstrap': [True, False]}

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# 3 Fold Grid Search CV using a Random Forest Classifier
grid = GridSearchCV(estimator = RandomForestClassifier(),param_grid = param_grid, cv = 3, n_jobs = -1)

In [10]:
# Fit the model and obtain the best parameters
grid.fit(train_df_features,train_df_target)
grid.best_params_

{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 10}

In [11]:
# Fit a Random Forest Classifier with the best parameters obtained through Grid Search CV
clf=RandomForestClassifier().set_params(**grid.best_params_)
clf.fit(train_df_features,train_df_target)
# y_pred=clf.predict(train_df_target)


# print("Accuracy:", metrics.accuracy_score(train_df_target,y_pred))

RandomForestClassifier(bootstrap=False, max_depth=10, min_samples_leaf=2,
                       n_estimators=10)

In [12]:
# Predict the test set and output to a csv file
x_test = pd.get_dummies(test_df[features])
predictions = clf.predict(x_test)

output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv',index = False)

# From kaggle we have 76.8% Accuracy