# Titanic Machine Learning Models 

### By Henry Yu 2017

Here, I will be using knn, logistic regression, random forest and general boosting models for my predictions 

In [1]:
# import libraries 

import pandas as pd 
import numpy as np

from pandas import Series, DataFrame

In [125]:
# import train and test 

train = pd.read_csv(r'train.csv')
test = pd.read_csv(r'test.csv')

In [126]:
# import 
import matplotlib as plt 
import seaborn as sns 
index = test['PassengerId']

In [127]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [128]:
# change a few things to category 
# we can drop cabin and ticket, those aren't too relevant to the analysis 
train.drop('Cabin', axis = 1, inplace = True)
test.drop('Cabin', axis = 1, inplace = True)
train.drop('Ticket', axis = 1, inplace = True)
test.drop('Ticket', axis = 1, inplace = True)

In [129]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(3)
memory usage: 76.6+ KB


In [130]:
# features to use for analysis 
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked','Family']

In [131]:
# convert objects into categories for sex, embarked 
train['Sex'] = train['Sex'].astype('category').cat.codes
train['Embarked'] = train['Embarked'].astype('category').cat.codes
test['Sex'] = test['Sex'].astype('category').cat.codes
test['Embarked'] = test['Embarked'].astype('category').cat.codes

In [132]:
train['Family'] = train['SibSp'] + train['Parch']

In [133]:
test['Family'] = test['SibSp'] + train['Parch']

In [134]:

train.Age.fillna(value = -1,  inplace = True)
test.Age.fillna(value = -1,  inplace = True)
test.Fare.fillna(value = -1,  inplace = True)

# change that if age is null, change it into something else (-1)
# use fillna 

In [135]:
X = train[features]
Y = train['Survived']
test = test[features]

In [136]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null int8
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int8
Family      891 non-null int64
dtypes: float64(2), int64(4), int8(2)
memory usage: 50.5 KB


In [137]:
# import train test split, knn, logistic regression, gbm 

In [138]:
from sklearn.cross_validation import train_test_split
X_train, X_test,Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [139]:
# KNN 
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X,Y)
test_predictions = knn.predict(test)

In [140]:
# get the test predictions to a csv file 
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 8 columns):
Pclass      418 non-null int64
Sex         418 non-null int8
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Embarked    418 non-null int8
Family      418 non-null float64
dtypes: float64(3), int64(3), int8(2)
memory usage: 23.7 KB


In [141]:
submission = pd.DataFrame({
        "PassengerId": index,
        "Survived": test_predictions
    })

In [142]:
columnsTitles=["PassengerId", "Survived"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('titanic_knn_2.csv', index=False)

In [145]:
#KNN 1 scored .65550
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X, Y)
log_predictions= logreg.predict(test)

In [146]:
submission = pd.DataFrame({
        "PassengerId": index,
        "Survived": log_predictions
    })

columnsTitles=["PassengerId", "Survived"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('titanic_log.csv', index=False)

In [162]:
from sklearn.ensemble import GradientBoostingClassifier

common_args = {'max_depth': 6, 'n_estimators': 500, 'subsample': 0.8, 'random_state': 2}

gbm = GradientBoostingClassifier(learning_rate=0.005, **common_args)

gbm.fit(X, Y)
gbm_predictions = gbm.predict(test)

In [163]:
submission = pd.DataFrame({
        "PassengerId": index,
        "Survived": gbm_predictions
    })

columnsTitles=["PassengerId", "Survived"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('titanic_gbm5.csv', index=False)

In [199]:
# from Kaggle, optimal RF
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=-1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [200]:
rf_predictions = rf.predict(test)
submission = pd.DataFrame({
        "PassengerId": index,
        "Survived": rf_predictions
    })

columnsTitles=["PassengerId", "Survived"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('titanic_rf_fork.csv', index=False)

In [165]:
from sklearn.ensemble import RandomForestClassifier

rf_args = {'max_depth': 5, 'n_estimators': 575,  'random_state': 2, 'n_jobs': -1,'min_samples_leaf': 2, 'max_features' : 'sqrt' }

rf = RandomForestClassifier(**rf_args)
rf.fit(X, Y)
rf_predictions = rf.predict(test)


In [166]:
submission = pd.DataFrame({
        "PassengerId": index,
        "Survived": rf_predictions
    })

columnsTitles=["PassengerId", "Survived"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('titanic_rfc3.csv', index=False)

In [167]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 8 columns):
Pclass      891 non-null int64
Sex         891 non-null int8
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int8
Family      891 non-null int64
dtypes: float64(2), int64(4), int8(2)
memory usage: 50.5 KB


In [168]:
import xgboost

In [195]:
xgb = xgboost.XGBClassifier(n_estimators = 100, learning_rate = 0.1, min_child_weight = 1, max_depth = 5)

In [196]:
xgb.fit(X, Y)

XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=100,
       nthread=-1, objective='binary:logistic', seed=0, silent=True,
       subsample=1)

In [197]:
xgb_predictions = xgb.predict(test)

In [198]:
submission = pd.DataFrame({
        "PassengerId": index,
        "Survived": xgb_predictions
    })

columnsTitles=["PassengerId", "Survived"]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv('titanic_xgb6.csv', index=False)

In [None]:
# every single one of these models don't yield any net increases anymore 
# XGBoost, randomforest, knn, logistic regression, gradient boosting do not have anything 
# we need more feature boosting on it 