In [262]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [494]:
dtypes = {
    "PassengerId" : "int64",
    "Survived": "int64",
    "Pclass" : "object",
    "Sex" : "object",
    "Age" : "float64",
    "SibSp" : "int64",
    "Parch" : "int64",
    "Fare" : "float64",
    "Cabin" : "object",
    "Embarked" : "object"
}

In [495]:
train_df = pd.read_csv('train.csv',sep=',',header=0,usecols=["PassengerId","Survived",
            "Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"],dtype=dtypes,na_values=[' '])
test_df = pd.read_csv('test.csv',sep=',',header=0,usecols=["PassengerId","Pclass","Sex","Age",
                                "SibSp","Parch","Fare","Cabin","Embarked"],dtype=dtypes,na_values=[' '])

df = pd.concat([train_df,test_df])
df.set_index("PassengerId",inplace=True)
df.head(3)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,22.0,,S,7.25,0,3,male,1,0.0
2,38.0,C85,C,71.2833,0,1,female,1,1.0
3,26.0,,S,7.925,0,3,female,0,1.0


In [496]:
df.dropna(subset=['Embarked'],inplace=True)
df.Age.fillna(X.Age.mean(),inplace=True)
df.Cabin.fillna("Without",inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1307 entries, 1 to 1309
Data columns (total 9 columns):
Age         1307 non-null float64
Cabin       1307 non-null object
Embarked    1307 non-null object
Fare        1306 non-null float64
Parch       1307 non-null int64
Pclass      1307 non-null object
Sex         1307 non-null object
SibSp       1307 non-null int64
Survived    889 non-null float64
dtypes: float64(3), int64(2), object(4)
memory usage: 102.1+ KB


In [504]:
df['Cabin'] = [(x.Cabin)[0] if x.Cabin != float("NaN") else "Without" for index,x in df.iterrows()]
df['Child'] = [1 if x.Age < 15 else 0 for index,x in df.iterrows()]
df['FSize'] = [x.SibSp+x.Parch+1 for index,x in df.iterrows()]
df['Alone'] = [0 if x.FSize != 1 else 1 for index,x in df.iterrows()]
df['MFam'] = [1 if x.FSize <= 4 and x.FSize >= 2 else 0 for index,x in df.iterrows()]
df['LFam'] = [1 if x.FSize > 4 else 0 for index,x in df.iterrows()]
df.head(3)

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Child,FSize,Alone,MFam,LFam
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,22.0,W,S,7.25,0,3,male,1,0.0,0,2,0,1,0
2,38.0,C,C,71.2833,0,1,female,1,1.0,0,2,0,1,0
3,26.0,W,S,7.925,0,3,female,0,1.0,0,1,1,0,0


In [498]:
num_feat = df.select_dtypes('number').columns.values
cat_feat = df.select_dtypes('object').columns.values 
df_num = df[num_feat].drop('Survived',axis=1)
df_cat = df[cat_feat]

df_num = (df_num-df_num.mean())/df_num.std()

df_cat = pd.get_dummies(df_cat)

new_df = pd.concat([df_num,df_cat,df.Survived],axis=1)

train = new_df[df.Survived.notna()]
test = new_df[df.Survived.isna()].drop('Survived',axis=1)

train.head(3)

Unnamed: 0_level_0,Age,Fare,Parch,SibSp,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,...,Cabin_W,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.104522,-0.501757,-0.445237,0.480088,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,1,0.0
2,0.806132,0.735219,-0.445237,0.480088,0,0,1,0,0,0,...,0,1,0,0,1,0,0,1,0,1.0
3,0.123141,-0.488718,-0.445237,-0.479354,0,0,0,0,0,0,...,1,0,0,1,0,0,1,1,0,1.0


In [499]:
X = train.drop('Survived',axis=1)
y = train.Survived

In [507]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
folds = 5

kfold = KFold(n_splits=folds)
avg = 0
for train,test in kfold.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train],X.iloc[test],y.iloc[train],y.iloc[test]
    gd_model = GradientBoostingClassifier().fit(X_train,y_train)
    print(gd_model.score(X_test,y_test))
    avg+=gd_model.score(X_test,y_test)
    
print("Avg: ",avg/folds)

0.7865168539325843
0.8258426966292135
0.8258426966292135
0.7808988764044944
0.8531073446327684
Avg:  0.8144416936456548


In [520]:
from sklearn.ensemble import AdaBoostClassifier

avg = 0
for train,test in kfold.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train],X.iloc[test],y.iloc[train],y.iloc[test]
    ada_model = AdaBoostClassifier(learning_rate=0.5).fit(X_train,y_train)
    print(ada_model.score(X_test,y_test))
    avg+=ada_model.score(X_test,y_test)
print("Avg: ",avg/folds)

0.7471910112359551
0.8314606741573034
0.7865168539325843
0.8258426966292135
0.8248587570621468
Avg:  0.8031739986034406


In [519]:
from sklearn.ensemble import RandomForestClassifier
avg = 0
for train,test in kfold.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train],X.iloc[test],y.iloc[train],y.iloc[test]
    rf_model = RandomForestClassifier(n_estimators=100).fit(X_train,y_train)
    print(rf_model.score(X_test,y_test))
    avg+=rf_model.score(X_test,y_test)
print("Avg: ",avg/folds)

0.7471910112359551
0.7865168539325843
0.8089887640449438
0.7640449438202247
0.8248587570621468
Avg:  0.786320066019171


In [510]:
from sklearn.svm import SVC
avg = 0
for train,test in kfold.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train],X.iloc[test],y.iloc[train],y.iloc[test]
    sv_model = SVC().fit(X_train,y_train)
    print(sv_model.score(X_test,y_test))
    avg+=sv_model.score(X_test,y_test)
print("Avg: ",avg/folds)

0.8202247191011236
0.8033707865168539
0.797752808988764
0.7808988764044944
0.8305084745762712
Avg:  0.8065511331175014


In [523]:
from sklearn.ensemble import VotingClassifier

avg=0
for train,test in kfold.split(X):
    X_train, X_test, y_train, y_test = X.iloc[train],X.iloc[test],y.iloc[train],y.iloc[test]
    v_model = VotingClassifier(estimators=[
        ('rf',rf_best_model),
        ('ada',ada_best_model),
        ('gb',gd_best_model),
         ('svc',sv_model)]).fit(X_train,y_train)
    print(v_model.score(X_test,y_test))
    avg+=v_model.score(X_test,y_test)
print("Avg: ",avg/folds)

0.8202247191011236
0.8033707865168539
0.8089887640449438
0.7696629213483146
0.8361581920903954
Avg:  0.8076810766203263


In [444]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  {'learning_rate': np.linspace(0.1,0.01,5), 'loss': ['deviance','exponential'],
  'n_estimators': np.arange(50,100,5) }
 ]

search = GridSearchCV(gd_model,param_grid).fit(X_test,y_test)
gd_best_model = search.best_estimator_
print(accuracy_score(gd_best_model.predict(X_test),y_test))
print(recall_score(gd_best_model.predict(X_test),y_test))
print(precision_score(gd_best_model.predict(X_test),y_test))

0.8721910112359551
0.8560606060606061
0.8100358422939068


In [446]:
param_grid = [
  {'learning_rate': np.linspace(0.1,0.01,5), 'algorithm' : ['SAMME', 'SAMME.R'],
  'n_estimators': np.arange(50,100,5) }
 ]

search = GridSearchCV(ada_model,param_grid).fit(X_test,y_test)
ada_best_model = search.best_estimator_
print(accuracy_score(ada_best_model.predict(X_test),y_test))
print(recall_score(ada_best_model.predict(X_test),y_test))
print(precision_score(ada_best_model.predict(X_test),y_test))

0.8146067415730337
0.7837837837837838
0.7275985663082437


In [448]:
param_grid = [
  {'n_estimators': np.arange(10,100,10) }
 ]

search = GridSearchCV(rf_model,param_grid).fit(X_test,y_test)
rf_best_model = search.best_estimator_
print(accuracy_score(rf_best_model.predict(X_test),y_test))
print(recall_score(rf_best_model.predict(X_test),y_test))
print(precision_score(rf_best_model.predict(X_test),y_test))

0.9859550561797753
0.9890909090909091
0.974910394265233


In [450]:
param_grid = [
  {'voting':['hard','soft'] }
 ]

search = GridSearchCV(new_model,param_grid).fit(X_test,y_test)
best_model = search.best_estimator_
print(accuracy_score(best_model.predict(X_test),y_test))
print(recall_score(best_model.predict(X_test),y_test))
print(precision_score(best_model.predict(X_test),y_test))

0.8806179775280899
0.8702290076335878
0.8172043010752689


## Write Results

In [451]:
test.Fare.fillna(test.Fare.mean(),inplace=True)

y_predict = rf_best_model.predict(test)
result_df = pd.DataFrame(columns=['PassengerId','Survived'],
                         dtype=np.int64)
result_df['PassengerId'] = test.index
result_df['Survived'] = y_predict.astype(np.int64)
result_df.to_csv('result.csv',index=False)
result_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0
