In [61]:
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.set_option('display.max_columns', None)

# Seed NumPy's global generator so that code drawing from it is repeatable
# when the notebook is run top to bottom.
np.random.seed(seed=2018)

In [6]:
# Read both CSV files from the local input/ directory, in (train, test) order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train.csv', 'input/test.csv')
)

In [9]:
# Display 10 randomly chosen rows of the training frame.
train.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
65,66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
819,820,0,3,"Skoog, Master. Karl Thorsten",male,10.0,3,2,347088,27.9,,S
447,448,1,1,"Seward, Mr. Frederic Kimber",male,34.0,0,0,113794,26.55,,S
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S
394,395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt...",female,24.0,0,2,PP 9549,16.7,G6,S
790,791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C
251,252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29.0,1,1,347054,10.4625,G6,S
582,583,0,2,"Downton, Mr. William James",male,54.0,0,0,28403,26.0,,S
570,571,1,2,"Harris, Mr. George",male,62.0,0,0,S.W./PP 752,10.5,,S


In [66]:
# feature engineering
def _prepare_features(frame):
    """Return a copy of `frame` with missing values replaced by -1 and a
    `Sex_cat` column that is 1 where `Sex` equals 'male' and 0 otherwise."""
    prepared = frame.fillna(-1)
    prepared['Sex_cat'] = prepared['Sex'].map(lambda sex: 1 if sex == 'male' else 0)
    return prepared


# Apply the identical preparation to both frames so their columns stay aligned.
train = _prepare_features(train)
test = _prepare_features(test)


In [67]:
# Columns fed to the models; 'Sex_cat' is the numeric encoding of 'Sex'.
feats = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Sex_cat',
]

# Plain NumPy views of the feature matrix and the target column.
X = train.loc[:, feats].values
y = train.loc[:, 'Survived'].values

In [69]:
# Display the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_cat
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,-1,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,-1,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,-1,Q,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,-1,S,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,-1,S,0


In [68]:
# Hold out 30% of the labelled rows for validation.
# random_state is fixed explicitly: without it the split is drawn from NumPy's
# global generator, so it changes depending on which earlier cells (for example
# `train.sample(10)`) have already consumed random numbers, and the reported
# accuracy cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018
)

In [48]:
# Baseline: random forest with 100 trees of depth at most 7.
# random_state is fixed so the bootstrap samples and feature sub-sampling do
# not depend on the state of NumPy's global generator at the time this cell
# runs; otherwise the validation accuracy varies from run to run.
model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=2018)
model.fit(X_train, y_train)

# Predictions on the held-out split, scored in the next cell.
y_pred = model.predict(X_test)


In [49]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8470149253731343

In [54]:
def _xgb_params(max_depth, n_estimators, learning_rate, random_state):
    """Build one XGBoost keyword-argument dict.

    `n_jobs` (4) and `min_child_weight` (8) are shared by every configuration
    in this notebook; the remaining four settings vary per configuration.
    """
    return {
        'n_jobs': 4,
        'max_depth': max_depth,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'min_child_weight': 8,
        'random_state': random_state,
    }


# Many trees, small learning rate.
xgb_params_1 = _xgb_params(
    max_depth=8, n_estimators=2500, learning_rate=0.04, random_state=4096
)

# Deeper trees, fewer of them.
xgb_params_2 = _xgb_params(
    max_depth=14, n_estimators=420, learning_rate=0.056693922378212164, random_state=2018
)

# Smallest configuration (200 trees of depth 8).
xgb_params_s = _xgb_params(
    max_depth=8, n_estimators=200, learning_rate=0.05, random_state=2018
)

In [59]:
# Train the small XGBoost configuration on the training split
# (fit returns the fitted estimator, so the call can be chained).
model = XGBClassifier(**xgb_params_s).fit(X_train, y_train)

# Predictions on the held-out split, scored in the next cell.
y_pred = model.predict(X_test)

In [60]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8470149253731343

In [63]:
# 3-fold cross-validation; rows are shuffled with a fixed seed before splitting.
cv = KFold(n_splits=3, shuffle=True, random_state=2018)

# Per-model list of fold accuracies, filled in by perform_scoring.
scores = {model_key: [] for model_key in ('x1', 'x2', 'x_s')}

def perform_scoring(m, y_true, y_pred):
    """Print the accuracy of one model on one fold and record it.

    Parameters
    ----------
    m : str
        Key of the model in the module-level `scores` dict.
    y_true, y_pred
        True and predicted labels, passed straight to `accuracy_score`.

    The accuracy is appended to `scores[m]`; nothing is returned.
    """
    fold_accuracy = accuracy_score(y_true, y_pred)
    print(m + " accuracy: ", fold_accuracy)
    scores[m].append(fold_accuracy)


# Configurations to compare. The key is the name used in `scores`; the value is
# (XGBoost parameters, name of the `train` column that receives predictions).
cv_models = {
    'x1': (xgb_params_1, 'XGB_1_results'),
    'x2': (xgb_params_2, 'XGB_2_results'),
    'x_s': (xgb_params_s, 'XGB_s_results'),
}

# Out-of-fold predictions: each row is predicted by the model of the one fold
# in which that row was held out. Previously the result columns were
# overwritten on every fold with `model.predict(X)`, so they ended up holding
# the last fold's model applied to all rows, two thirds of which it had been
# trained on.
# KFold assigns every row to exactly one test fold, so every slot is filled.
oof_pred = {name: np.empty_like(y) for name in cv_models}

for fold, (train_idx, test_idx) in enumerate(cv.split(X), start=1):
    print("fold: ", fold)

    for name, (params, _) in cv_models.items():
        # Fit on the training part of the fold, score on the held-out part.
        model = XGBClassifier(**params)
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])
        perform_scoring(name, y[test_idx], y_pred)

        oof_pred[name][test_idx] = y_pred

# Store the out-of-fold predictions next to the training rows.
for name, (_, column) in cv_models.items():
    train[column] = oof_pred[name]
    
    
# Mean and standard deviation of the fold accuracies for each model.
for model_key, fold_scores in scores.items():
    print(model_key, np.mean(fold_scores), np.std(fold_scores))


fold:  1
x1 accuracy:  0.8316498316498316
x2 accuracy:  0.8451178451178452
x_s accuracy:  0.8518518518518519
fold:  2
x1 accuracy:  0.7878787878787878
x2 accuracy:  0.8148148148148148
x_s accuracy:  0.8181818181818182
fold:  3
x1 accuracy:  0.8047138047138047
x2 accuracy:  0.7878787878787878
x_s accuracy:  0.7845117845117845
x1 0.808080808080808 0.018027360723017978
x2 0.8159371492704827 0.023381219591469906
x_s 0.8181818181818182 0.027491467371303917


In [70]:
# final
# Refit the small XGBoost configuration on every labelled row
# (fit returns the fitted estimator, so the call can be chained).
model = XGBClassifier(**xgb_params_s).fit(X, y)

# Predict the test frame using the same feature columns as in training.
X_submission = test[feats].values
y_pred = model.predict(X_submission)

In [73]:
# Attach the predictions to the test frame as the 'Survived' column.
test = test.assign(Survived=y_pred)

In [74]:
# Write the submission (PassengerId, Survived) to a timestamped CSV so earlier
# submissions are never overwritten.
submission_path = 'output/' + time.strftime("%Y%m%d_%H%M%S") + '.csv'

# to_csv does not create missing directories; make sure the target directory
# exists so the notebook also runs on a fresh checkout.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)

test[['PassengerId', 'Survived']].to_csv(submission_path, index=False)