In [2]:
import warnings
warnings.filterwarnings(action='once')

In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

  from numpy.core.umath_tests import inner1d


In [4]:
data = pd.read_csv('./data/titanic_train.csv', index_col = 'PassengerId')

In [5]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
for col in data.columns:
    if data[col].hasnans:
        nans = data[col].shape[0] - data.loc[data[col].notna()].shape[0]
        print('{} col has {} NaNs out of {} vals'.format(col, nans, data[col].shape[0]))

Age col has 177 NaNs out of 891 vals
Cabin col has 687 NaNs out of 891 vals
Embarked col has 2 NaNs out of 891 vals


In [7]:
data_pclass_median = data.groupby(by = ['Pclass']).median()
data_pclass_median.head()
data_pclass_median_age = data_pclass_median['Age']
data_pclass_median_fare = data_pclass_median['Fare']

In [8]:
# data[data['Age'].isna() == True]
# data.loc[data['Age'].isna() == True , ['Age']]
data_age_na = data.loc[data['Age'].isna() == True, ['Pclass']]
imputed_ages = data_age_na.join(data_pclass_median_age, on = 'Pclass', lsuffix = ' pclass', rsuffix = ' other')
data.loc[data['Age'].isna() == True, ['Age']] = imputed_ages['Age']

In [9]:
data['Ticket'].nunique()

681

In [10]:
data.drop(columns = ['Name','Ticket','Cabin'], inplace = True)

In [11]:
to_category = ['Pclass','Sex', 'Embarked']
for col in to_category:
    data[col] = data[col].astype('category')
data['Survived'] = data['Survived'].astype('bool')

In [12]:
data = pd.get_dummies(data, columns = to_category, prefix = to_category)
data.drop(columns = ['Sex_male'], inplace = True)
data.head()

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,False,22.0,1,0,7.25,0,0,1,0,0,0,1
2,True,38.0,1,0,71.2833,1,0,0,1,1,0,0
3,True,26.0,0,0,7.925,0,0,1,1,0,0,1
4,True,35.0,1,0,53.1,1,0,0,1,0,0,1
5,False,35.0,0,0,8.05,0,0,1,0,0,0,1


In [13]:
X = data[[col for col in data.columns if col != 'Survived']]
y = data['Survived']
X.head()

Unnamed: 0_level_0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,1,0,7.25,0,0,1,0,0,0,1
2,38.0,1,0,71.2833,1,0,0,1,1,0,0
3,26.0,0,0,7.925,0,0,1,1,0,0,1
4,35.0,1,0,53.1,1,0,0,1,0,0,1
5,35.0,0,0,8.05,0,0,1,0,0,0,1


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3)

In [15]:
# X_proc = X.fillna(X.median())

In [16]:
make_pipeline(StandardScaler(), svm.SVC())

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [17]:
model = make_pipeline(StandardScaler(), svm.SVC())
param_grid = {'svc__kernel':['poly'],
             'svc__degree':[2,3,4,5],
             'svc__C':[0.1,1]}
search = GridSearchCV(model, param_grid, cv = 5)
search.fit(X, y)
print('best score = {}'.format(search.best_score_))
search.best_params_

best score = 0.8204264870931538


{'svc__C': 1, 'svc__degree': 3, 'svc__kernel': 'poly'}

In [18]:
model = make_pipeline(StandardScaler(), svm.SVC())
param_grid = {'svc__kernel':['rbf'],
             'svc__gamma':[0.1, 1, 10],
             'svc__C':[0.1,1]}
search = GridSearchCV(model, param_grid, cv = 5)
search.fit(X, y)
print('best score = {}'.format(search.best_score_))
search.best_params_

best score = 0.8204264870931538


{'svc__C': 1, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}

In [19]:
model = make_pipeline(StandardScaler(), svm.SVC(C = 1, kernel='rbf', degree=3, gamma = 0.1))
scores = cross_val_score(model, X, y, cv = 5)
print('Accuracy = {} +/- {}'.format(scores.mean(), scores.std() * 2))

Accuracy = 0.8204550556478202 +/- 0.031999147267414915


In [20]:
model = make_pipeline(StandardScaler(), RandomForestClassifier())
param_grid = {'randomforestclassifier__n_estimators':[10,20,30,40],
             'randomforestclassifier__max_depth':[2,3,4,5,6],
             'randomforestclassifier__criterion':['gini','entropy']}
search = GridSearchCV(model, param_grid, cv = 5)
search.fit(X, y)
print('best score = {}'.format(search.best_score_))
search.best_params_

best score = 0.8260381593714927


{'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__max_depth': 5,
 'randomforestclassifier__n_estimators': 20}

In [21]:
model = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 10, max_depth = 5))
scores = cross_val_score(model, X, y, cv = 5)
print('Accuracy = {} +/- {}'.format(scores.mean(), scores.std() * 2))

Accuracy = 0.8171282086200128 +/- 0.048460194780699496


In [22]:
model.fit(X, y)
model.steps[1][1].feature_importances_

array([0.09769598, 0.03303253, 0.01887805, 0.14998437, 0.05393554,
       0.03012827, 0.06951252, 0.50093466, 0.01826694, 0.00443183,
       0.02319932])

In [23]:
feature_scores = np.array(model.steps[1][1].feature_importances_)
feature_scores[feature_scores > 0.1]

array([0.14998437, 0.50093466])

In [24]:
model = make_pipeline(StandardScaler(), GradientBoostingClassifier())
param_grid = {'gradientboostingclassifier__n_estimators':[25, 50, 75, 100],
             'gradientboostingclassifier__max_depth':[2,3,4,5,6]}
search = GridSearchCV(model, param_grid, cv = 5)
search.fit(X, y)
print('best score = {}'.format(search.best_score_))
search.best_params_

best score = 0.835016835016835


{'gradientboostingclassifier__max_depth': 4,
 'gradientboostingclassifier__n_estimators': 100}

In [25]:
model = make_pipeline(StandardScaler(), GradientBoostingClassifier(n_estimators = 100, max_depth = 4))
scores = cross_val_score(model, X, y, cv = 5)
print('Accuracy = {} +/- {}'.format(scores.mean(), scores.std() * 2))

Accuracy = 0.8350366889413987 +/- 0.03182614874822419


In [26]:
make_pipeline(StandardScaler(), xgb.XGBClassifier(max_depth = 5, reg_lambda = 1))

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbclassifier', XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
       colsample_bynode=None, colsample_bytree=None, gamma=None,
       gpu_id=None, importance_type='gain', interaction_constra...os_weight=None, subsample=None,
       tree_method=None, validate_parameters=None, verbosity=None))])

In [27]:
model = make_pipeline(StandardScaler(), xgb.XGBClassifier(max_depth = 5, reg_lambda = 1))
param_grid = {'xgbclassifier__max_depth':[2,3,4,5],
             'xgbclassifier__reg_lambda':[0.1, 1, 10],
             'xgbclassifier__learning_rate':[0.1, 1, 10]}
search = GridSearchCV(model, param_grid, cv = 5)
search.fit(X, y)
search.best_params_

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


{'xgbclassifier__learning_rate': 0.1,
 'xgbclassifier__max_depth': 5,
 'xgbclassifier__reg_lambda': 10}

In [28]:
model = make_pipeline(StandardScaler(), xgb.XGBClassifier(max_depth = 5, reg_lambda = 10, learning_rate = 0.1))
scores = cross_val_score(model, X, y, cv = 5)
print('Accuracy = {} +/- {}'.format(scores.mean(), scores.std() * 2))

  if diff:
  if diff:
  if diff:


Accuracy = 0.8373154071566399 +/- 0.03594723049310011


  if diff:
  if diff:


In [29]:
model = make_pipeline(StandardScaler(), xgb.XGBClassifier(max_depth = 5, reg_lambda = 10, learning_rate = 0.1))
model.fit(X, y)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
  ...cale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None))])

In [30]:
test_data = pd.read_csv('./data/titanic_test.csv', index_col = 'PassengerId')

In [31]:
for col in test_data:
    if test_data[col].hasnans:
        print('{} has {} NaNs'.format(col, test_data[test_data[col].isna() == True].shape))

Age has (86, 10) NaNs
Fare has (1, 10) NaNs
Cabin has (327, 10) NaNs


In [32]:
test_data_age_na = test_data.loc[test_data['Age'].isna() == True, ['Pclass']]
imputed_ages = test_data_age_na.join(data_pclass_median_age, on = 'Pclass', lsuffix = ' pclass', rsuffix = ' other')
test_data.loc[test_data['Age'].isna() == True, ['Age']] = imputed_ages['Age']

In [33]:
test_data_fare_na = test_data.loc[test_data['Fare'].isna() == True, ['Pclass']]
imputed_fare = test_data_fare_na.join(data_pclass_median_fare, on = 'Pclass', lsuffix = ' pclass', rsuffix = ' other')
test_data.loc[test_data['Fare'].isna() == True, ['Fare']] = imputed_fare['Fare']

In [34]:
test_data.drop(columns = ['Name','Ticket','Cabin'], inplace = True)
to_category = ['Pclass','Sex', 'Embarked']
for col in to_category:
    test_data[col] = test_data[col].astype('category')
test_data = pd.get_dummies(test_data, columns = to_category, prefix = to_category)
test_data.drop(columns = ['Sex_male'], inplace = True)

In [35]:
y_pred = model.predict(test_data)
y_pred = np.array(y_pred, dtype = 'uint8')

  if diff:


In [36]:
out = pd.DataFrame({'Survived':y_pred}, index = test_data.index)

In [37]:
out.shape

(418, 1)

In [38]:
out.to_csv('titanic_sub_xbg_better_age.csv')