In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
submission_data = pd.read_csv('./data/gender_submission.csv')

  from pandas import MultiIndex, Int64Index


In [2]:
#Data preprocessing
train = train.drop(['Name', 'Ticket', 'Cabin'], axis=1)

def replace_missing (attribute):
    return attribute.interpolate(inplace=True)

replace_missing(train['Age'])
replace_missing(train['Embarked'])

df = [train, test]
for data in df:
    data['Relatives'] = data['SibSp'] + data['Parch']
    data.loc[data['Relatives'] > 0, "Travelled_alone"] = 0
    data.loc[data['Relatives'] == 0, "Travelled_alone"] = 1

In [3]:
#Label Encoding for categorical variables
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_categorical = train[['Sex', 'Embarked']]

for column in train_categorical:
    label_encoder.fit(train_categorical[column])
    train[column] = label_encoder.transform(train_categorical[column])

In [4]:
#Split
X = train.drop(['Survived', 'PassengerId'], axis = 1)
y = train['Survived']

#### K-Fold cross-validation
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5, shuffle = True)
kf.get_n_splits(X)

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [5]:
#### Cross-validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

#DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [6]:
# "Learn" the mean from the training data
mean_train = np.mean(y_train)

# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train

# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.4f}".format(mae_baseline))

Baseline MAE is 0.4765


In [7]:
#XGBoostRegressor parameters
import time

start_time = time.time()

params = {
    'learning_rate': 0.9,
    'max_depth':10,
    'min_child_weight': 5,
    'eta':.3,
    'subsample': .3,
    'colsample_bytree': .9,
    'scale_pos_weight': 0.5,
    'gamma' : 0.5,
    'reg_lambda' : 0.1
}

params['eval_metric'] = "mae"
num_boost_round = 999

model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:0.30146
[1]	Test-mae:0.26950
[2]	Test-mae:0.24955
[3]	Test-mae:0.27363
[4]	Test-mae:0.29502
[5]	Test-mae:0.28183
[6]	Test-mae:0.28198
[7]	Test-mae:0.28308
[8]	Test-mae:0.27337
[9]	Test-mae:0.28611
[10]	Test-mae:0.28625
[11]	Test-mae:0.29651


In [8]:
print("Best mae: {:.4f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Best mae: 0.2495 with 3 rounds


In [9]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,0.281817,0.014453,0.294177,0.025738
1,0.260283,0.028272,0.285529,0.027492
2,0.261663,0.027957,0.284112,0.01193


In [10]:
gridsearch_params = [
    (learning_rate, max_depth, min_child_weight, eta, subsample, colsample_bytree, gamma, reg_lambda)
    for learning_rate in [.5, .6, .8, .9]
    for max_depth in range(3, 6, 9)
    for min_child_weight in [.5, .75, 1]
    for eta in [.005, .01, .02]
    for subsample in [.7, .8, .9]
    for colsample_bytree in [.6, .8, .9]
    for gamma in [.3 ,.4 , .5]
    for reg_lambda in [.1, .2, 1]
]

min_mae = float("Inf")
best_params = None

for learning_rate, max_depth, min_child_weight, eta, subsample, colsample_bytree, gamma, reg_lambda in gridsearch_params:
    print ("""CV with learning_rate = {}, max_depth = {}, min_child_weight = {},
    eta = {}, subsample = {}, colsample_bytree={}, gamma={}, reg_lambda={}""".format(learning_rate, max_depth, 
                                                          min_child_weight, eta, 
                                                          subsample, colsample_bytree, gamma, reg_lambda))
    
    # Update our parameters
    params['learning_rate'] = learning_rate
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    params['eta'] = eta
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample_bytree
    params['gamma'] = gamma
    params['reg_lambda'] = reg_lambda
    
    
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].idxmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (learning_rate, max_depth, min_child_weight, eta, subsample, colsample_bytree, gamma, reg_lambda)

print("Best params: {}, {}, {}, {}, {}, {}, {}, {}, MAE: {}".format(best_params[0], best_params[1], 
                                             best_params[2], best_params[3], 
                                             best_params[4], best_params[5], best_params[6], best_params[7], min_mae))

CV with learning_rate = 0.5, max_depth = 3, min_child_weight = 0.5,
    eta = 0.005, subsample = 0.7, colsample_bytree=0.6, gamma=0.3, reg_lambda=0.1
	MAE 0.26738 for 18 rounds
CV with learning_rate = 0.5, max_depth = 3, min_child_weight = 0.5,
    eta = 0.005, subsample = 0.7, colsample_bytree=0.6, gamma=0.3, reg_lambda=0.2
	MAE 0.2580524 for 22 rounds
CV with learning_rate = 0.5, max_depth = 3, min_child_weight = 0.5,
    eta = 0.005, subsample = 0.7, colsample_bytree=0.6, gamma=0.3, reg_lambda=1
	MAE 0.2583722 for 18 rounds
CV with learning_rate = 0.5, max_depth = 3, min_child_weight = 0.5,
    eta = 0.005, subsample = 0.7, colsample_bytree=0.6, gamma=0.4, reg_lambda=0.1
	MAE 0.2588628 for 22 rounds
CV with learning_rate = 0.5, max_depth = 3, min_child_weight = 0.5,
    eta = 0.005, subsample = 0.7, colsample_bytree=0.6, gamma=0.4, reg_lambda=0.2
	MAE 0.2618568 for 22 rounds
CV with learning_rate = 0.5, max_depth = 3, min_child_weight = 0.5,
    eta = 0.005, subsample = 0.7, colsamp

In [11]:
best_params = {
    'learning_rate' : 0.9,
    'max_depth' : 3,
    'min_child_weight' : 0.75,
    'eta' : 0.005,
    'subsample' : 0.9,
    'colsample_bytree' : 0.9,
    'gamma' : 0.5,
    'reg_lambda' : 0.2,
}

In [12]:
best_model = xgb.train(
    best_params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

print("Best MAE: {:.4f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-rmse:0.37798
[1]	Test-rmse:0.38056
[2]	Test-rmse:0.38377
[3]	Test-rmse:0.38899
[4]	Test-rmse:0.39158
[5]	Test-rmse:0.39360
[6]	Test-rmse:0.39144
[7]	Test-rmse:0.39365
[8]	Test-rmse:0.39353
[9]	Test-rmse:0.39829
[10]	Test-rmse:0.39916
[11]	Test-rmse:0.40080
[12]	Test-rmse:0.40157
[13]	Test-rmse:0.40155
[14]	Test-rmse:0.40057
[15]	Test-rmse:0.40037
[16]	Test-rmse:0.40058
[17]	Test-rmse:0.40237
[18]	Test-rmse:0.40230
[19]	Test-rmse:0.40260
[20]	Test-rmse:0.40480
[21]	Test-rmse:0.40212
[22]	Test-rmse:0.41353
[23]	Test-rmse:0.41601
[24]	Test-rmse:0.41640
[25]	Test-rmse:0.41639
[26]	Test-rmse:0.41638
[27]	Test-rmse:0.42128
[28]	Test-rmse:0.42140
[29]	Test-rmse:0.42144
[30]	Test-rmse:0.42132
[31]	Test-rmse:0.42130
[32]	Test-rmse:0.42128
[33]	Test-rmse:0.42123
[34]	Test-rmse:0.42136
[35]	Test-rmse:0.41825
[36]	Test-rmse:0.42354
[37]	Test-rmse:0.42347
[38]	Test-rmse:0.42336
[39]	Test-rmse:0.42363
[40]	Test-rmse:0.42351
[41]	Test-rmse:0.42358
[42]	Test-rmse:0.42357
[43]	Test-rmse:0.4236

In [13]:
mean_absolute_error(best_model.predict(dtest), y_test)

0.30118076898393903

In [14]:
from joblib import dump

dump(best_model, "xgboost_titanic_disaster.dat")

['xgboost_titanic_disaster.dat']

In [15]:
test_categorical = test[['Sex', 'Embarked']]

test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

for column in test_categorical:
    label_encoder.fit(test_categorical[column])
    test[column] = label_encoder.transform(test_categorical[column])
    
dtest_opt = xgb.DMatrix(test)

In [16]:
pr = best_model.predict(dtest_opt)

submission_data['Survived'] = pr.round(0)

submission_data['Survived'] = submission_data['Survived'].astype(int)

In [17]:
submission_data.to_csv('submission.csv', index=False)