In [1]:
import xgboost as xgb

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# Location of the Titanic training CSV (presumably a Colab working directory —
# adjust when running elsewhere).
TITANIC_CSV = "/content/titanic_train.csv"
df = pd.read_csv(TITANIC_CSV)

In [5]:
# Remove the columns that are not used as model inputs.
# errors='ignore' keeps this cell re-runnable: because `df` is overwritten,
# a second execution would otherwise raise KeyError on the already-dropped columns.
df = df.drop(columns=['Ticket', 'Cabin', 'Name', 'PassengerId'], errors='ignore')

# Impute median Age for NA Age values
imputed_age = df["Age"].median()
df["Age"] = df["Age"].fillna(imputed_age)

# Discard any row that still has a missing value in the remaining columns.
df = df.dropna()

# Preview the first rows, transposed so each column reads as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
Survived,0,1,1,1,0
Pclass,3,1,3,1,3
Sex,male,female,female,female,male
Age,22.0,38.0,26.0,35.0,35.0
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Fare,7.25,71.2833,7.925,53.1,8.05
Embarked,S,C,S,S,S


In [6]:
# Hold out 33% of the rows for validation; random_state is fixed at 11.
df_train, df_val = train_test_split(df, test_size=0.33, random_state=11)

# Pull the target out of each split: pop() returns the 'Survived' column and
# removes it from the frame, leaving only feature columns behind.
y_train = df_train.pop('Survived').values
y_val = df_val.pop('Survived').values

In [7]:
# Convert each split into a list of per-row dicts (one {column: value} dict
# per row), the input format consumed by the DictVectorizer below.
dict_train, dict_val = (
    frame.to_dict(orient='records') for frame in (df_train, df_val)
)


In [8]:
from sklearn.feature_extraction import DictVectorizer
# Fit the vectorizer on the training records only, then reuse the fitted
# mapping for the validation records so both matrices share the same columns.
# sparse=False requests dense arrays rather than sparse matrices.
dv = DictVectorizer(sparse=False)
X_train, X_val = dv.fit_transform(dict_train), dv.transform(dict_val)

In [9]:
# Wrap the training matrix and labels in a DMatrix, carrying over the
# column names produced by the vectorizer.
dtrain = xgb.DMatrix(
    data=X_train,
    label=y_train,
    feature_names=dv.feature_names_,
)

In [10]:
# Same wrapping for the validation matrix and labels, with the same
# vectorizer-derived column names.
dval = xgb.DMatrix(
    data=X_val,
    label=y_val,
    feature_names=dv.feature_names_,
)

In [17]:
# Booster configuration passed to xgb.train().
# Parameter reference: https://xgboost.readthedocs.io/en/latest/parameter.html
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    # 'verbosity' supersedes the deprecated 'silent' flag; 0 means silent.
    'verbosity': 0,
}

In [18]:
# Baseline booster: 10 boosting rounds on the training DMatrix.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=10,
)

In [19]:
# Score the validation set with the trained booster.
y_pred = model.predict(data=dval)

In [20]:
# Validation ROC AUC of the baseline model (displayed as the cell output).
roc_auc_score(y_true=y_val, y_score=y_pred)

0.877753411306043

In [21]:
# Datasets to evaluate during training, each paired with the label used
# in the progress log.
watchlist = [
    (dtrain, 'train'),
    (dval, 'val'),
]

# Retrain for 100 rounds, logging the evaluation metric on both datasets
# every 10 rounds so train and validation scores can be compared.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=10,
)


[0]	train-auc:0.88691	val-auc:0.855117
[10]	train-auc:0.952023	val-auc:0.873757
[20]	train-auc:0.973164	val-auc:0.875512
[30]	train-auc:0.983758	val-auc:0.875049
[40]	train-auc:0.988075	val-auc:0.873538
[50]	train-auc:0.990989	val-auc:0.871784
[60]	train-auc:0.992433	val-auc:0.868275
[70]	train-auc:0.99398	val-auc:0.869737
[80]	train-auc:0.995335	val-auc:0.866277
[90]	train-auc:0.996001	val-auc:0.865936
[99]	train-auc:0.996565	val-auc:0.865643
