### Prepare the data
+  Remove some columns
+  Target encoding for categorical columns
+  Fill NAs

In [59]:
import pandas as pd

In [60]:
data = pd.read_csv(filepath_or_buffer="data/train.csv")

In [61]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [62]:
data.drop(labels=["PassengerId","Name","Ticket","Cabin"], inplace=True, axis=1)

In [63]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [64]:
import category_encoders as ce

In [65]:
encoder = ce.TargetEncoder(cols=['Embarked','Sex'])

In [9]:
encoded = encoder.fit_transform(data,data['Survived'])

In [68]:
encoded.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


In [69]:
encoded['Age']=encoded.fillna(value=encoded['Age'].mean())

### Using XGboost with bayesian optimization for hyperparams
+ Using BayesianOptimization

In [70]:
import xgboost as xgb
from bayes_opt import BayesianOptimization

In [71]:
xgboost_data = encoded

In [72]:
xgboost_data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0.0,1,0,7.25,0,1,0,0,1
1,1,1,1.0,1,0,71.2833,1,0,1,0,0
2,1,3,1.0,0,0,7.925,1,0,0,0,1
3,1,1,1.0,1,0,53.1,1,0,0,0,1
4,0,3,0.0,0,0,8.05,0,1,0,0,1


In [73]:
y = xgboost_data['Survived']
x = xgboost_data.drop(['Survived'], axis=1)

In [74]:
xgtrain = xgb.DMatrix(x, label=y)

In [75]:
num_rounds = 100
random_state = 2018
num_iter = 25
init_points = 5
params = {'eta': 0.1,'silent': 1,'eval_metric': 'auc','verbose_eval': 0,'seed': random_state}

In [76]:
def run_xgboost(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):

    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=2,
             seed=random_state,
             callbacks=[xgb.callback.early_stop(50)])
    
    return cv_result['test-auc-mean'].values[-1]

In [77]:
val = run_xgboost(min_child_weight=5,colsample_bytree=0.8,max_depth=2,subsample=0.6,gamma=3,alpha=5)
print(val)

Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[0]	train-auc:1+0	test-auc:1+0

1.0


In [78]:
xgbBO = BayesianOptimization(run_xgboost, {'min_child_weight': (1, 20),
                                                'colsample_bytree': (0.1, 1),
                                                'max_depth': (5, 15),
                                                'subsample': (0.5, 1),
                                                'gamma': (0, 10),
                                                'alpha': (0, 10),
                                                })

xgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[0]	train-auc:1+0	test-auc:1+0

    1 | 00m00s | [35m   1.00000[0m | [32m   5.4738[0m | [32m            0.1973[0m | [32m   5.7219[0m | [32m    10.3322[0m | [32m           12.3931[0m | [32m     0.9180[0m | 
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[0]	train-auc:1+0	test-auc:1+0

    2 | 00m00s |    1.00000 |    5.2268 |             0.6346 |    8.0393 |      6.3071 |             6.2292 |      0.7178 | 
Multipl



KeyboardInterrupt: 