In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [9]:
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')


In [10]:
train_x = train.drop(['Survived'], axis=1)
train_y = train['Survived']

test_x = test.copy()

In [11]:
train_x = train_x.drop(['PassengerId'], axis=1)
test_x = test_x.drop(['PassengerId'], axis=1)


In [12]:
train_x  = train_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test_x  = test_x.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [16]:
for c in ['Sex', 'Embarked']:
    le = LabelEncoder()
    le.fit(train_x[c].fillna('NA'))
    
    train_x[c] = le.transform(train_x[c].fillna('NA'))
    test_x[c] = le.transform(test_x[c].fillna('NA'))

In [36]:
#
# model
#

In [37]:
from xgboost import XGBClassifier

In [38]:
model = XGBClassifier(n_estimators=20, random_state=71)
model.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=20, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=71, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [39]:
pred = model.predict_proba(test_x)[:, 1]


In [40]:
pred_label = np.where(pred > 0.5, 1, 0)

In [41]:
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred_label})
submission.to_csv('submittion_first.csv', index=False)

In [43]:
#
# cross validation
#

In [44]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

In [46]:
scores_accuracy = []
scores_logloss = []

In [48]:
kf = KFold(n_splits=4, shuffle=True, random_state=71)

In [51]:
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
    model = XGBClassifier(n_estimators=20, random_state=71)
    model.fit(tr_x, tr_y)
    
    va_pred = model.predict_proba(va_x)[:, 1]
    
    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred > 0.5)
    
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)

In [55]:
logloss = np.mean(scores_logloss)
accuracy = np.mean(scores_accuracy)

In [59]:
print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')


logloss: 0.4270, accuracy: 0.8148


In [62]:
#
# model tuning
#

In [63]:
import itertools

In [68]:
# tuning target parameters
param_space = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1.0, 2.0, 4.0]
}

In [69]:
param_combinations = itertools.product(param_space['max_depth'], param_space['min_child_weight'])

In [70]:
params = []
scores = []

In [71]:
for max_depth, min_child_weight in param_combinations:
    score_folds = []
    
    # cross validation
    kf = KFold(n_splits=4, shuffle=True, random_state=123456)
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        
        model = XGBClassifier(n_estimators=20, random_state=71, max_depth=max_depth, min_child_weight=min_child_weight)
        model.fit(tr_x, tr_y)
        
        va_pred = model.predict_proba(va_x)[:, 1]
        logloss = log_loss(va_y, va_pred)
        score_folds.append(logloss)
    
    params.append((max_depth, min_child_weight))
    scores.append(np.mean(score_folds))
        

In [73]:
best_idx = np.argsort(scores)[0]
best_param = params[best_idx]

In [74]:
print(f'max_depth: {best_param[0]}, min_child_weight: {best_param[1]}')

max_depth: 7, min_child_weight: 2.0


In [75]:
#
#  ensemble
#

In [76]:
from sklearn.linear_model import LogisticRegression

In [82]:
model_xgb = XGBClassifier(n_estimators=20, random_state=71)
model_xgb.fit(train_x, train_y)

pred_xgb = model_xgb.predict_proba(test_x)[:, 1]

In [84]:
train_x2 = train.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_x2 = test.copy().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)



In [85]:
model_lr = LogisticRegression(solver='lbfgs', max_iter=300)
model_lr.fit(train_x2, train_y)

pred_lr = model_lr.predict_proba(test_x2)[:, 1]

ValueError: could not convert string to float: 'Q'

In [None]:
r