# Titanic: Machine Learning from Disaster

In [253]:
# https://www.kaggle.com/c/titanic

In [254]:
import sklearn
import pandas as pd
import numpy as np

In [255]:
# Загружаем данные из файлов
train = pd.read_csv('data/titanic/train.csv')
test = pd.read_csv('data/titanic/test.csv')

In [256]:
# сохраняем столбец идентификаторов пассажиров для тестовых данных
test_passenger_id = test['PassengerId']

In [257]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Предобработка данных, добавление признаков

In [258]:
# https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python

import re

RARE_TITLES = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

# создаем словарь для исправления префиксов имени
TITLES = dict((title, 'Rare') for title in RARE_TITLES)

TITLES['Mlle'] = 'Miss'
TITLES['Ms'] = 'Miss'
TITLES['Mme'] = 'Mrs'

def get_title(name):
    """ Возвращает префикс имени """
    match = re.search(' ([A-Za-z]+)\.', name)
    if match:
        # исправляем опечатки в префиксе имени
        return TITLES.get(match.group(1), match.group(1))
    return ''

# Применяем процесс для обучающего и тестового наборов
for dataset in [test, train]:
    # длина имени
    dataset['Name_Length'] = train['Name'].apply(len)
    # была ли каюта у пассажира
    dataset['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
    # сколько членов семьи было на корабле
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # флаг = 1, если путешествует в одиночестве
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    # заполняем пропуски для Embarked
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    # заполняем индексами
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    # заполняем пропуски Fare медианой
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
    # заполняем пропуски возраста медианой
    dataset['Age'] = dataset['Age'].fillna(train['Age'].median())
    # добавляем префикс имени как отдельный категориальный признак
    dataset['Title'] = dataset['Name'].apply(get_title)
    dataset['Title'] = dataset['Title'].map( {'Mr': 1, 
                                              'Miss': 2, 
                                              'Mrs': 3,
                                              'Master': 4,
                                              'Rare': 5} )
    # заполняем неизвестные префиксы нулями
    dataset['Title'] = dataset['Title'].fillna(0)
    # бинаризуем пол
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    # категоризируем стоимость билета
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    # категоризируем возраст
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[dataset['Age'] > 64, 'Age']

In [259]:
# Удаляем неиспользуемые столбцы
DROP_COL = ['PassengerId', 'Name', 'Ticket', 'Cabin']

X_train = np.array(train.drop(DROP_COL + ['Survived'], axis=1))
y_train = np.array(train['Survived'])
X_test = np.array(test.drop(DROP_COL, axis=1))

## Предсказание моделей для стеккинга

In [260]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

random_state=123123
estimators = []

In [261]:
def rename_params(estimator, params, meta=False):
    correct = dict()
    pref = "{}__".format(estimator.__class__.__name__.lower())
    if meta:
        pref = "meta-{}".format(pref)
    for k, v in params.items():
        correct[pref+k] = v
    return correct

def get_best(est, params, X2train=X_train):
    grid = GridSearchCV(estimator=est, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X2train, y_train)
    print('Best score for {}: {}'.format(est.__class__.__name__, grid.best_score_))
    return grid.best_estimator_

## first stacking

In [262]:
rfc = RandomForestClassifier()
rfcp = {
    'n_estimators': [15, 20, 25, 30],
    'criterion': ['entropy'],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 3, 5, 8],
    'min_samples_leaf': [2, 3, 5, 7],
    'random_state': [random_state],
}

estimators.append(get_best(rfc, rfcp))

Best score for RandomForestClassifier: 0.8294051627384961


In [263]:
etc = ExtraTreesClassifier()
etcp = rfcp.copy()

estimators.append(get_best(etc, etcp))

Best score for ExtraTreesClassifier: 0.8237934904601572


In [264]:
abc = AdaBoostClassifier()
abcp = {
    'n_estimators': [1000, 1050, 1100, 1150, 1200],
    'learning_rate': [0.01],
    'random_state': [random_state],
}

estimators.append(get_best(abc, abcp))

Best score for AdaBoostClassifier: 0.8237934904601572


In [265]:
xgbc = XGBClassifier()
xgbcp = {
    'n_estimators' : range(550, 650, 25),
    'learning_rate' : [0.01],
    'max_depth' : [2, 3, 5],
    'gamma' : [0.45, 0.5],
    'objective': ["reg:linear", "reg:logistic"],
    'silent' : [1],
    'colsample_bytree': [1.0],
    'reg_alpha': [0.18, 0.2, 0.22],
    'reg_lambda': [1.0],
    'random_state': [random_state],
}

estimators.append(get_best(xgbc, xgbcp))

Best score for XGBClassifier: 0.8316498316498316


In [266]:
estimators

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=8, max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=2, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
             oob_score=False, random_state=123123, verbose=0,
             warm_start=False),
 ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=123123, verbose=0,
            warm_start=False),
 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
           learning_rate=0.01, n_estimators=1000, random_state=123123),
 XGBClassif

In [269]:
X_train_stack = np.array([est.predict_proba(X_train)[:,1] for est in estimators]).T
X_test_stack = np.array([est.predict_proba(X_test)[:,1] for est in estimators]).T
estimators_stack = []

## second stacking

In [270]:
lr = LogisticRegression()
lrp = {
    'C': [0.01, 0.03, 0.08, 0.1, 0.3, 0.6, 1.0, 3., 5., 9., 10.0, 11., 12.],
    'penalty': ['l1', 'l2'],
    'random_state': [random_state],
}

estimators_stack.append(get_best(lr, lrp, X_train_stack))

Best score for LogisticRegression: 0.9090909090909091


In [271]:
xgbc = XGBClassifier()
xgbcp = {
    'n_estimators' : range(800, 900, 25),
    'learning_rate' : [0.01],
    
    'max_depth' : [2, 3, 5],
    #'gamma' : [0.45, 0.5, 0.55],
    'objective': ["reg:linear", "reg:logistic"],
    'silent' : [1],
    'colsample_bytree': [1.0],
    'reg_alpha': [0.7, 0.9, 1.1, 1.3, 3., 5.],
    'reg_lambda': [1.0],
    'random_state': [random_state],
}

estimators_stack.append(get_best(xgbc, xgbcp, X_train_stack))

Best score for XGBClassifier: 0.9057239057239057


In [272]:
estimators_stack

[LogisticRegression(C=9.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=123123, solver='liblinear',
           tol=0.0001, verbose=0, warm_start=False),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1.0, gamma=0, learning_rate=0.01, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=875,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=123123,
        reg_alpha=0.7, reg_lambda=1.0, scale_pos_weight=1, seed=None,
        silent=1, subsample=1)]

In [273]:
X_train_stack_second = np.array([est.predict_proba(X_train_stack)[:,1] for est in estimators_stack]).T
X_test_stack_second = np.array([est.predict_proba(X_test_stack)[:,1] for est in estimators_stack]).T

## final classifier

In [274]:
lr = LogisticRegression()
lrp = {
    'C': [0.01, 0.03, 0.08, 0.1, 0.3, 0.6, 1.0, 3., 5., 9., 10.0, 11., 12.],
    'penalty': ['l1', 'l2'],
    'random_state': [random_state],
}

classifier = get_best(lr, lrp, X_train_stack_second)

Best score for LogisticRegression: 0.936026936026936


## predictions to file

In [275]:
predicted = classifier.predict(X_test_stack_second)

In [276]:
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test_passenger_id, predicted):
        out.write('%s,%s\n' % (passenger, y))