# Stacking + Boosting

In [13]:
# Data source: Kaggle Titanic competition — https://www.kaggle.com/c/titanic

In [14]:
import sklearn
import pandas as pd

In [15]:
# Read the training and test splits from their CSV files
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/titanic/train.csv', './data/titanic/test.csv')
)

In [16]:
# Preview the first rows of the training data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Предобработка данных

In [17]:
# Fill missing values with per-column medians computed on the training set
# only; the same training medians are reused for the test set.
# numeric_only=True restricts the median to numeric columns: without it,
# pandas >= 2.0 raises a TypeError on text columns such as Name or Sex.
# Non-numeric columns are therefore left with their NaNs by this step.
train_median = train.median(numeric_only=True)
train_imp = train.fillna(train_median)
test_imp = test.fillna(train_median)

In [18]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of every feature so the indicator columns are not redundant
CATEGORY_COL = ['Sex', 'Pclass', 'Embarked']
train_dummies, test_dummies = [
    pd.get_dummies(frame, columns=CATEGORY_COL, drop_first=True)
    for frame in (train_imp, test_imp)
]

In [19]:
# Check the result of the one-hot encoding on the first rows
train_dummies.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,1,0,1


In [20]:
# Separate the target from the features and discard the columns that are
# not used as model inputs
TARGET_COL = 'Survived'
DROP_COL = ['PassengerId', 'Name', 'Ticket', 'Cabin']
y_train = train_dummies[TARGET_COL]
X_train = train_dummies.drop(labels=DROP_COL + [TARGET_COL], axis=1)
X_test = test_dummies.drop(labels=DROP_COL, axis=1)

## Тюнинг моделей. Зададим сетку параметров

In [22]:
import numpy as np
# RandomForestClassifier was used without ever being imported, which raises
# NameError on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

# Base estimator whose hyper-parameters are tuned in the following sections
clf = RandomForestClassifier()

# RandomForest hyper-parameters to tune, with the candidate values for each
params_grid = {
    'n_estimators': [1, 2, 3, 10, 35],
    'min_samples_split': [2, 5, 10]
}

## Тюнинг моделей. Способ 1
"В лоб"

In [61]:
from sklearn.metrics import roc_auc_score

# Always shuffle unless the data is sequential!
kf = KFold(n_splits=4, shuffle=True)

# Build every combination of grid values, extending the partial
# combinations by one parameter per iteration
params = [{}]
for parameter_name, parameter_values in params_grid.items():
    params = [
        dict(param, **{parameter_name: value})
        for value in parameter_values
        for param in params
    ]

# Keep the combination with the highest mean AUC across the folds
best_params, best_auc = {}, 0
for param in params:
    print(('Training RandomForest with params: ', param))
    clf.set_params(**param)

    fold_aucs = []
    for train_idx, test_idx in kf.split(X_train):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_test_fold, y_test_fold = X_train.iloc[test_idx], y_train.iloc[test_idx]
        # Fit on the training part of the fold, score column 1 of the
        # predicted probabilities on the held-out part
        fold_scores = clf.fit(X_train_fold, y_train_fold).predict_proba(X_test_fold)[:, 1]
        fold_aucs.append(roc_auc_score(y_test_fold, fold_scores))
    auc = np.mean(fold_aucs)
    print(('AUC: ', auc))
    if auc > best_auc:
        best_params, best_auc = param, auc

print('Best params:')
best_params

('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 2})
('AUC: ', 0.73538067805930762)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 2})
('AUC: ', 0.77211097688445918)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 2})
('AUC: ', 0.82049092492078735)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_samples_split': 2})
('AUC: ', 0.84550641287111183)
('Training RandomForest with params: ', {'n_estimators': 35, 'min_samples_split': 2})
('AUC: ', 0.86157427693560851)
('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 5})
('AUC: ', 0.78801827545908376)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 5})
('AUC: ', 0.82451887783307265)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 5})
('AUC: ', 0.82575013134091657)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_samp

{'min_samples_split': 10, 'n_estimators': 35}

## Тюнинг моделей. Способ 2
Используем GridSearchCV со своим KFold

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer, get_scorer

# Scorer reported by the grid search: scikit-learn's built-in ROC AUC scorer.
# The hand-written make_scorer(lambda ...: roc_auc_score(y_true, y_pred[:, 1]),
# needs_proba=True) only worked while the scorer received the full 2-D
# predict_proba output. Newer scikit-learn releases pass a 1-D vector for
# binary targets (so y_pred[:, 1] fails) and have deprecated `needs_proba`.
roc_scorer = get_scorer('roc_auc')
kf = KFold(n_splits=4, shuffle=True)
gs = GridSearchCV(clf, param_grid=params_grid, verbose=5, scoring=roc_scorer, cv=kf)
# Run the grid search
gs.fit(X_train, y_train)

Fitting 4 folds for each of 15 candidates, totalling 60 fits
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.763001, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.748371, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.746945, total=   0.0s
[CV] min_samples_split=2, n_estimators=1 .............................
[CV]  min_samples_split=2, n_estimators=1, score=0.781358, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]  min_samples_split=2, n_estimators=2, score=0.794032, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]  min_samples_split=2, n_estimators=2, score=0.749958, total=   0.0s
[CV] min_samples_split=2, n_estimators=2 .............................
[CV]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s



[CV] min_samples_split=2, n_estimators=3 .............................
[CV]  min_samples_split=2, n_estimators=3, score=0.828986, total=   0.0s
[CV] min_samples_split=2, n_estimators=3 .............................
[CV]  min_samples_split=2, n_estimators=3, score=0.805681, total=   0.0s
[CV] min_samples_split=2, n_estimators=3 .............................
[CV]  min_samples_split=2, n_estimators=3, score=0.829389, total=   0.0s
[CV] min_samples_split=2, n_estimators=3 .............................
[CV]  min_samples_split=2, n_estimators=3, score=0.812026, total=   0.0s
[CV] min_samples_split=2, n_estimators=10 ............................
[CV]  min_samples_split=2, n_estimators=10, score=0.835081, total=   0.0s
[CV] min_samples_split=2, n_estimators=10 ............................
[CV]  min_samples_split=2, n_estimators=10, score=0.807310, total=   0.0s
[CV] min_samples_split=2, n_estimators=10 ............................
[CV]  min_samples_split=2, n_estimators=10, score=0.855207, to

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    5.1s finished


GridSearchCV(cv=KFold(n_splits=4, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=35, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [1, 2, 3, 10, 35], 'min_samples_split': [2, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(<lambda>, needs_proba=True), verbose=5)

In [58]:
# Best cross-validated score found by the grid search and the parameter
# combination that produced it
gs.best_score_, gs.best_params_

(0.86051603495717599, {'min_samples_split': 10, 'n_estimators': 35})

In [59]:
# Estimator configured with the best parameters found by the grid search
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=35, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

## Тюнинг моделей. Способ 3
Используем GridSearchCV со встроенным KFold и встроенной метрикой качества

In [None]:
# An integer `cv` lets GridSearchCV build the folds itself; no `scoring` is
# passed, so the estimator's own default `score` method ranks the candidates
gs = GridSearchCV(estimator=clf, param_grid=params_grid, cv=4, verbose=5)
gs.fit(X_train, y_train)

## Тюнинг моделей. Способ 4
Используя OOB-score (работает только для НЕбустинговых ансамблей)

In [64]:
from sklearn.metrics import roc_auc_score

# Always shuffle unless the data is sequential!
kf = KFold(n_splits=4, shuffle=True)

best_params, best_score = {}, 0

# Build every combination of the model's grid values once more
params = [{}]
for parameter_name, parameter_values in params_grid.items():
    params = [
        dict(param, **{parameter_name: value})
        for value in parameter_values
        for param in params
    ]

for param in params:
    print(('Training RandomForest with params: ', param))
    clf.set_params(oob_score=True, **param)

    # Previously every candidate needed a manual KFold loop: fit on the
    # training part of each fold, compute the AUC on the held-out part and
    # average the fold AUCs. With the out-of-bag estimate a single fit on the
    # whole training set is enough.
    # NOTE(review): the values printed below look like OOB accuracy rather
    # than AUC, so they are not directly comparable with the AUC-based
    # approaches — confirm.
    clf.fit(X_train, y_train)
    oob_score = clf.oob_score_

    print(('OOB: ', oob_score))
    if oob_score > best_score:
        best_score, best_params = oob_score, param

print('Best params:')
best_params

('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 2})
('OOB: ', 0.6835016835016835)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 2})
('OOB: ', 0.71604938271604934)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 2})
('OOB: ', 0.75084175084175087)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_samples_split': 2})
('OOB: ', 0.79573512906846244)
('Training RandomForest with params: ', {'n_estimators': 35, 'min_samples_split': 2})
('OOB: ', 0.8204264870931538)
('Training RandomForest with params: ', {'n_estimators': 1, 'min_samples_split': 5})
('OOB: ', 0.67340067340067344)
('Training RandomForest with params: ', {'n_estimators': 2, 'min_samples_split': 5})
('OOB: ', 0.6835016835016835)
('Training RandomForest with params: ', {'n_estimators': 3, 'min_samples_split': 5})
('OOB: ', 0.7407407407407407)
('Training RandomForest with params: ', {'n_estimators': 10, 'min_samples_

{'min_samples_split': 10, 'n_estimators': 10}

## Практическое задание 1
Используя понравившийся метод, попробуйте подобрать самые важные на ваш взгляд параметры для RandomForestClassifier и GradientBoostingClassifier

## Предсказание моделей для стекинга

## Предсказание моделей для стекинга. Способ 1
"В лоб"

In [73]:
def cross_val_predict_proba(estimator, X_train, y_train):
    """Return out-of-fold probabilities (column 1 of ``predict_proba``).

    The data is split into 4 shuffled folds. For each fold the estimator is
    fitted on the remaining folds and predicts the held-out rows, so every
    training row gets a prediction from a model that never saw it.

    Parameters
    ----------
    estimator : object with ``fit`` and ``predict_proba``
        Refitted in place on every fold.
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally via ``.iloc``.
    y_train : pandas.Series
        Target aligned row-by-row with ``X_train``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(y_train)``.

    Notes
    -----
    ``sklearn.model_selection.cross_val_predict`` with
    ``method='predict_proba'`` provides the same functionality out of the box.
    """
    kfold = KFold(n_splits=4, shuffle=True, random_state=None)
    # Explicit float buffer: np.zeros_like(y_train) would inherit the dtype of
    # the labels, and an integer buffer truncates probabilities on assignment.
    estimator_scores = np.zeros(len(y_train), dtype=float)
    for train_idx, test_idx in kfold.split(X_train):
        X_train_fold, X_pred_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_train_fold = y_train.iloc[train_idx]
        estimator.fit(X_train_fold, y_train_fold)
        # Predict the held-out rows of THIS fold (previously a stale global
        # named X_test_fold was used here by mistake).
        estimator_scores[test_idx] = estimator.predict_proba(X_pred_fold)[:, 1]
    return estimator_scores

# Both ensemble classes were used without being imported, which raises
# NameError on a fresh kernel.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# Base models of the stack (default hyper-parameters here; plug in the tuned ones)
rf_estimator = RandomForestClassifier()
gb_estimator = GradientBoostingClassifier()

# Cross-validated probability predictions of each ensemble for the training set
rf_train_pred = cross_val_predict_proba(rf_estimator, X_train, y_train)
gb_train_pred = cross_val_predict_proba(gb_estimator, X_train, y_train)

# One column per base model
X_train_stack = np.stack([rf_train_pred, gb_train_pred], axis=1)

# Refit each ensemble on the full training set and predict the test set
rf_test_pred = rf_estimator.fit(X_train, y_train).predict_proba(X_test)
gb_test_pred = gb_estimator.fit(X_train, y_train).predict_proba(X_test)

# Keep column 1 of predict_proba, in the same model order as X_train_stack
X_test_stack = np.stack([rf_test_pred[:,1], gb_test_pred[:,1]], axis=1)

  


In [74]:
# Inspect the stacked test features (column 0: RandomForest, column 1: GradientBoosting)
X_test_stack

array([[ 0.        ,  0.04635935],
       [ 0.3       ,  0.12779691],
       [ 0.2       ,  0.13986669],
       [ 0.4       ,  0.14068034],
       [ 0.4       ,  0.4036792 ],
       [ 0.1       ,  0.11170511],
       [ 0.4       ,  0.24903108],
       [ 0.        ,  0.28030803],
       [ 0.9       ,  0.90190717],
       [ 0.1       ,  0.08107034],
       [ 0.        ,  0.09280331],
       [ 0.        ,  0.07143607],
       [ 1.        ,  0.933683  ],
       [ 0.1       ,  0.12858014],
       [ 1.        ,  0.86025295],
       [ 1.        ,  0.9270053 ],
       [ 0.        ,  0.07317554],
       [ 0.7       ,  0.16838866],
       [ 0.6       ,  0.53839304],
       [ 0.5       ,  0.37241087],
       [ 0.6       ,  0.28427084],
       [ 0.6       ,  0.49216374],
       [ 1.        ,  0.94539501],
       [ 0.7       ,  0.40325586],
       [ 0.9       ,  0.93250822],
       [ 0.1       ,  0.04924125],
       [ 1.        ,  0.95974208],
       [ 0.7       ,  0.16838866],
       [ 0.4       ,

## Предсказание моделей для стекинга. Способ 2
Красивый, с использованием функции cross_val_predict()

In [79]:
from sklearn.model_selection import cross_val_predict

def cross_val_predict_proba(estimator, X_train, y_train):
    """Return cross-validated ``predict_proba`` output for the training data.

    Delegates to ``cross_val_predict`` with a 4-fold shuffled ``KFold`` and
    ``method='predict_proba'``, and returns its result unchanged.
    """
    shuffled_folds = KFold(n_splits=4, shuffle=True, random_state=None)
    return cross_val_predict(
        estimator,
        X_train,
        y_train,
        cv=shuffled_folds,
        method='predict_proba',
    )

# Both ensemble classes were used without being imported, which raises
# NameError on a fresh kernel.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# TODO: tune the hyper-parameters of the ensembles

# Base models of the stack (default hyper-parameters until the TODO is done)
rf_estimator = RandomForestClassifier()
gb_estimator = GradientBoostingClassifier()

# Cross-validated probability predictions of each ensemble for the training set
rf_train_pred = cross_val_predict_proba(rf_estimator, X_train, y_train)
gb_train_pred = cross_val_predict_proba(gb_estimator, X_train, y_train)

# Keep column 1 of each probability matrix, one column per base model
X_train_stack = np.stack([rf_train_pred[:, 1], gb_train_pred[:, 1]], axis=1)

# Refit each ensemble on the full training set and predict the test set
rf_test_pred = rf_estimator.fit(X_train, y_train).predict_proba(X_test)
gb_test_pred = gb_estimator.fit(X_train, y_train).predict_proba(X_test)

X_test_stack = np.stack([rf_test_pred[:,1], gb_test_pred[:,1]], axis=1)

In [81]:
# Inspect the stacked test features (column 0: RandomForest, column 1: GradientBoosting)
X_test_stack

array([[ 0.1       ,  0.04635935],
       [ 0.1       ,  0.12779691],
       [ 0.5       ,  0.13986669],
       [ 0.6       ,  0.14068034],
       [ 0.3       ,  0.4036792 ],
       [ 0.3       ,  0.11170511],
       [ 0.3       ,  0.24903108],
       [ 0.2       ,  0.28030803],
       [ 1.        ,  0.90190717],
       [ 0.        ,  0.08107034],
       [ 0.        ,  0.09280331],
       [ 0.1       ,  0.07143607],
       [ 1.        ,  0.933683  ],
       [ 0.2       ,  0.12858014],
       [ 1.        ,  0.86025295],
       [ 0.9       ,  0.9270053 ],
       [ 0.1       ,  0.07317554],
       [ 0.7       ,  0.16838866],
       [ 0.5       ,  0.53839304],
       [ 0.4       ,  0.37241087],
       [ 0.2       ,  0.28427084],
       [ 0.5       ,  0.49216374],
       [ 1.        ,  0.94539501],
       [ 0.2       ,  0.40325586],
       [ 1.        ,  0.93250822],
       [ 0.1       ,  0.04924125],
       [ 1.        ,  0.95974208],
       [ 0.6       ,  0.16838866],
       [ 0.6       ,

## Объединяем предсказания ансамблей с помощью логистической регрессии

In [70]:
from sklearn.linear_model import LogisticRegression

# TODO: tune the LogisticRegression hyper-parameters

# Meta-model: fitted on the stacked training predictions of the ensembles,
# then used to predict class labels for the stacked test predictions
logreg = LogisticRegression()
logreg.fit(X_train_stack, y_train)
predicted = logreg.predict(X_test_stack)

## Формируем файл для отправки

In [71]:
# Write the submission file: a header line followed by one
# "PassengerId,Survived" row per test passenger
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    out.writelines(
        '%s,%s\n' % (passenger, y)
        for passenger, y in zip(test['PassengerId'], predicted)
    )