# Титаник. Кто выживет?

https://www.kaggle.com/c/titanic/

In [447]:
import numpy as np
import pandas as pd

In [448]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)

### Данные

In [449]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

### Фичи

чтобы одинаковым образом обработать train и test и не дублировать все операции 2 раза, соединим эти два набора данных в один, не забыв при этом:
1. выкинуть целевую переменную из train
2. проверить на соответствие набора признаков друг другу
3. добавить флаг того, является ли объект тестовым или нет

In [450]:
y_train = train.Survived
train.drop('Survived', axis=1, inplace=True)

In [451]:
train.columns == test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True], dtype=bool)

In [452]:
train['is_test'] = 0
test['is_test'] = 1

In [453]:
df = pd.concat([train, test])
df.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_test
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,0
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,0
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,0


супер, теперь полный набор данных можно обрабатывать вместе и в любой момент, уже обработанными, обратно разъединить на обучающую и тестовую выборки

Пол male/female закодируем в 1/0 и удалим переменные, с которыми мы не будем сейчас работать

In [454]:
df["isMale"] = df.Sex.replace({"male": 1, "female":0})
#df["NameLen"] = df.Name.apply(lambda s: len(s.split(" ")))
df.drop(["Sex", "Cabin", "Ticket", "Name", "PassengerId"], axis=1, inplace=True)

признаки, значения которых составляют небольшой перечислимый набор, закодируем в отдельные столбцы 

In [455]:
df_dummies = pd.get_dummies(df, columns=['Pclass', 'Embarked']) #, 'NameLen'])

In [456]:
df_dummies.isnull().sum()

Age           263
SibSp           0
Parch           0
Fare            1
is_test         0
isMale          0
Pclass_1        0
Pclass_2        0
Pclass_3        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
dtype: int64

In [457]:
X_train = df_dummies[df_dummies.is_test==0].drop('is_test', axis=1)
X_test = df_dummies[df_dummies.is_test==1].drop('is_test', axis=1)

In [458]:
columns = X_train.columns
columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'isMale', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

### Заполнение пустых значений

заполним пустые значения средними по соответственным признакам

In [459]:
from sklearn.preprocessing import Imputer

In [460]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)

In [461]:
imputer.fit(X_train)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [462]:
X_train_imputed = imputer.transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=columns)

### Нормировка значений

In [463]:
from sklearn.preprocessing import StandardScaler

In [464]:
scaler = StandardScaler()

In [465]:
scaler.fit(X_train_imputed)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [501]:
X_train_imputed_scaled = scaler.transform(X_train_imputed)
X_train_imputed_scaled = pd.DataFrame(X_train_imputed_scaled, columns=columns)

In [504]:
X_test_imputed = imputer.transform(X_test)
X_test_imputed_scaled = scaler.transform(X_test_imputed)

### Разделение на обучающую и тестирующую выборки

In [505]:
from sklearn.model_selection import train_test_split

In [506]:
X_train_fin, X_val, y_train_fin, y_val = train_test_split(X_train_imputed_scaled, y_train, test_size=0.2)

### Обучение с кросс-валидацией

кросс-валидация поможет нам подобрать лучший параметр регуляризации

In [470]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [471]:
grid = {
    'criterion': ['entropy'],
    'min_samples_leaf': [x for x in range(1, 20)],
    'max_depth': [x for x in range(1, 15)],
    'max_leaf_nodes': [10*x for x in range(1, 10)],
    'max_features': [None],
    'presort': [False],
    'class_weight': [None]
}
gridsearch = GridSearchCV(DecisionTreeClassifier(), grid, scoring='accuracy', cv=7, n_jobs=-1)

In [472]:
%%time
gridsearch.fit(X_train_imputed_scaled, y_train)

Wall time: 1min 30s


GridSearchCV(cv=7, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'criterion': ['entropy'], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'max_leaf_nodes': [10, 20, 30, 40, 50, 60, 70, 80, 90], 'max_features': [None], 'presort': [False], 'class_weight': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [473]:
sorted(gridsearch.grid_scores_, key = lambda x: -x.mean_validation_score)



[mean: 0.83053, std: 0.03036, params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'max_leaf_nodes': 60, 'min_samples_leaf': 10, 'presort': False},
 mean: 0.83053, std: 0.03036, params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'max_leaf_nodes': 70, 'min_samples_leaf': 10, 'presort': False},
 mean: 0.83053, std: 0.03036, params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 6, 'max_features': None, 'max_leaf_nodes': 80, 'min_samples_leaf': 10, 'presort': False},
 mean: 0.83053, std: 0.03738, params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 7, 'max_features': None, 'max_leaf_nodes': 30, 'min_samples_leaf': 7, 'presort': False},
 mean: 0.83053, std: 0.03323, params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 7, 'max_features': None, 'max_leaf_nodes': 30, 'min_samples_leaf': 9, 'presort': False},
 mean: 0.83053, std: 0.03166, params: {'class_weight': None, '

In [474]:
b = gridsearch.best_params_
b

{'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 6,
 'max_features': None,
 'max_leaf_nodes': 60,
 'min_samples_leaf': 10,
 'presort': False}

In [475]:
b = {
    'class_weight': None,
    'criterion':
    'entropy',
    'max_depth': 3,
    'max_features': None,
    'max_leaf_nodes': 10,
    'min_samples_leaf': 3,
    'presort': False
} # real best parameters

# Оценка точности

In [476]:
from sklearn.metrics import accuracy_score

In [507]:
clf = DecisionTreeClassifier(\
    criterion=b['criterion'],\
    min_samples_leaf=b['min_samples_leaf'],\
    max_depth=b['max_depth'],\
    max_leaf_nodes=b['max_leaf_nodes'],\
    max_features=b['max_features'],\
    presort=b['presort'],\
    class_weight=b['class_weight'])

In [478]:
clf.fit(X_train_fin, y_train_fin)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=10, min_impurity_split=1e-07,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [479]:
y_val_pred = clf.predict(X_val)

In [480]:
accuracy_score(y_val, y_val_pred)

0.81564245810055869

# Финальное предсказание

In [508]:
clf.fit(X_train_imputed, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=10, min_impurity_split=1e-07,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [482]:
params = [
    #{
    #    'class_weight': None,
    #    'criterion': 'entropy',
    #    'max_depth': 3,
    #    'max_features': None,
    #    'max_leaf_nodes': 10,
    #    'min_samples_leaf': 2,
    #    'presort': False
    #},
    #{
    #    'class_weight': None,
    #    'criterion': 'entropy',
    #    'max_depth': 3,
    #    'max_features': None,
    #    'max_leaf_nodes': 10,
    #    'min_samples_leaf': 1,
    #    'presort': False
    #},
    #{
    #    'class_weight': None,
    #    'criterion': 'entropy',
    #    'max_depth': 6,
    #    'max_features': None,
    #    'max_leaf_nodes': 60,
    #    'min_samples_leaf': 10,
    #    'presort': False
    #},
]

In [483]:
pred = []
for param in params:
    clf = DecisionTreeClassifier(\
        criterion=param['criterion'],\
        min_samples_leaf=param['min_samples_leaf'],\
        max_depth=param['max_depth'],\
        max_leaf_nodes=param['max_leaf_nodes'],\
        max_features=param['max_features'],\
        presort=param['presort'],\
        class_weight=param['class_weight'])
    clf.fit(X_train_imputed_scaled, y_train)
    pred.append(clf.predict_proba(X_test_imputed_scaled))

#from functools import reduce
#pred = reduce(lambda x, y: np.add(x, y), pred) / float(len(params))

In [484]:
#predictions = np.where(pred[:, 0] > 0.5, 0, 1)

предсказание номера класса:

In [509]:
predictions = clf.predict(X_test_imputed)
#clf.predict_proba(X_test_imputed_scaled), predictions

In [486]:
submussion = 'PassengerId,Survived\n'
submussion += "\n".join(["{},{}".format(pid, prediction) for pid, prediction in zip(test.PassengerId, predictions)])

In [487]:
with open('submission.txt', 'w') as file:
    file.write(submussion)

In [510]:
for col, val in zip(X_train.columns, clf.feature_importances_):
    print("{:30} {:.2f}".format(col, val))
#X_test

Age                            0.13
SibSp                          0.03
Parch                          0.00
Fare                           0.12
isMale                         0.54
Pclass_1                       0.00
Pclass_2                       0.00
Pclass_3                       0.18
Embarked_C                     0.00
Embarked_Q                     0.00
Embarked_S                     0.00


In [511]:
from sklearn.tree import export_graphviz

def get_tree_dot_view(clf_, feature_names=None, class_names=None):
    print(export_graphviz(clf_, out_file=None, filled=True, feature_names=feature_names, class_names=class_names))

In [512]:
get_tree_dot_view(clf, list(X_train.columns), ["Not survived", "Survived"])

digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="isMale <= 0.5\nentropy = 0.9607\nsamples = 891\nvalue = [549, 342]\nclass = Not survived", fillcolor="#e5813960"] ;
1 [label="Pclass_3 <= 0.5\nentropy = 0.8237\nsamples = 314\nvalue = [81, 233]\nclass = Survived", fillcolor="#399de5a6"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
3 [label="Fare <= 28.8563\nentropy = 0.2988\nsamples = 170\nvalue = [9, 161]\nclass = Survived", fillcolor="#399de5f1"] ;
1 -> 3 ;
17 [label="entropy = 0.469\nsamples = 70\nvalue = [7, 63]\nclass = Survived", fillcolor="#399de5e3"] ;
3 -> 17 ;
18 [label="entropy = 0.1414\nsamples = 100\nvalue = [2, 98]\nclass = Survived", fillcolor="#399de5fa"] ;
3 -> 18 ;
4 [label="Fare <= 23.35\nentropy = 1.0\nsamples = 144\nvalue = [72, 72]\nclass = Not survived", fillcolor="#e5813900"] ;
1 -> 4 ;
9 [label="Age <= 36.5\nentropy = 0.9766\nsamples = 117\nvalue = [48, 69]\nclass = Survived", fillcolor="#399de54e"] ;
4 -> 9 ;
15 [labe

Регрессия позволяет посмотреть влияние различных факторов на принятое решение. Так, видно, что женский пол, маленький возраст и первый класс являлись сильными предпосылками к выживанию