# Titanic: Machine Learning from Disaster

In [1]:
# https://www.kaggle.com/c/titanic

In [2]:
import sklearn
import pandas as pd

In [3]:
train = pd.read_csv('./titanic/train.csv')


In [4]:
train_dummies = pd.get_dummies(train, columns=['Sex', 'Pclass', 'Embarked'])

## Preprocess data

In [5]:
# Extract features
X_train = train_dummies.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
columns = X_train.columns.values

In [6]:
# Extract targets
y_train = train_dummies['Survived']

In [7]:
from sklearn.preprocessing import Imputer
imp = Imputer()
X_train = imp.fit(X_train, y_train).transform(X_train)

## Train model and make predictions

In [8]:
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

#### Создаем kfold для оценок

In [9]:
kfold = KFold(n_splits=5,shuffle=True,random_state=12345)

#### Создаем и обучаем дерево

In [10]:
pipeline = Pipeline([('scale', imp),
            ('model', DecisionTreeClassifier())])

param_grid = {"model__criterion": ["gini", "entropy"],
              "model__min_samples_split": [2, 10, 20],
              "model__max_depth": [None, 2, 5, 10, 15, 20, 30, 40],
              "model__min_samples_leaf": [1, 5, 10],
              "model__max_leaf_nodes": [None, 5, 10, 20],
              }

grid = GridSearchCV(pipeline, param_grid, scoring=make_scorer(accuracy_score), cv=kfold)
grid = grid.fit(X_train,y_train)
grid.best_params_

{'model__criterion': 'gini',
 'model__max_depth': 5,
 'model__max_leaf_nodes': 20,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2}

In [11]:
model = DecisionTreeClassifier(criterion='gini', max_depth=5, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=2)\
    .fit(X_train,y_train)

In [12]:
from sklearn.tree import export_graphviz
print(export_graphviz(model, out_file=None, filled=True, feature_names=list(columns)))

digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="Sex_male <= 0.5\ngini = 0.473\nsamples = 891\nvalue = [549, 342]", fillcolor="#e5813960"] ;
1 [label="Pclass_3 <= 0.5\ngini = 0.3828\nsamples = 314\nvalue = [81, 233]", fillcolor="#399de5a6"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
3 [label="Fare <= 28.8563\ngini = 0.1003\nsamples = 170\nvalue = [9, 161]", fillcolor="#399de5f1"] ;
1 -> 3 ;
37 [label="gini = 0.18\nsamples = 70\nvalue = [7, 63]", fillcolor="#399de5e3"] ;
3 -> 37 ;
38 [label="gini = 0.0392\nsamples = 100\nvalue = [2, 98]", fillcolor="#399de5fa"] ;
3 -> 38 ;
4 [label="Fare <= 23.35\ngini = 0.5\nsamples = 144\nvalue = [72, 72]", fillcolor="#e5813900"] ;
1 -> 4 ;
9 [label="Age <= 36.5\ngini = 0.4839\nsamples = 117\nvalue = [48, 69]", fillcolor="#399de54e"] ;
4 -> 9 ;
13 [label="Embarked_S <= 0.5\ngini = 0.4721\nsamples = 110\nvalue = [42, 68]", fillcolor="#399de561"] ;
9 -> 13 ;
21 [label="Fare <= 15.6208\ngini = 0.4058\nsample

## Estimate quality

In [13]:
scores = cross_val_score(model, X_train, y_train, groups=None, scoring=make_scorer(accuracy_score),cv=kfold)
scores

array([ 0.7877095 ,  0.78089888,  0.88202247,  0.84831461,  0.83707865])

## Create submission 

In [14]:
test = pd.read_csv('./titanic/test.csv')
test_dummies = pd.get_dummies(test, columns=['Sex', 'Pclass', 'Embarked'])
X_test = test_dummies.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
X_test = imp.transform(X_test)
#X_test = scaler.transform(X_test)
predicted = model.predict(X_test)
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(test['PassengerId'], predicted):
        out.write('%s,%s\n' % (passenger, y))

In [15]:
# Kaggle score is 0.779900

In [16]:
scores.mean()

0.82720482078965551

#### Score на kaggle получился не только меньше среднего, но и меньше минимального значения в K-fold