In [1]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv('../data/titanic_train.csv',
                  index_col='PassengerId')

data['Age'].fillna(data['Age'].median(), inplace=True)
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
X = data[["Fare", "Age"]].head(200).to_numpy()
#X = data[["Fare", "Age"]].to_numpy()

In [25]:
Y = data["Survived"].head(200).to_numpy()
#Y = data["Survived"].to_numpy()

In [26]:
import numpy as np
import pandas as pd


class DecisionTreeBinaryClassifier():
    '''
    Реализация алгоритма бинарного дерева для классификации
    на основе максимизации неопределенности Джини
    '''
    def __init__(self, max_depth = None, n_classes = 2, verboose = True):
        self.max_depth = max_depth # максимальная глубина дерева
        self.n_classes = n_classes # количество классов (по умолчанию бинарная классификация)
        self.classes = range(0, self.n_classes) # список уникальных классов
        self.verboose = verboose
        self.node = None

    class Node():
        ''' Класс представляющий узел бинарного дерева '''
        def __init__(self, pred_class,
                     gini,
                     feature_id,
                     feature_value,
                     depth,
                     right_node = None,
                     left_node = None):
            self.pred_class = pred_class
            self.gini = gini
            self.feature_id = feature_id
            self.feature_value = feature_value
            self.depth = depth
            self.right_node = right_node
            self.left_node = left_node

        def __str__(self):
            return """
                Decision Tree Node
                ------------------
                pred_class: {pred_class}
                gini: {gini}
                feature_id: {feature_id}
                feature_value: {feature_value}
                depth: {depth}
                
                right_node: {right_node}
                ========================
                left_node: {left_node}
                
                """.format(pred_class=str(self.pred_class),
                           gini=str(self.gini),
                           feature_id=str(self.feature_id),
                           feature_value=str(self.feature_value),
                           depth=str(self.depth),
                           right_node=str(self.right_node),
                           left_node=str(self.left_node))

    def log(self, s):
        if self.verboose:
            print(s)


    def _gini(self, node_y):
        '''Расчитывает неопределенность Джини по одному узлу
        :param node_y: метки классов в узле
        '''
        return 1.0 - sum([(float(np.sum(node_y == cl))/node_y.size)**2 for cl in self.classes])


    def _split(self, data, id):
        ''' Разбивает датасет по определенному параметру
        :param data: numpy array
        :param id: индекс по которому идет разбиение
        :return: left_data, right_data
        '''
        if id == 0:
            return data[:id+1], data[id+1:]
        else:
            return data[:id], data[id:]


    def _search_split_id(self, Y):
        '''Возвращает список индексов в которых целевая переменная изменяется'''
        split_ids = []
        for i in range(0, len(Y) - 1):
            if Y[i] - Y[i + 1] != 0:
                split_ids.append(i)
        return split_ids


    def _best_split(self, X, Y, depth):
        ''' Ищем наилучшее разбиение '''
        n_features = X.shape[1]
        max_gini = float()
        best_feature_id = None
        best_feature_value = None
        for feature in range(0,  n_features):
            learning_data, target_data = zip(*sorted(zip(X[:, feature], Y)))
            split_ids = self._search_split_id(Y)
            for id in split_ids:
                left_Y, right_Y = self._split(Y, id)
                gini = (
                    (float(len(left_Y))/ len(target_data)) * self._gini(left_Y)
                    +
                    (float(len(right_Y)) / len(target_data)) * self._gini(right_Y)
                )
                fv = (learning_data[id]+learning_data[id+1])/2
                if gini > max_gini:
                    max_gini = gini
                    best_feature_id = feature
                    best_feature_value = fv

                self.log("depth: {depth}, feature_id: {feature}, feature_value: {feature_value}, gini: {gini}, max_gini: {max_gini}" \
                         .format(depth=str(depth),
                                 feature=str(feature),
                                 feature_value=str(fv),
                                 gini=str(gini),
                                 max_gini=str(max_gini)
                         ))

        return self.Node(pred_class = round(np.mean(Y),0),
                         gini = max_gini,
                         feature_id = best_feature_id,
                         feature_value = best_feature_value,
                         depth = depth,
                         right_node = None,
                         left_node = None)


    def _fit(self, X, Y, depth):        
        if depth <= self.max_depth:
            node = self._best_split(X, Y, depth)
            id_left = np.ndarray.flatten(X[:, node.feature_id] < node.feature_value)
            if id_left is not None:
                if id_left.size == Y.size:
                    X_left, Y_left = X[id_left], Y[id_left]
                    X_right, Y_right = X[~id_left], Y[~id_left]
                    node.left_node = self._fit(X_left, Y_left, depth+1)
                    node.right_node = self._fit(X_right, Y_right, depth+1)
                    return node

    def fit(self, X, Y):
        self.node = self._fit(X, Y, 1)


    def _predict(self, node, row):
        '''Рекурсивно делает предсказание для одной строки данных'''
        if row[node.feature_id] < node.feature_value:
            if node.left_node is not None:
                return self._predict(node.left_node, row)
            else:
                return node.pred_class
        else:
            if node.right_node is not None:
                return self._predict(node.right_node, row)
            else:
                return node.pred_class


    def predict(self, X):
        return [self._predict(self.node, row) for row in X]

In [27]:
tree = DecisionTreeBinaryClassifier(max_depth=3, verboose = False)
tree.fit(X, Y)
print(tree.node)


                Decision Tree Node
                ------------------
                pred_class: 0.0
                gini: 0.451945854484
                feature_id: 0
                feature_value: 255.2604
                depth: 1
                
                right_node: None
                left_node: 
                Decision Tree Node
                ------------------
                pred_class: 0.0
                gini: 0.450971250971
                feature_id: 0
                feature_value: 146.5208
                depth: 2
                
                right_node: 
                Decision Tree Node
                ------------------
                pred_class: 1.0
                gini: 0.333333333333
                feature_id: 0
                feature_value: 146.5208
                depth: 3
                
                right_node: None
                left_node: None
                
                
                left_node: 
                Decision Tree

In [28]:
accuracy_score(Y, tree.predict(X))

0.66

In [29]:
tree_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=17)
tree_gini.fit(X, Y)
accuracy_score(Y, tree_gini.predict(X))

0.7

In [30]:
tree_ent = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=17)
tree_ent.fit(X, Y)
accuracy_score(Y, tree_ent.predict(X))

0.685

In [19]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [288]:
train_df = pd.read_csv("../data/titanic_train.csv") 
test_df = pd.read_csv("../data/titanic_test.csv") 

In [290]:
y_train = train_df['Survived']
#y_test = test_df['Survived']

In [291]:
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
test_df['Age'].fillna(train_df['Age'].median(), inplace=True)
train_df['Embarked'].fillna('S', inplace=True)
test_df['Fare'].fillna(train_df['Fare'].median(), inplace=True)

In [292]:
train_df = pd.concat([train_df, pd.get_dummies(train_df['Pclass'], 
                                               prefix="PClass"),
                      pd.get_dummies(train_df['Sex'], prefix="Sex"),
                      pd.get_dummies(train_df['SibSp'], prefix="SibSp"),
                      pd.get_dummies(train_df['Parch'], prefix="Parch"),
                     pd.get_dummies(train_df['Embarked'], prefix="Embarked")],
                     axis=1)
test_df = pd.concat([test_df, pd.get_dummies(test_df['Pclass'], 
                                             prefix="PClass"),
                      pd.get_dummies(test_df['Sex'], prefix="Sex"),
                      pd.get_dummies(test_df['SibSp'], prefix="SibSp"),
                      pd.get_dummies(test_df['Parch'], prefix="Parch"),
                    pd.get_dummies(test_df['Embarked'], prefix="Embarked")],
                     axis=1)

In [293]:
train_df.drop(['Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 
               'Parch', 'Ticket', 'Cabin', 'Embarked', 'PassengerId'], 
              axis=1, inplace=True)
test_df.drop(['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'PassengerId'], 
             axis=1, inplace=True)

In [174]:
train_df.head()

Unnamed: 0,Age,Fare,PClass_1,PClass_2,PClass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,0,0,1,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
1,38.0,71.2833,1,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,26.0,7.925,0,0,1,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,35.0,53.1,1,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
4,35.0,8.05,0,0,1,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1


In [294]:
train_df.shape, test_df.shape

((891, 24), (418, 25))

In [295]:
set(test_df.columns) - set(train_df.columns)

{'Parch_9'}

In [296]:
test_df.drop(['Parch_9'], axis=1, inplace=True)

Обучите на имеющейся выборке дерево решений (DecisionTreeClassifier) максимальной глубины 2. Используйте параметр random_state=17 для воспроизводимости результатов.

In [297]:
from sklearn.tree import DecisionTreeClassifier

tree_ent = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=17)
tree_gini = DecisionTreeClassifier(criterion='gini', max_depth=2, random_state=17)
my_tree = DecisionTreeBinaryClassifier(max_depth=2, verboose = False)


In [298]:
tree_ent.fit(train_df, y_train)
tree_gini.fit(train_df, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

Сделайте с помощью полученной модели прогноз для тестовой выборки

In [299]:
pred_ent = tree_ent.predict(train_df)
pred_gigni = tree_gini.predict(train_df)

In [300]:
print("entropy: " + str(accuracy_score(pred_ent, y_train)))
print("gini: " + str(accuracy_score(pred_gigni, y_train)))

entropy: 0.7867564534231201
gini: 0.7957351290684624


Отобразите дерево с помощью export_graphviz и dot.

In [301]:
# используем .dot формат для визуализации дерева
from ipywidgets import Image
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz

In [302]:
dot_data = StringIO()
export_graphviz(tree_gini, feature_names=train_df.columns, 
                out_file=dot_data, filled=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(value=graph.create_png())

TypeError: unicode argument expected, got 'str'

Обучите на имеющейся выборке дерево решений (DecisionTreeClassifier). Также укажите random_state=17. Максимальную глубину и минимальное число элементов в листе настройте на 5-кратной кросс-валидации с помощью GridSearchCV.

In [303]:
tree_params = {'max_depth': list(range(1, 5)), 
               'min_samples_leaf': list(range(1, 5))}

In [304]:
from sklearn.model_selection import GridSearchCV, cross_val_score

In [305]:
tree_grid = GridSearchCV(tree_gini, tree_params, cv=5, n_jobs=1, verbose=True)

In [306]:
tree_grid.fit(train_df, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.6s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [307]:
tree_grid.best_score_

0.8103254769921436

In [308]:
tree_grid.best_params_

{'max_depth': 3, 'min_samples_leaf': 3}

In [309]:
tree_grid.predict(test_df)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,