# Семинар 3 - Решающее деревья

In [7]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(palette='deep', style='darkgrid', rc={"figure.figsize": (15, 4)})
import scipy.stats as st

import warnings
warnings.simplefilter('ignore')

In [78]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

### Будем использовать известный нам датасет с задержками рейсов

In [3]:
# Загрузим данные и проведем все предобработки как на семинаре: 
data = pd.read_csv('../data/flight_delays_train.csv')
data['dep_delayed_15min'] = data['dep_delayed_15min'].apply(lambda x: 1 if x == 'Y' else 0)
data['Month'] = data['Month'].str.replace('c-', '').astype('int16')
data['DayofMonth'] = data['DayofMonth'].str.replace('c-', '').astype('int16')
data['DayOfWeek'] = data['DayOfWeek'].str.replace('c-', '').astype('int16')
data['UniqueCarrier'] = pd.factorize(data['UniqueCarrier'])[0]
data['Origin'] = pd.factorize(data['Origin'])[0]
data['Dest'] = pd.factorize(data['Dest'])[0]

x = data.drop('dep_delayed_15min', axis=1)
y = data['dep_delayed_15min'].values

data.shape

(100000, 9)

In [4]:
#Разделим выборку на обущающую и тестовую
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2, \
                                                    shuffle=True, random_state=18)

In [33]:
#Обучим дерево
tree = DecisionTreeClassifier(min_samples_split=100, max_depth=8)
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [34]:
features = list(x.columns) 
export_graphviz(tree, feature_names=features,
                      class_names=['Y','N'], out_file='tree.dot', filled=True)

In [35]:
# для этого понадобится библиотека pydot (pip install pydot)
!dot -Tpng 'tree.dot' -o 'tree.png'

<img src='tree.png'>

In [50]:
print('Accuracy', accuracy_score(y_test, tree.predict(x_test)))
print('F1', f1_score(y_test, tree.predict(x_test)))

Accuracy 0.81125
F1 0.14301929625425652


# Попробуем подобрать гиперпараметры

In [51]:
tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=100, max_depth=4, min_samples_leaf=100)
tree.fit(x_train, y_train)
print('Accuracy', accuracy_score(y_test, tree.predict(x_test)))
print('F1', f1_score(y_test, tree.predict(x_test)))

Accuracy 0.81125
F1 0.14301929625425652


In [69]:
params = {'criterion': ['entropy', 'gini'],
        'max_depth': [2,8,25],
        'min_samples_split': [100, 500,1000],
        'min_samples_leaf': [100, 200,1000]
        }

In [72]:
grid = GridSearchCV(tree, params, scoring='f1', cv=5, n_jobs=-1)

In [73]:
grid.fit(x,y)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=100, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'criterion': ['entropy', 'gini'], 'min_samples_leaf': [100, 200, 1000], 'min_samples_split': [100, 200, 500, 1000], 'max_depth': [2, 8, 25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [75]:
#Посмотрим, что получилось: 
grid.cv_results_

{'mean_fit_time': array([0.14906683, 0.13846893, 0.1345109 , 0.13389006, 0.12655034,
        0.1351747 , 0.12908421, 0.1350193 , 0.12428975, 0.14971433,
        0.13840866, 0.14438939, 0.44710827, 0.4253345 , 0.4185317 ,
        0.41220145, 0.42001657, 0.41369834, 0.41953807, 0.39835439,
        0.33927536, 0.34112711, 0.34256091, 0.33333249, 0.69096017,
        0.73314557, 0.6838697 , 0.72532363, 0.85659003, 1.07065234,
        0.61169343, 0.51411071, 0.36435347, 0.35259032, 0.37203617,
        0.37011847, 0.12059612, 0.13710265, 0.13109283, 0.13389006,
        0.14989686, 0.1220984 , 0.14376621, 0.14330277, 0.12735376,
        0.13573141, 0.15123568, 0.13588047, 0.41174388, 0.43004036,
        0.41043696, 0.39104805, 0.40990405, 0.40327401, 0.39098506,
        0.38081789, 0.34665303, 0.38355923, 0.36844587, 0.7598289 ,
        0.94167724, 0.64431343, 0.60353613, 0.55169992, 0.54564986,
        0.56070285, 0.49362273, 0.46631041, 0.37085786, 0.42631321,
        0.38437977, 0.50342293]

In [76]:
grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=25,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=100, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [77]:
grid.best_score_

0.1625252485330728

# А если гиперпараметров, слишком много?

In [87]:
random = RandomizedSearchCV(tree, params, scoring='f1', cv=5, n_jobs=-1, n_iter=5)

In [88]:
random.fit(x,y)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=100, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          fit_params=None, iid=True, n_iter=5, n_jobs=-1,
          param_distributions={'criterion': ['entropy', 'gini'], 'min_samples_leaf': [100, 200, 1000], 'min_samples_split': [100, 200, 500, 1000], 'max_depth': [2, 8, 25]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='f1', verbose=0)

In [89]:
random.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=25,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=100, min_samples_split=500,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [90]:
random.best_score_

0.15700159721954943

In [91]:
random.cv_results_

{'mean_fit_time': array([0.43543811, 0.13086061, 0.56746206, 0.42202601, 0.35117898]),
 'mean_score_time': array([0.01144576, 0.00844235, 0.01324763, 0.00838733, 0.00683928]),
 'mean_test_score': array([0.08230342, 0.00913182, 0.1570016 , 0.13313979, 0.14303996]),
 'mean_train_score': array([0.0857545 , 0.00966209, 0.16811574, 0.13846769, 0.15156411]),
 'param_criterion': masked_array(data=['entropy', 'gini', 'entropy', 'gini', 'gini'],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[8, 2, 25, 8, 8],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[1000, 200, 100, 200, 100],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[100, 200, 500, 100, 1000],
              mask=[False, False