# 1. Импорты

In [86]:
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [31]:
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

from sklearn.metrics import f1_score, roc_auc_score, roc_curve, mean_squared_error

# 2. Обучение

In [22]:
train = pd.read_csv("train.csv")

y = train["Survived"]
X = train.drop(columns=["PassengerId", "Survived", "Name", "Ticket"])
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.2500,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.9250,,S
3,1,female,35.0,1,0,53.1000,C123,S
4,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,,S
887,1,female,19.0,0,0,30.0000,B42,S
888,3,female,,1,2,23.4500,,S
889,1,male,26.0,0,0,30.0000,C148,C


In [36]:
class AgeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, step_of_partitions=10, mode="r"):
        self.mode = mode
        self.step_of_partitions = step_of_partitions
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        ages = [x for x in range(self.step_of_partitions, int(X_copy.Age.max())+self.step_of_partitions, self.step_of_partitions)]
        for step in ages:
            X_copy.Age.mask(((X_copy.Age > step-self.step_of_partitions) & (X_copy.Age <= step)), step, inplace=True)
        
        if self.mode == "r":
            X_copy.loc[:, 'Age'] = X_copy.Age.apply(lambda x: random.choice(ages) if np.isnan(x) else x)

        if self.mode == "s":
            counts = X_copy.Age.value_counts(normalize=True).round(3).sort_index()
            
            ages = counts.index.values
            weights = counts.values
            
            X_copy.loc[:, 'Age'] = X_copy.Age.apply(lambda x: random.choices(ages, weights, k=1)[0] if np.isnan(x) else x)

        X_copy.Age /= 10
        return X_copy

In [37]:
class CabinTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, mode="r"):
        self.mode = mode
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        
        X_copy.Cabin = X_copy.Cabin.apply(lambda x: x if pd.isnull(x) else x[0])
        
        if self.mode == "r":
            cabins = X_copy.Cabin.unique()
            X_copy.loc[:, 'Cabin'] = X_copy.Cabin.apply(lambda x: random.choice(cabins) if pd.isnull(x) else x)

        if self.mode == "s":
            counts = X_copy.Cabin.value_counts(normalize=True).round(3).sort_index()
            
            cabins = counts.index.values
            weights = counts.values
            
            X_copy.loc[:, 'Cabin'] = X_copy.Cabin.apply(lambda x: random.choices(cabins, weights, k=1)[0] if pd.isnull(x) else x)
        
        X_copy.Cabin, self.uniques = pd.factorize(X_copy.Cabin)
        
        return X_copy

In [56]:
estimators = [
    ("age-transformer", AgeTransformer(step_of_partitions=10, mode="r")),
    ("cabin-transformer", CabinTransformer(mode="r")),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder())
]

In [94]:
preprocessing = Pipeline(estimators)
X_ = pd.DataFrame(preprocessing.fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.20, random_state=42)

In [95]:
X_.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,2.0,1.0,2.0,1.0,0.0,18.0,1.0,2.0
1,0.0,0.0,3.0,1.0,0.0,207.0,2.0,0.0
2,2.0,0.0,2.0,0.0,0.0,41.0,2.0,2.0
3,0.0,0.0,3.0,1.0,0.0,189.0,2.0,2.0
4,2.0,1.0,3.0,0.0,0.0,43.0,3.0,2.0
5,2.0,1.0,2.0,0.0,0.0,51.0,2.0,1.0
6,0.0,1.0,5.0,0.0,0.0,186.0,4.0,2.0
7,2.0,1.0,0.0,3.0,1.0,124.0,1.0,2.0
8,2.0,0.0,2.0,0.0,2.0,74.0,5.0,2.0
9,1.0,0.0,1.0,1.0,0.0,154.0,6.0,0.0


In [96]:
model = DecisionTreeClassifier(random_state=42)

In [97]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f"ROC-AUC Score is {roc_auc_score(y_test, y_pred)}")

ROC-AUC Score is 0.7668597168597169


## 2.1. Решётчатый поиск с перекрёстной проверкой

In [110]:
%%time
parameters = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [depth for depth in range(1, 30, 1)],
    "min_samples_split": [split for split in range(20, 50, 10)],
    "min_samples_leaf": [leaf for leaf in range(10, 100, 10)],
    "max_features": [1, 2, 3, 4, 5, 6, 7, 8]
}

grid = GridSearchCV(model, parameters, cv=10, scoring="roc_auc", error_score="raise")
grid.fit(X_train, y_train)

print(f"Best params is `{grid.best_params_}` with ROC-AUC score: {grid.best_score_}", end="\n")

Best params is `{'criterion': 'entropy', 'max_depth': 5, 'max_features': 3, 'min_samples_leaf': 10, 'min_samples_split': 30, 'splitter': 'best'}` with ROC-AUC score: 0.8511323347434459
CPU times: user 11min 54s, sys: 1.18 s, total: 11min 55s
Wall time: 11min 56s


In [111]:
model = grid.best_estimator_

In [112]:
export_graphviz(
    model, 
    out_file="models/decision_tree.dot", 
    feature_names=X.columns,
    rounded=True,
    filled=True
)