# Случайный лес

1. Построить параллельный ансамбль (бэггинг) решающих деревьев используя случайный лес.
2. Получить предсказания и проверить качество через каппа-метрику.

## Подключить библиотеки

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import cohen_kappa_score, confusion_matrix, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
data = pd.read_csv("../data/prudential/train.csv.gz")
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


## Предобработка данных

In [3]:
data["Product_Info_2_1"] = data["Product_Info_2"].str.slice(0, 1)
data["Product_Info_2_2"] = pd.to_numeric(data["Product_Info_2"].str.slice(1, 2))
data.drop(labels=["Product_Info_2"], axis=1, inplace=True)

In [4]:
list_df = [data]
for un in data["Product_Info_2_1"].unique():
    s = pd.DataFrame()
    s["Product_Info_2_1_" + un] = data["Product_Info_2_1"].isin([un]).astype("int8")
    list_df.append(s)
data = pd.concat(list_df, axis=1)
data.drop(labels=["Product_Info_2_1"], axis=1, inplace=True)

In [5]:
data.fillna(value=-1, inplace=True)
print(data.head())

   Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
0   2               1              10        0.076923               2   
1   5               1              26        0.076923               2   
2   6               1              26        0.076923               2   
3   7               1              10        0.487179               2   
4   8               1              26        0.230769               2   

   Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
0               1               1  0.641791  0.581818  0.148536  ...   
1               3               1  0.059701  0.600000  0.131799  ...   
2               3               1  0.029851  0.745455  0.288703  ...   
3               3               1  0.164179  0.672727  0.205021  ...   
4               3               1  0.417910  0.654545  0.234310  ...   

   Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword_48  Response  \
0                   0                   0            

## Набор столбцов для расчета

In [6]:
columns_groups = [
    "Insurance_History",
    "InsuredInfo",
    "Medical_Keyword",
    "Family_Hist",
    "Medical_History",
    "Product_Info",
]
columns = [
    "Wt",
    "Ht",
    "Ins_Age",
    "BMI",
]

for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print(columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keywor

## Нормализация данных

In [7]:
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data, columns=columns)))
columns_transformed = data_transformed.columns
data_transformed["Response"] = data["Response"]

In [8]:
data_transformed = reduce_mem_usage(data_transformed)
print(data_transformed.info())

Потребление памяти меньше на - 42.87 Мб (минус 75.1%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 126 entries, 0 to Response
dtypes: float16(125), int8(1)
memory usage: 14.2 MB
None


## Разделение данных

In [9]:
data_train, data_test = train_test_split(data_transformed, test_size=0.2)
print(data_train.head())

              0         1         2         3         4         5         6  \
12840 -0.161133  0.759277 -0.921387 -0.607422  0.611816 -0.169434 -1.159180   
18004 -1.242188 -0.710449 -1.753906 -1.175781  0.611816 -0.169434 -1.159180   
23743 -0.043610  0.759277  0.213989 -0.472656  0.611816 -0.169434  0.862305   
51276 -0.090637 -0.465576  1.878906  0.228394  0.611816 -0.169434  0.862305   
28561  1.013672  0.024353  1.273438  1.288086 -1.634766  5.902344  0.862305   

              7         8         9  ...       116       117       118  \
12840  1.100586 -1.156250  1.130859  ... -0.083679  0.441650 -0.149292   
18004  1.100586 -1.156250  1.130859  ... -0.083679  0.441650 -0.149292   
23743  0.043671  0.863770 -0.928711  ... -0.083679  0.441650 -0.149292   
51276  0.043671  0.861328 -0.928711  ... -0.083679 -2.263672 -0.149292   
28561 -1.013672  0.866211 -0.928711  ... -0.083679  0.441650 -0.149292   

            119       120       121       122       123      124  Response  
128

## Перекрестная проверка случайного леса

Каждое дерево (по умолчанию их 100) строиться на своей части выборки со своим набором параметров (max_features). Решение принимается путем голосования деревьев.

$$ estimators = \dfrac{N}{(20-100) * fold * class} $$

где:
- N - размер выборки
- fold - число разбиений
- class - количество классов в предсказании

In [10]:
x = pd.DataFrame(data_train, columns=columns_transformed)
model = RandomForestClassifier(
    random_state=7,
    n_estimators=77,
    max_depth=17,
    max_features=27,
    min_samples_leaf=30,
)

In [11]:
tree_params = {
    "max_depth": range(15, 17),
    "max_features": range(26, 28),
    "n_estimators": range(75, 77),
    "min_samples_leaf": range(19, 21),
} 

tree_grid = GridSearchCV(
    model,
    tree_params,
    cv=5,
    n_jobs=2,
    verbose=True, scoring=make_scorer(cohen_kappa_score)
)

tree_grid.fit(x, data_train["Response"])

Fitting 5 folds for each of 16 candidates, totalling 80 fits


## Оптимальные параметры

In [12]:
print(tree_grid.best_params_)

{'max_depth': 16, 'max_features': 27, 'min_samples_leaf': 19, 'n_estimators': 76}


## Итоговая модель

In [13]:
model = RandomForestClassifier(
    random_state=17,
    min_samples_leaf=tree_grid.best_params_["min_samples_leaf"],
    max_features=tree_grid.best_params_["max_features"],
    max_depth=tree_grid.best_params_["max_depth"],
    n_estimators=tree_grid.best_params_["n_estimators"],
)

model.fit(x, data_train["Response"])

## Предсказание данных

In [14]:
x_test = pd.DataFrame(data_test, columns=columns_transformed)
data_test["target"] = model.predict(x_test)

## Оценка модели

In [15]:
print(f'Случайный лес: {round(cohen_kappa_score(data_test["target"], data_test["Response"], weights="quadratic"), 3)}')

Случайный лес: 0.499


## Матрица неточностей

In [18]:
print(f'Случайный лес:\n\n {confusion_matrix(data_test["target"], data_test["Response"])}')

Случайный лес:

 [[ 139   69   16   14   12   34   11    3]
 [ 181  323   10    0   69   53    5    4]
 [  16    8   52   11    1    0    0    0]
 [  43   26   56  195    0    5    0    1]
 [ 125  163   21    0  572   83    7    5]
 [ 296  301   26   47  229 1228  342  127]
 [ 127  112    6    2   51  279  617  132]
 [ 324  307    9   40  102  558  651 3631]]
