# Бустинг с XGBOOST

1. Построить XGBoost модель
2. Сделать предсказание и проверить качество через каппа-метрику

## Подключение библиотек

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import cohen_kappa_score, confusion_matrix, make_scorer
from xgboost import XGBClassifier
from sklearn import preprocessing

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
data = pd.read_csv("../data/prudential/train.csv.gz")
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


## Предобработка данных

In [3]:
data["Product_Info_2_1"] = data["Product_Info_2"].str.slice(0, 1)
data["Product_Info_2_2"] = pd.to_numeric(data["Product_Info_2"].str.slice(1, 2))
data.drop(labels=["Product_Info_2"], axis=1, inplace=True)

In [4]:
list_df = [data]
for un in data["Product_Info_2_1"].unique():
    s = pd.DataFrame()
    s["Product_Info_2_1_" + un] = data["Product_Info_2_1"].isin([un]).astype("int8")
    list_df.append(s)
data = pd.concat(list_df, axis=1)
data.drop(labels=["Product_Info_2_1"], axis=1, inplace=True)

In [5]:
data.fillna(value=-1, inplace=True)
data["Response"] = data["Response"] - 1
print(data.head())

   Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
0   2               1              10        0.076923               2   
1   5               1              26        0.076923               2   
2   6               1              26        0.076923               2   
3   7               1              10        0.487179               2   
4   8               1              26        0.230769               2   

   Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
0               1               1  0.641791  0.581818  0.148536  ...   
1               3               1  0.059701  0.600000  0.131799  ...   
2               3               1  0.029851  0.745455  0.288703  ...   
3               3               1  0.164179  0.672727  0.205021  ...   
4               3               1  0.417910  0.654545  0.234310  ...   

   Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword_48  Response  \
0                   0                   0            

## Набор столбцов для расчета

In [6]:
columns_groups = [
    "Insurance_History",
    "InsuredInfo",
    "Medical_Keyword",
    "Family_Hist",
    "Medical_History",
    "Product_Info",
]
columns = [
    "Wt",
    "Ht",
    "Ins_Age",
    "BMI",
]

for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print(columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keywor

## Нормализация данных

In [8]:
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data, columns=columns)))
columns_transformed = data_transformed.columns
data_transformed["Response"] = data["Response"]

In [9]:
data_transformed = reduce_mem_usage(data_transformed)
print(data_transformed.info())

Потребление памяти меньше на - 42.87 Мб (минус 75.1%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 126 entries, 0 to Response
dtypes: float16(125), int8(1)
memory usage: 14.2 MB
None


## Разделить данные

In [10]:
data_train, data_test = train_test_split(data_transformed, test_size=0.2)
print(data_train.head())

              0         1         2         3         4         5         6  \
32811  0.238281 -0.465576  1.500977  0.665039 -1.634766 -0.169434  0.862305   
39512  0.543945 -1.200195 -0.618652  1.694336  0.611816 -0.169434 -1.159180   
10505  0.426270  0.269287 -0.164429  0.384033  0.611816 -0.169434 -1.159180   
38681  0.543945  0.024353  0.365479  0.700195  0.611816  5.902344  0.862305   
12350  0.073853  0.024353 -0.391602  0.112183  0.611816 -0.169434  0.862305   

              7         8         9  ...       116      117       118  \
32811 -1.013672  0.862305 -0.928711  ... -0.083679  0.44165 -0.149292   
39512  1.100586 -1.156250  1.130859  ... -0.083679  0.44165 -0.149292   
10505  1.100586 -1.156250  1.130859  ... -0.083679  0.44165 -0.149292   
38681  0.043671  0.864746 -0.928711  ... -0.083679  0.44165 -0.149292   
12350 -1.013672  0.864258 -0.928711  ... -0.083679  0.44165 -0.149292   

            119       120       121       122       123      124  Response  
32811 -0.

## XGBoost

In [11]:
x = pd.DataFrame(data_train, columns=columns_transformed)
model =XGBClassifier(
    max_depth=17,
    max_features=27,
    n_estimators=76,
    min_samples_leaf=20
)

## Построить итоговую модель

In [12]:
model.fit(x, data_train["Response"])

Parameters: { "max_features", "min_samples_leaf" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




## Предсказание данных

In [13]:
x_test = pd.DataFrame(data_test, columns=columns_transformed)
data_test["target"] = model.predict(x_test)

## Оценка модели

In [14]:
print(f'XGBoost: {round(cohen_kappa_score(data_test["target"], data_test["Response"], weights="quadratic"), 3)}')

XGBoost: 0.546


## Матрица неточностей

In [15]:
print(f'XGBoost:\n\n {confusion_matrix(data_test["target"], data_test["Response"])}')

XGBoost:

 [[ 304  145   12    8   56  106   44   33]
 [ 184  328   16    7   94   97   24   30]
 [  24   16  100   15    0    1    0    0]
 [  40   26   62  176    1    6    0    0]
 [  94  140    8    0  596   80   12   15]
 [ 249  285   15   27  233 1235  285  135]
 [ 134  117    2    9   72  282  670  219]
 [ 239  236    4   29   88  421  528 3463]]
