# Курсовой проект:
# Машинное обучение 2 "Классификация и ансамбли"
ПОСТАНОВКА ЗАДАЧИ:
Загрузите данные, приведите их к числовым, заполните пропуски, нормализуйте данные и оптимизируйте память.

Сформируйте параллельный ансамбль из CatBoost, градиентного бустинга, XGBoost и LightGBM. Используйте лучшие гиперпараметры, подобранные ранее, или найдите их через перекрестную проверку. Итоговое решение рассчитайте на основании самого точного предсказания класса у определенной модели ансамбля: выберите для каждого класса модель, которая предсказывает его лучше всего.

Проведите расчеты и выгрузите результат в виде submission.csv

Данные:
* video.ittensive.com/machine-learning/prudential/train.csv.gz
* video.ittensive.com/machine-learning/prudential/test.csv.gz
* video.ittensive.com/machine-learning/prudential/sample_submission.csv.gz

Итоговый файл с кодом (.py или .ipynb) выложите в github с портфолио.

In [1]:
# Подключение библиотек

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix,make_scorer
from sklearn import preprocessing
from catboost import Pool, CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Загрузка данных
data = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/train.csv.gz")
print (data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


In [3]:
# Предобработка данных

def data_preprocess (df):
    df["Product_Info_2_1"] = df["Product_Info_2"].str.slice(0, 1)
    df["Product_Info_2_2"] = pd.to_numeric(df["Product_Info_2_1"].str.slice(1, 2))
    df.drop(labels=["Product_Info_2"], axis=1, inplace=True)
    for l in df["Product_Info_2_1"].unique():
        df["Product_Info_2_1" + l] = df["Product_Info_2_1"].isin([l]).astype("int8")
    df.drop(labels=["Product_Info_2_1"],axis=1, inplace=True)
    df.fillna(value=-1, inplace=True)
    data["Response"] = data["Response"] - 1                                       
    return df 

In [4]:
data = data_preprocess(data)

In [5]:
# Набор столбцов для расчета
columns_groups = ["Insurance_History", "InsuredInfo", "Medical_Keyword",
          "Family_Hist", "Medical_History", "Product_Info"]
columns = ["Wt", "Ht", "Ins_Age", "BMI"]
for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print (columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keywor

In [6]:
# Нормализуем весь исходный массив данных.
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data,
                                                                  columns=columns)))
columns_transformed = data_transformed.columns
data_transformed["Response"] = data["Response"]

In [7]:
# Оптимизация потребления памяти.

def reduce_mem_usage (df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == "float":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo("f2").min and c_max < np.finfo("f2").max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo("f4").min and c_max < np.finfo("f4").max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        else:
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print ("Потребление памяти меньше на",
          round(start_mem - end_mem, 2),
          "Мб (минус",
           round(100 * (start_mem - end_mem) / start_mem, 1),
           "%)")
    return df

In [8]:
data_transformed = reduce_mem_usage(data_transformed)
print (data_transformed.info())
print (data_transformed.head())

Потребление памяти меньше на 42.87 Мб (минус 75.1 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 126 entries, 0 to Response
dtypes: float16(125), int8(1)
memory usage: 14.2 MB
None
          0         1         2         3         4         5         6  \
0 -1.618164 -1.690430  1.198242 -1.198242 -1.634766 -0.169434  0.862305   
1 -1.805664 -1.445312 -1.753906 -1.613281  0.611816 -0.169434  0.862305   
2 -0.043610  0.514160 -1.905273 -0.332764  0.611816 -0.169434 -1.159180   
3 -0.983398 -0.465576 -1.224609 -0.957520  0.611816 -0.169434 -1.159180   
4 -0.654297 -0.710449  0.062622 -0.371582  0.611816 -0.169434 -1.159180   

          7         8         9  ...       116       117       118  119  \
0 -1.013672  0.862305 -0.928711  ... -0.083679 -2.263672 -0.149292  0.0   
1 -1.013672  0.861328 -0.928711  ... -0.083679  0.441650 -0.149292  0.0   
2  1.100586 -1.156250  1.130859  ... -0.083679  0.441650 -0.149292  0.0   
3  1.100586 -1.156250  1.13

In [9]:
# Построение базовых моделей
x = pd.DataFrame(data_transformed, columns=columns_transformed)

In [10]:
# CatBoost
model_cb = CatBoostClassifier(iterations=10000, learning_rate=0.57,
                random_seed=17, depth=6, loss_function="MultiClass",
                bootstrap_type="MVS", l2_leaf_reg=2)
model_cb.fit(Pool(data=x, label=data["Response"]))

0:	learn: 1.5448510	total: 211ms	remaining: 35m 14s
1:	learn: 1.4060379	total: 270ms	remaining: 22m 29s
2:	learn: 1.3593674	total: 325ms	remaining: 18m 2s
3:	learn: 1.3258375	total: 373ms	remaining: 15m 31s
4:	learn: 1.3012680	total: 425ms	remaining: 14m 10s
5:	learn: 1.2874980	total: 482ms	remaining: 13m 22s
6:	learn: 1.2747848	total: 532ms	remaining: 12m 38s
7:	learn: 1.2594135	total: 586ms	remaining: 12m 11s
8:	learn: 1.2500449	total: 633ms	remaining: 11m 42s
9:	learn: 1.2423612	total: 698ms	remaining: 11m 37s
10:	learn: 1.2270759	total: 755ms	remaining: 11m 25s
11:	learn: 1.2238039	total: 796ms	remaining: 11m 2s
12:	learn: 1.2200677	total: 855ms	remaining: 10m 57s
13:	learn: 1.2117266	total: 936ms	remaining: 11m 7s
14:	learn: 1.2025534	total: 1s	remaining: 11m 7s
15:	learn: 1.1990212	total: 1.06s	remaining: 11m 3s
16:	learn: 1.1943186	total: 1.13s	remaining: 11m 4s
17:	learn: 1.1822878	total: 1.19s	remaining: 11m 1s
18:	learn: 1.1757510	total: 1.25s	remaining: 10m 56s
19:	learn: 1.

<catboost.core.CatBoostClassifier at 0x23880083b80>

In [11]:
# Градиентный бустинг

model_gb = GradientBoostingClassifier(random_state=17, max_depth=13, max_features=26,
                      n_estimators=75,min_samples_leaf=21)
model_gb.fit(x, data["Response"])


In [12]:
model_xgb = XGBClassifier(max_depth=17, max_features=27,
                      n_estimators=76,min_samples_leaf=20)
model_xgb.fit(x, data["Response"])

Parameters: { "max_features", "min_samples_leaf" } are not used.



In [13]:
# LightGBM

model_lgb = lgb.LGBMRegressor(random_state=17,max_depth=18,
                      min_child_samples=17, num_leaves=35,
                             n_estimators=10000)
model_lgb.fit(x, data["Response"])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2489
[LightGBM] [Info] Number of data points in the train set: 59381, number of used features: 124
[LightGBM] [Info] Start training from score 4.636837


In [14]:
# Загрузка данных для расчета

data_test = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/test.csv.gz")
data_test = data_preprocess(data_test)
data_test = reduce_mem_usage(data_test)
data_test_transformed = pd.DataFrame(scaler.transform(pd.DataFrame(data_test,
                                             columns=columns)))
print (data_test_transformed.info())
print (data_test_transformed.head())

Потребление памяти меньше на 16.32 Мб (минус 84.8 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19765 entries, 0 to 19764
Columns: 125 entries, 0 to 124
dtypes: float32(125)
memory usage: 9.4 MB
None
        0         1         2         3         4         5         6    \
0  0.519789  1.002921  1.045952  0.022140  0.611857 -0.169414 -1.159587   
1  0.215425  0.266273  1.122714  0.126020 -1.634368 -0.169414  0.862391   
2  0.308653  0.022915  0.894903  0.405695  0.611857 -0.169414 -1.159587   
3 -0.278139 -0.707156  0.592804  0.143999 -1.634368 -0.169414  0.862391   
4 -0.513953 -0.463799 -0.542540 -0.333447  0.611857 -0.169414 -1.159587   

        7         8         9    ...       115       116       117       118  \
0  1.101046 -1.156735  1.130555  ...  0.559558 -0.083689  0.441621 -0.149284   
1 -1.013721  0.864261 -0.928723  ... -0.892015 -0.083689  0.441621 -0.149284   
2  1.101046 -1.156735  1.130555  ... -0.652247 -0.083689  0.441621 -0.149284   
3 -1.013721  0.862242 

In [15]:
# Предсказание данных CatBoost

data_test["target_cb"] = model_cb.predict(Pool(data=data_test_transformed))


  data_test["target_cb"] = model_cb.predict(Pool(data=data_test_transformed))


In [16]:
# Градиентный бустинг

data_test["target_gb"] = model_gb.predict(data_test_transformed)


  data_test["target_gb"] = model_gb.predict(data_test_transformed)


In [17]:
# XGBoost

data_test["target_xgb"] = model_xgb.predict(data_test_transformed)


  data_test["target_xgb"] = model_xgb.predict(data_test_transformed)


In [18]:
# LightGBM

data_test["target_lgb"] = np.round(model_lgb.predict(data_test_transformed).astype("int8"))


  data_test["target_lgb"] = np.round(model_lgb.predict(data_test_transformed).astype("int8"))


In [19]:
def vote_class (x):
    if x.target_xgb == 2:
        class_= x.target_xgb
    elif x.target_lgb == 7:
        class_ = x.target_lgb
    elif x.target_cb == 0:
        class_ = x.target_cb
    else:
        class_ = x.target_gb
    x["Response"] = class_ + 1
    
    return x

    

In [20]:
data_test = data_test.apply(vote_class, axis=1)
print (data_test.head())


     Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
0   1.0             1.0            26.0        0.487061             2.0   
1   3.0             1.0            26.0        0.076904             2.0   
2   4.0             1.0            26.0        0.144653             2.0   
3   9.0             1.0            26.0        0.151733             2.0   
4  12.0             1.0            26.0        0.076904             2.0   

   Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
0             3.0             1.0  0.611816  0.781738  0.338867  ...   
1             3.0             1.0  0.626953  0.727051  0.311768  ...   
2             3.0             1.0  0.582031  0.708984  0.320068  ...   
3             1.0             1.0  0.522461  0.654785  0.267822  ...   
4             3.0             1.0  0.298584  0.672852  0.246826  ...   

   Product_Info_2_1D  Product_Info_2_1A  Product_Info_2_1E  Product_Info_2_1B  \
0                1.0               

In [21]:
# Формирование результата
# Для формирования результата загрузим пример данных для отправки и заменим в нем столбец класса на рассчитанный ранее

submission = pd.read_csv("https://video.ittensive.com/machine-learning/prudential/sample_submission.csv.gz")
print (submission.head())

   Id  Response
0   1         8
1   3         8
2   4         8
3   9         8
4  12         8


In [22]:
submission["Response"] = data_test["Response"].astype("int8")
print (submission.head())

   Id  Response
0   1         7
1   3         8
2   4         6
3   9         7
4  12         8


In [23]:
# Выгрузка результатов

submission.to_csv("submission.csv", index=False)
print (len(submission["Response"]) + 1)

19766


In [24]:
# Само_проверка
# Проверим насколько ансамбль хорошо предсказывает обучающие данные

data_copy = data_transformed.copy()
x_copy = pd.DataFrame(data_copy, columns= columns_transformed)

In [25]:
copy_dataset = Pool(data=x_copy, label=data_copy["Response"])
data_copy["target_cb"] = model_cb.predict(copy_dataset)
data_copy["target_gb"] = model_gb.predict(x.copy())
data_copy["target_xgb"] = model_xgb.predict(x.copy())
data_copy_lgb = np.round(model_lgb.predict(x.copy()).astype("int8"))

In [26]:
class_target = ["target_gb"]*8
def vote_class_enumerate (x):
    for target in enumerate(class_target):
        if target == _:
            x["Response"] = x[target]
            break
    return x
    

In [27]:
kappa_min = 0
for target_model in["cb", "gb", "xgb", "lgb"]:
    print ("Проверка модели:", target_model)
    target_model = "target_" + target_model
    for class_ in range(0, 8):
        target_model_prob = class_target[class_]
        class_target[class_] = target_model
        data_copy = data_copy.apply(vote_class_enumerate, axis=1)
        kappa = cohen_kappa_score(data_copy["Response"], 
                                  data["Response"], weights="quadratic")
        if kappa > kappa_min:
            kappa_min = kappa
        else:
            class_target[class_] = target_model_prob
    print ("Максимальная оценка:", kappa_min)
print (class_target)

Проверка модели: cb
Максимальная оценка: 0.9234997580080131
Проверка модели: gb
Максимальная оценка: 0.9234997580080131
Проверка модели: xgb
Максимальная оценка: 0.9234997580080131
Проверка модели: lgb
Максимальная оценка: 0.9234997580080131
['target_cb', 'target_gb', 'target_gb', 'target_gb', 'target_gb', 'target_gb', 'target_gb', 'target_gb']


In [29]:
data_copy = data_copy.apply(vote_class_enumerate, axis=1)

In [31]:
print ("Результат:",
           round(cohen_kappa_score(data_copy["Response"],
                        data["Response"], weights='quadratic'), 3))

Результат: 0.923


In [32]:
print (confusion_matrix(data_copy["Response"], data["Response"]))

[[    0     0     0     0     0     0     0     0     0]
 [ 6207     0     0     0     0     0     0     0     0]
 [    0  6552     0     0     0     0     0     0     0]
 [    0     0  1013     0     0     0     0     0     0]
 [    0     0     0  1428     0     0     0     0     0]
 [    0     0     0     0  5432     0     0     0     0]
 [    0     0     0     0     0 11233     0     0     0]
 [    0     0     0     0     0     0  8027     0     0]
 [    0     0     0     0     0     0     0 19489     0]]
