# Иерархия логистической регрессии

Построить 4 модели логистической регрессии:
1. для 8, 6 и остальных классов
2. для 2, 5 и остальных классов
3. для 1, 7 и остальных классов
4. для 3 и 4 -по убыванию частоты значения

Использовать перекрестную проверку при принятии решения об оптимальном наборе столбцов

Проверить предсказание через каппа-метрику.

## Подключить библиотеки

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import cohen_kappa_score, confusion_matrix, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
data = pd.read_csv("../data/prudential/train.csv.gz")
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


## Предобработка данных

In [3]:
data["Product_Info_2_1"] = data["Product_Info_2"].str.slice(0, 1)
data["Product_Info_2_2"] = pd.to_numeric(data["Product_Info_2"].str.slice(1, 2))
data.drop(labels=["Product_Info_2"], axis=1, inplace=True)

In [4]:
list_df = [data]
for un in data["Product_Info_2_1"].unique():
    s = pd.DataFrame()
    s["Product_Info_2_1_" + un] = data["Product_Info_2_1"].isin([un]).astype("int8")
    list_df.append(s)
data = pd.concat(list_df, axis=1)
data.drop(labels=["Product_Info_2_1"], axis=1, inplace=True)

In [5]:
data.fillna(value=-1, inplace=True)
print(data.head())

   Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
0   2               1              10        0.076923               2   
1   5               1              26        0.076923               2   
2   6               1              26        0.076923               2   
3   7               1              10        0.487179               2   
4   8               1              26        0.230769               2   

   Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
0               1               1  0.641791  0.581818  0.148536  ...   
1               3               1  0.059701  0.600000  0.131799  ...   
2               3               1  0.029851  0.745455  0.288703  ...   
3               3               1  0.164179  0.672727  0.205021  ...   
4               3               1  0.417910  0.654545  0.234310  ...   

   Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword_48  Response  \
0                   0                   0            

## Оптимизировать память

In [6]:
data = reduce_mem_usage(data)
print(data.info())

Потребление памяти меньше на - 49.49 Мб (минус 84.9%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 133 entries, Id to Product_Info_2_1_B
dtypes: float16(18), int16(1), int32(1), int8(113)
memory usage: 8.8 MB
None


## Набор столбцов для расчета

In [7]:
columns_groups = [
    "Insurance_History",
    "InsuredInfo",
    "Medical_Keyword",
    "Family_Hist",
    "Medical_History",
    "Product_Info",
]
columns = [
    "Wt",
    "Ht",
    "Ins_Age",
    "BMI",
]

for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print(columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keywor

## Предобработка данных

In [8]:
scaler = preprocessing.StandardScaler()
data_transformed = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data, columns=columns)))
columns_transformed = data_transformed.columns
data_transformed["Response"] = data["Response"]

## Разделение данных

In [9]:
data_train, data_test = train_test_split(data_transformed, test_size=0.2)
print(data_train.head())

              0         1         2         3         4         5         6  \
18391 -1.429833 -0.707398  1.424839 -1.434082  0.611857 -0.169414 -1.159587   
32137 -1.124099 -1.687657 -0.921336 -0.431258  0.611857 -0.169414  0.862391   
2700   0.662319  1.246540 -0.164865  0.030201  0.611857 -0.169414 -1.159587   
21529 -1.453140 -1.937656  0.592844 -0.794832  0.611857 -0.169414 -1.159587   
1914  -0.514003 -0.463978  0.743891 -0.333373 -1.634368 -0.169414  0.862391   

              7         8         9  ...       116       117       118  \
18391  1.101046 -1.156735  1.130555  ... -0.083689  0.441621 -0.149284   
32137 -1.013721  0.862242 -0.928723  ... -0.083689  0.441621 -0.149284   
2700   1.101046 -1.156735  1.130555  ... -0.083689 -2.264385 -0.149284   
21529  1.101046 -1.156735  1.130555  ... -0.083689  0.441621 -0.149284   
1914  -1.013721  0.862242 -0.928723  ... -0.083689  0.441621 -0.149284   

            119       120       121       122       123       124  Response  
18

## Логистическая регрессия

В обучающих данных пометим все классы, кроме 6 и 8, как 0 - и проведем обучение.
Затем в оставшихся данных (в кот класс не равен 6 или 8) заменим все классы, кроме 7 и 1, на 0 - и снова проведем обучение и т.д.

### Оптимальный набор столбцов
Для каждого уровня иерархии это будет свой набор столбцов в исходных данных.

### Перекрестная проверка
Разбиваем обущающую выборку на k частей. На каждой части обучаем модель. Затем проверяем 1-ю, 2-ю, 3-ю, 4-ю на 5-ой; 1-ю, 2-ю, 3-ю, 5-ю на 4-ой и т.д.

In [10]:
def regression_model(df, columns):
    x = pd.DataFrame(df, columns=columns)
    model = LogisticRegression(max_iter=1000)
    model.fit(x, df["Response"])
    return model

In [11]:
def logistic_regression(df_train, columns):
    model = regression_model(df_train, columns)
    logr_grid = GridSearchCV(model, {}, cv=5, n_jobs=2, scoring=make_scorer(cohen_kappa_score))
    x = pd.DataFrame(df_train, columns=columns)
    logr_grid.fit(x, df_train["Response"])
    return logr_grid.best_score_

In [15]:
def find_opt_columns(data_train):
    kappa_score_opt = 0
    columns_opt = []
    for col in columns_transformed:
        kappa_score = logistic_regression(data_train, [col])
        if kappa_score > kappa_score_opt:
            columns_opt = [col]
            kappa_score_opt = kappa_score
    for col in columns_transformed:
        if col not in columns_opt:
            columns_opt.append(col)
            kappa_score = logistic_regression(data_train, columns_opt)
            if kappa_score < kappa_score_opt:
                columns_opt.pop()
            else:
                kappa_score_opt = kappa_score
    return columns_opt, kappa_score_opt

Будем последовательно урезать набор данных при расчете более глубоких моделей: после получения разделения на 8 отсечем все данные со значение 8 и т.д.
После каждого расчета модели будем вычислять значения в проверочной выборке. Проверочную выборку нулями заполнять не будем.
Набор разделений 6/8, 2/5, 1/7, 3/4 дает наибольшую точность

In [16]:
responses = [
    [6, 8],
    [2, 5],
    [1, 7],
    [3, 4],
]

logr_models = [{} for _ in range(len(responses))]
data_train_current = data_train.copy()

for i, response in enumerate(responses):
    m_train = data_train_current.copy()
    if response != [3, 4]:
        m_train["Response"] = m_train["Response"].apply(lambda x: 0 if x not in response else x)
    columns_opt, kappa_score_opt = find_opt_columns(m_train)
    logr_models[i] = {
        "model": regression_model(m_train, columns_opt),
        "columns": columns_opt,
    }
    if response != [3, 4]:
        data_train_current = data_train_current[~data_train_current["Response"].isin(response)]
    
    print(f"{i} - {kappa_score_opt} - {columns_opt}", end="/n/n")

0 - 0.4238871994942393 - [3, 0, 2, 4, 5, 6, 7, 8, 13, 15, 16, 17, 18, 21, 22, 23, 27, 29, 30, 33, 34, 37, 41, 42, 43, 44, 45, 46, 47, 48, 49, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 67, 68, 70, 71, 72, 75, 76, 77, 78, 79, 86, 87, 88, 89, 91, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 105, 106, 108, 109, 110, 111, 112, 113, 115, 116, 117, 120, 123]/n/n1 - 0.18115480282977808 - [21, 2, 3, 4, 5, 9, 13, 16, 17, 18, 19, 20, 22, 23, 25, 32, 33, 35, 36, 38, 39, 41, 46, 47, 48, 49, 50, 52, 53, 64, 66, 67, 68, 70, 72, 73, 75, 76, 80, 81, 86, 87, 89, 92, 94, 95, 96, 97, 99, 102, 106, 107, 113, 115, 119, 120, 121, 122, 123, 124]/n/n2 - 0.5419864599391615 - [75, 0, 1, 2, 4, 5, 6, 8, 13, 14, 16, 17, 18, 19, 20, 21, 23, 24, 27, 30, 32, 33, 38, 40, 43, 44, 45, 48, 50, 54, 56, 57, 58, 59, 60, 61, 62, 64, 66, 70, 71, 72, 73, 76, 77, 82, 83, 84, 86, 87, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 102, 103, 106, 107, 109, 110, 111, 112, 115, 116, 118, 120]/n/n3 - 0.48838286102787176 - [3, 4, 5,

## Предсказание данных и оценка модели
Последовательно считаем предсказания для каждой классификации. После этого объединяем предсказание по иерархии.

In [17]:
def logr_hierarchy(x):
    for response in range(len(responses)):
        if x["target" + str(response)] > 0:
            x["target"] = x["target" + str(response)]
            break;
    return x

In [18]:
for response in range(len(responses)):
    model = logr_models[response]["model"]
    columns_opt = logr_models[response]["columns"]
    x = pd.DataFrame(data_test, columns=columns_opt)
    data_test["target" + str(response)] = model.predict(x)

In [19]:
data_test = data_test.apply(logr_hierarchy, axis=1, result_type="expand")
print(data_test.head())

              0         1         2         3         4         5         6  \
42084  1.367014  0.759700 -0.694766  1.144893  0.611857 -0.169414  0.862391   
30541 -0.091734  1.003120 -0.164865 -0.658991  0.611857 -0.169414 -1.159587   
19980  2.773664 -0.707398 -1.526761  4.325164  0.611857 -0.169414 -1.159587   
2819  -1.242006 -1.444237  0.062943 -0.764867  0.611857 -0.169414  0.862391   
13532  0.544412  0.516280  0.289513  0.361812  0.611857 -0.169414  0.862391   

              7         8         9  ...       121       122       123  \
42084 -1.013721  0.867624 -0.928723  ... -0.623305 -0.216001 -0.128866   
30541  1.101046 -1.156735  1.130555  ... -0.623305 -0.216001 -0.128866   
19980  1.101046 -1.156735  1.130555  ... -0.623305 -0.216001 -0.128866   
2819   0.043662  0.864934 -0.928723  ... -0.623305 -0.216001 -0.128866   
13532 -1.013721  0.864260 -0.928723  ... -0.623305 -0.216001 -0.128866   

            124  Response  target0  target1  target2  target3  target  
42084 -0

In [20]:
print(f'{cohen_kappa_score(data_test["target"], data_test["Response"], weights="quadratic"), 3}')

(0.4983902944210179, 3)


## Матрица неточностей

In [21]:
print(confusion_matrix(data_test["target"], data_test["Response"]))

[[ 467  347   27   26  193  470  149  141]
 [ 173  230    4    1   87   75   10    4]
 [  41   49   76   33   86  161   22    4]
 [  57   54   60  147   23  274   50   64]
 [  68  132   12    0  210   58   22    9]
 [  18   25   13   21   11  138   23   38]
 [ 232  274   13   13  342  739  827  492]
 [ 190  163   10   45  141  353  507 3133]]
