# Логистическая регрессия

## Подключение библиотек

In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
# Read the gzip-compressed training table; pandas infers the compression
# from the ".gz" suffix.
train_path = "../data/prudential/train.csv.gz"
data = pd.read_csv(train_path)
# DataFrame.info() prints its own report and returns None, which is why the
# recorded output ends with a literal "None".
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


## Предобработка данных

In [3]:
# Split the "Product_Info_2" code into its first character (kept as text)
# and its second character (converted to a number), then drop the original.
product_codes = data["Product_Info_2"]
data["Product_Info_2_1"] = product_codes.str[:1]
data["Product_Info_2_2"] = pd.to_numeric(product_codes.str[1:2])
data.drop(columns=["Product_Info_2"], inplace=True)

In [4]:
# One-hot encode "Product_Info_2_1": one int8 indicator column per distinct
# value, in order of first appearance, appended after the existing columns.
letters = data["Product_Info_2_1"]
indicators = pd.DataFrame(
    {
        "Product_Info_2_1_" + letter: letters.isin([letter]).astype("int8")
        for letter in letters.unique()
    }
)
data = pd.concat([data, indicators], axis=1)
# The text column is no longer needed once the indicators exist.
data.drop(columns=["Product_Info_2_1"], inplace=True)

In [6]:
# Replace every missing value with the sentinel -1 (modifies `data` in place).
data.fillna(-1, inplace=True)
# Show the first five rows as a sanity check.
print(data.head(n=5))

   Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
0   2               1              10        0.076923               2   
1   5               1              26        0.076923               2   
2   6               1              26        0.076923               2   
3   7               1              10        0.487179               2   
4   8               1              26        0.230769               2   

   Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
0               1               1  0.641791  0.581818  0.148536  ...   
1               3               1  0.059701  0.600000  0.131799  ...   
2               3               1  0.029851  0.745455  0.288703  ...   
3               3               1  0.164179  0.672727  0.205021  ...   
4               3               1  0.417910  0.654545  0.234310  ...   

   Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword_48  Response  \
0                   0                   0            

## Оптимизация памяти

In [8]:
# Shrink the frame with the project helper. The recorded output below shows
# the float columns ending up as float16 and most integer columns as int8.
# NOTE(review): float16 keeps only about three significant decimal digits
# (0.230769 in the earlier output appears as 0.230713 later) - confirm this
# precision loss is acceptable for the features.
data = reduce_mem_usage(data)
# info() prints its report itself and returns None, hence the trailing "None".
print(data.info())

Потребление памяти меньше на - 49.49 Мб (минус 84.9%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 133 entries, Id to Product_Info_2_1_B
dtypes: float16(18), int16(1), int32(1), int8(113)
memory usage: 8.8 MB
None


## Набор столбцов для расчета

In [9]:
# Column-name prefixes: every column whose name starts with one of these is
# used as a feature.
columns_groups = [
    "Insurance_History",
    "InsuredInfo",
    "Medical_Keyword",
    "Family_Hist",
    "Medical_History",
    "Product_Info",
]
# Individually named features come first, followed by each prefixed family
# in the order given by columns_groups.
columns = ["Wt", "Ht", "Ins_Age", "BMI"]
columns += [
    name
    for prefix in columns_groups
    for name in data.columns
    if name.startswith(prefix)
]
print(columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keywor

## Z-нормализация

In [10]:
# Learn per-feature mean and standard deviation on the selected columns.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test rows influence the scaling statistics; consider
# fitting on the training part only.
scaler = preprocessing.StandardScaler()
feature_frame = data.reindex(columns=columns)
scaler.fit(feature_frame)

## Разделение данных

In [12]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every score computed from it, repeatable between runs.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)
print(data_train.head())

          Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
3241    4327               1              26        0.230713               2   
691      937               1              26        0.245850               2   
29637  39413               1              26        0.128174               2   
55530  73966               1              26        0.230713               2   
1991    2680               1              26        0.076904               2   

       Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
3241                3               1  0.626953  0.854492  0.343018  ...   
691                 3               1  0.656738  0.727051  0.282471  ...   
29637               3               1  0.492432  0.672852  0.393311  ...   
55530               3               1  0.432861  0.618164  0.382812  ...   
1991                3               1  0.626953  0.654785  0.288818  ...   

       Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword

## Логистическая регрессия

In [13]:
def regression_model(df, columns):
    """Fit a class-balanced logistic regression on ``df``.

    The features are the ``columns`` of ``df`` standardised with the
    notebook-level ``scaler``; the target is ``df["Response"]``.

    Args:
        df: DataFrame containing the feature columns and a "Response" column.
        columns: Names of the feature columns handed to the scaler.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    y = df["Response"]
    x = scaler.transform(pd.DataFrame(df, columns=columns))
    # `multi_class` is deliberately not passed: it is deprecated since
    # scikit-learn 1.5 and scheduled for removal, and the default lbfgs
    # solver already fits a multinomial model for more than two classes.
    model = LogisticRegression(max_iter=1000, class_weight="balanced")
    model.fit(x, y)
    return model

In [14]:
def logistic_regression(columns):
    """Train on ``data_train``, predict ``data_test`` and score the result.

    Side effect: the predictions are stored in ``data_test["target"]``.

    Args:
        columns: Names of the feature columns to use.

    Returns:
        Quadratic-weighted Cohen's kappa between the predictions and the
        "Response" column of ``data_test``.
    """
    fitted = regression_model(data_train, columns)
    test_features = scaler.transform(pd.DataFrame(data_test, columns=columns))
    data_test["target"] = fitted.predict(test_features)
    return cohen_kappa_score(
        data_test["target"],
        data_test["Response"],
        weights="quadratic",
    )

In [15]:
# Evaluate the model on all selected feature columns and report the
# quadratic-weighted kappa rounded to three decimals.
kappa = logistic_regression(columns)
print(f'Логистическая регрессия: {round(kappa, 3)}')

Логистическая регрессия: 0.521


## Матрица неточностей

In [16]:
# Rows are the predicted classes ("target"), columns are the actual
# "Response" values.
# NOTE(review): sklearn's confusion_matrix signature is (y_true, y_pred);
# the arguments here are in the opposite order, so this is the transpose of
# the conventional layout. Swap them if rows should be the true labels.
predicted_vs_actual = confusion_matrix(data_test["target"], data_test["Response"])
print(predicted_vs_actual)

[[ 357  230    4    4   86  188  116   75]
 [ 168  290    9    1  128  193   80   58]
 [ 146  139  114   75  104  244   42   19]
 [  70   65   43  170   29  340   67  143]
 [ 145  253   19    1  461  261  155  133]
 [  55   72    7   15   49  286  115  122]
 [ 127  124    3    6  110  347  661  352]
 [ 160  158   12   25  115  316  411 3004]]
