# Наивный Байес

1. Применить наивный Байес для классификации скоринга. Использовать все столбцы.
2. Проверить качество через каппа-метрику и матрицу неточностей.

## Подключение библиотек

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
data = pd.read_csv("../data/prudential/train.csv.gz")
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


## Категоризация данных

In [3]:
data["Product_Info_2_1"] = data["Product_Info_2"].str.slice(0, 1)
data["Product_Info_2_2"] = pd.to_numeric(data["Product_Info_2"].str.slice(1, 2))
data.drop(labels=["Product_Info_2"], axis=1, inplace=True)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 129 entries, Id to Product_Info_2_2
dtypes: float64(18), int64(110), object(1)
memory usage: 58.4+ MB
None


## Оптимизация памяти

In [4]:
data = reduce_mem_usage(data)
print(data.info())

Потребление памяти меньше на - 49.89 Мб (минус 85.4%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 129 entries, Id to Product_Info_2_2
dtypes: category(1), float16(18), int16(1), int32(1), int8(108)
memory usage: 8.6 MB
None


## Предобработка

In [5]:
list_df = [data]
for un in data["Product_Info_2_1"].unique():
    s = pd.DataFrame()
    s["Product_Info_2_1_" + un] = data["Product_Info_2_1"].isin([un]).astype("int8")
    list_df.append(s)
data = pd.concat(list_df, axis=1)
data.drop(labels=["Product_Info_2_1"], axis=1, inplace=True)

## Заполнить отсутствующие значения
-1 увеличивает расстояние при расчёте ближайших соседей

In [6]:
data.fillna(value=-1, inplace=True)

## Столбцы для модели

In [7]:
columns_groups = [
    "Insurance_History",
    "InsuredInfo",
    "Medical_Keyword",
    "Family_Hist",
    "Medical_History",
    "Product_Info",
]
columns = [
    "Wt",
    "Ht",
    "Ins_Age",
    "BMI",
]

for cg in columns_groups:
    columns.extend(data.columns[data.columns.str.startswith(cg)])
print(columns)

['Wt', 'Ht', 'Ins_Age', 'BMI', 'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_5', 'Insurance_History_7', 'Insurance_History_8', 'Insurance_History_9', 'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7', 'Medical_Keyword_1', 'Medical_Keyword_2', 'Medical_Keyword_3', 'Medical_Keyword_4', 'Medical_Keyword_5', 'Medical_Keyword_6', 'Medical_Keyword_7', 'Medical_Keyword_8', 'Medical_Keyword_9', 'Medical_Keyword_10', 'Medical_Keyword_11', 'Medical_Keyword_12', 'Medical_Keyword_13', 'Medical_Keyword_14', 'Medical_Keyword_15', 'Medical_Keyword_16', 'Medical_Keyword_17', 'Medical_Keyword_18', 'Medical_Keyword_19', 'Medical_Keyword_20', 'Medical_Keyword_21', 'Medical_Keyword_22', 'Medical_Keyword_23', 'Medical_Keyword_24', 'Medical_Keyword_25', 'Medical_Keyword_26', 'Medical_Keyword_27', 'Medical_Keyword_28', 'Medical_Keyword_29', 'Medical_Keyword_30', 'Medical_Keywor

## Z-нормализация данных

In [8]:
scaler = preprocessing.StandardScaler()
scaler.fit(pd.DataFrame(data, columns=columns))

## Разделение данных

In [9]:
data_train, data_test = train_test_split(data, test_size=0.2)
print(data_train.head())

          Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
8365   11148               1              26        0.384521               2   
4790    6385               1              26        0.384521               2   
15915  21192               1              26        0.487061               2   
50376  67093               1              26        1.000000               2   
20246  26972               1              26        0.076904               2   

       Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
8365                3               1  0.373047  0.708984  0.257324  ...   
4790                3               1  0.238770  0.781738  0.351562  ...   
15915               3               1  0.641602  0.654785  0.225952  ...   
50376               3               1  0.298584  0.727051  0.257324  ...   
20246               3               1  0.597168  0.799805  0.376465  ...   

       Medical_Keyword_46  Medical_Keyword_47  Medical_Keyword

## Расчет модели наивного Байеса

$$ P(A|B) = \dfrac{P(B|A)*P(A)}{P(B)} $$

In [10]:
y = data_train["Response"]
x = scaler.transform(pd.DataFrame(data_train, columns=columns))

In [11]:
bayes = GaussianNB()
bayes.fit(x, y)

## Предсказание данных

In [12]:
data_test = pd.DataFrame(data_test)
x_test = scaler.transform(pd.DataFrame(data_test, columns=columns))
data_test["target"] = bayes.predict(x_test)
print(data_test.head())

          Id  Product_Info_1  Product_Info_3  Product_Info_4  Product_Info_5  \
57656  76821               1              31        0.076904               2   
28952  38539               1              26        1.000000               2   
10202  13560               1              26        0.743652               2   
1576    2116               1              26        0.128174               2   
41589  55260               1              10        0.025635               2   

       Product_Info_6  Product_Info_7   Ins_Age        Ht        Wt  ...  \
57656               1               1  0.000000  0.763672  0.236450  ...   
28952               3               1  0.149292  0.727051  0.200806  ...   
10202               3               1  0.164185  0.727051  0.299072  ...   
1576                3               1  0.089539  0.763672  0.278320  ...   
41589               3               1  0.447754  0.563477  0.215454  ...   

       Medical_Keyword_47  Medical_Keyword_48  Response  Produ

## Оценка модели

In [13]:
print(f'Байес: {cohen_kappa_score(data_test["target"], data_test["Response"], weights="quadratic")}')

Байес: 0.36771835938131736


## Матрица неточностей

In [14]:
print(confusion_matrix(data_test["target"], data_test["Response"]))

[[ 284  180   14    8   89  190   79  119]
 [  89  164    7    7   62   78   36   48]
 [ 139  150   15    7   57  102   20   38]
 [ 214  255  104  175  225  698  320  595]
 [  39   44    2    0   25   23    6    7]
 [  18   30    1    1    8   28    0    1]
 [ 299  380   22   38  348  689  616  547]
 [ 157  167   22   74  318  433  513 2453]]
