# Метод ближайших соседей

1. Разделить выборку 80/20
2. Применить метод ближайших соседей для классификации скоринга.
3. Проверить качество предсказания через каппа-метрику и матрицу неточностей.

## Подключить библиотеки

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

## Загрузка данных

In [2]:
data = pd.read_csv("../data/prudential/train.csv.gz")
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59381 entries, 0 to 59380
Columns: 128 entries, Id to Response
dtypes: float64(18), int64(109), object(1)
memory usage: 58.0+ MB
None


## Разделить данные

In [3]:
data_train, data_test = train_test_split(data, test_size=0.2)
print(data_train.head())

          Id  Product_Info_1 Product_Info_2  Product_Info_3  Product_Info_4  \
54888  73099               1             D3              26        0.487179   
43935  58426               1             A8              26        0.179487   
22339  29750               1             D3              26        0.076923   
23296  31062               2             D4              26        0.641026   
12420  16513               1             D4              26        0.230769   

       Product_Info_5  Product_Info_6  Product_Info_7   Ins_Age        Ht  \
54888               2               1               1  0.208955  0.818182   
43935               2               3               1  0.641791  0.727273   
22339               2               3               1  0.417910  0.600000   
23296               2               3               1  0.208955  0.672727   
12420               2               3               1  0.477612  0.727273   

       ...  Medical_Keyword_40  Medical_Keyword_41  Medical_Ke

## Расчет модели kNN

Вычислить не центры(кластеры) исходных групп, а расстояния до всех значений. Выбираем то значение, кот. превалирует у k ближайших соседей.

Для оценки качества возьмем **k** равным **10**, **100**, **1000**, **10000**

In [23]:
columns = [
    "Wt",
    "Ht",
    "Ins_Age",
    "BMI",
]
max_nn = data_train.groupby("Response").count()["Id"].min()
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_100 = KNeighborsClassifier(n_neighbors=100)
knn_1000 = KNeighborsClassifier(n_neighbors=1000)
knn_10000 = KNeighborsClassifier(n_neighbors=10000)
knn_max = KNeighborsClassifier(n_neighbors=max_nn)

In [24]:
y = data_train["Response"]
x = pd.DataFrame(data_train, columns=columns)
knn_10.fit(x, y)
knn_100.fit(x, y)
knn_1000.fit(x, y)
knn_10000.fit(x, y)
knn_max.fit(x, y)

## Предсказание данных

In [26]:
x_test = pd.DataFrame(data_test, columns=columns)
data_test["target_10"] = knn_10.predict(x_test)
data_test["target_100"] = knn_100.predict(x_test)
data_test["target_1000"] = knn_1000.predict(x_test)
data_test["target_10000"] = knn_10000.predict(x_test)
data_test["target_max"] = knn_max.predict(x_test)
print(data_test.head(20))

          Id  Product_Info_1 Product_Info_2  Product_Info_3  Product_Info_4  \
57099  76069               1             D4              10        0.487179   
49739  66270               1             A2              10        0.102564   
24768  33008               1             D4              26        1.000000   
25788  34345               1             A8              10        0.230769   
25045  33360               1             D1              26        0.076923   
5763    7669               1             D3              26        1.000000   
3626    4835               1             D3              26        0.282051   
12666  16822               1             D3              26        0.282051   
47776  63631               1             D3              10        0.230769   
47488  63229               1             D3              26        0.230769   
16271  21654               1             D3              26        0.487179   
12628  16774               1             B2         

## Оценка модели

In [None]:
print(f'kNN, 10: {cohen_kappa_score(data_test["target_10"], data_test["Response"], weights="quadratic")}')
print(f'kNN, 100: {cohen_kappa_score(data_test["target_100"], data_test["Response"], weights="quadratic")}')
print(f'kNN, 1000: {cohen_kappa_score(data_test["target_1000"], data_test["Response"], weights="quadratic")}')
print(f'kNN, 10000: {cohen_kappa_score(data_test["target_10000"], data_test["Response"], weights="quadratic")}')
print(f'kNN, max: {cohen_kappa_score(data_test["target_max"], data_test["Response"], weights="quadratic")}')

kNN, 10: 0.29959945217641726
kNN, 100: 0.30075204263323985
kNN, 1000: 0.2779927179447802
kNN, 10000: 0.1477187823024796
kNN, max: 0.28051452752111206


## Матрицы неточностей

In [28]:
print(confusion_matrix(data_test["target_10"], data_test["Response"]))

[[ 194  174   11    8  116  194  140  145]
 [ 154  273   12   10  162  138   96   93]
 [   0    2    1    0    3    1    0    1]
 [   2    1    0    0    1    2    2    6]
 [  92  132   39    3  354  156   34   37]
 [ 285  296   45   46  227  679  414  427]
 [ 132  121   23   37   66  285  243  265]
 [ 370  350   70  191  185  763  692 2876]]


In [29]:
print(confusion_matrix(data_test["target_10000"], data_test["Response"]))

[[   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0]
 [ 555  691   85   31  712  880  451  174]
 [   0    0    0    0    0    0    0    0]
 [ 674  658  116  264  402 1338 1170 3676]]
