# KNN zaliczenie

In [3]:
import random
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
from sklearn.metrics import mean_absolute_error, log_loss, accuracy_score, balanced_accuracy_score
import random
import matplotlib.pyplot as plt

## Description

Car Evaluation Database was derived from a simple hierarchical decision model originally developed for the demonstration of DEX, M. Bohanec, V. Rajkovic: Expert system for decision making. Sistemica 1(1), pp. 145-157, 1990.). The model evaluates cars according to the following concept structure:

CAR car acceptability
. PRICE overall price
. . buying buying price
. . maint price of the maintenance
. TECH technical characteristics
. . COMFORT comfort
. . . doors number of doors
. . . persons capacity in terms of persons to carry
. . . lug_boot the size of luggage boot
. . safety estimated safety of the car 

In [53]:
df = pd.read_csv('files/car.data')

In [54]:
df.columns=['buying','maint' ,'doors','persons','lug_boot','safety','classes']

In [55]:
pd.unique(df.classes)

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

Nadawanie wartości liczbowych

w kolumnie buying vhigh=4, high=3, med=2, low=1\
w kolumnie maint vhigh=4, high=3, med=2, low=1\
w kolumnie doors 2=2, 3=3, 4=4, 5-more=5\
w kolumnie persons 2-2, 4=4, more=3\
w kolumnie lug_boot small=1, med=2, big=3\
w kolumnie safety low=1, med=2, high=3\
w kolumnie classes unacc=1, acc=2, good=3, vgood=4\


In [56]:
df['buying'] = df['buying'].map({'vhigh': 4, 'high': 3, 'med':2, 'low':1})
df['maint'] = df['maint'].map({'vhigh': 4, 'high': 3, 'med':2, 'low':1})
df['doors'] = df['doors'].map({'2': 2, '3': 3,'4':4, '5more':3})
df['persons'] = df['persons'].map({'2': 2, '4':4, 'more':3})
df['lug_boot'] = df['lug_boot'].map({'small': 1, 'med': 2,'big':3})
df['safety'] = df['safety'].map({'low': 1, 'med': 2,'high':3})
df['classes'] = df['classes'].map({'unacc': 1, 'acc': 2,'good':3, 'vgood':4})

In [57]:
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,4,4,2,2,1,2,1
1,4,4,2,2,1,3,1
2,4,4,2,2,2,1,1
3,4,4,2,2,2,2,1
4,4,4,2,2,2,3,1
...,...,...,...,...,...,...,...
1722,1,1,3,3,2,2,3
1723,1,1,3,3,2,3,4
1724,1,1,3,3,3,1,1
1725,1,1,3,3,3,2,3


In [64]:
df.iloc[1722]

buying      1
maint       1
doors       3
persons     3
lug_boot    2
safety      2
classes     3
Name: 1722, dtype: int64

###  Tu są wartości kolumny klasy , do której przyporządkowuje się jakiś obiekt - rekord czyli zakupiony samochód

In [65]:
values=df.classes

In [66]:
values

0       1
1       1
2       1
3       1
4       1
       ..
1722    3
1723    4
1724    1
1725    3
1726    4
Name: classes, Length: 1727, dtype: int64

Usuwam ostatnią kolumnę, gdzie jest wartość klasyfikatora.

In [67]:
df= df.drop(df.columns[[-1]], axis=1) 

In [68]:
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,4,4,2,2,1,2
1,4,4,2,2,1,3
2,4,4,2,2,2,1
3,4,4,2,2,2,2
4,4,4,2,2,2,3
...,...,...,...,...,...,...
1722,1,1,3,3,2,2
1723,1,1,3,3,2,3
1724,1,1,3,3,3,1
1725,1,1,3,3,3,2


## Normalizacja

In [69]:
df = (df - df.mean())/(df.max()-df.min())

In [70]:
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,0.50029,0.50029,-0.50029,-0.50029,-0.50029,-0.00029
1,0.50029,0.50029,-0.50029,-0.50029,-0.50029,0.49971
2,0.50029,0.50029,-0.50029,-0.50029,-0.00029,-0.50029
3,0.50029,0.50029,-0.50029,-0.50029,-0.00029,-0.00029
4,0.50029,0.50029,-0.50029,-0.50029,-0.00029,0.49971
...,...,...,...,...,...,...
1722,-0.49971,-0.49971,-0.00029,-0.00029,-0.00029,-0.00029
1723,-0.49971,-0.49971,-0.00029,-0.00029,-0.00029,0.49971
1724,-0.49971,-0.49971,-0.00029,-0.00029,0.49971,-0.50029
1725,-0.49971,-0.49971,-0.00029,-0.00029,0.49971,-0.00029


# Tworzenie modelu

##  Podział danych
trening 80% test 20%

In [71]:
test_rows = random.sample(df.index.tolist(), int(round(len(df)*.2)))
train_rows=set(range(len(df))) - set(test_rows)

In [72]:
df_test = df.loc[test_rows]
df_train = df.loc[train_rows]

test_values = values.loc[test_rows]
train_values = values.loc[train_rows]

  df_train = df.loc[train_rows]
  train_values = values.loc[train_rows]


## KNN implementacja manualna

In [73]:
kdtree = KDTree(df)

In [74]:
type(kdtree)

scipy.spatial._kdtree.KDTree

#### Funkcja przydzielająca predykcję


In [75]:
def predict(query_point, k):
    ''' k liczba sąsiadów'''
    _, idx = kdtree.query(query_point, k)
    return np.mean(values.iloc[idx])

In [76]:
train_predicted_values = []
train_actual_values = []

In [77]:
# Sprawdzamy zachowanie modelu dla k=5 sąsiadów
for _id, row in df_train.iterrows():
  prediction = predict(row, 5)
  train_predicted_values.append(prediction)
  train_actual_values.append(train_values[_id])

### Wyznaczanie błędu regresji za pomocą MAE

In [78]:
mae = mean_absolute_error(train_actual_values, train_predicted_values)

In [79]:
mae

0.09913169319826338

##  Przy podziale 80% 20%  jest mały błąd czyli jesteśmy zadowoleni :)

### Ilosc sąsiadow i błąd

In [80]:
# Ilosc sąsiadów
k=int(input('Podaj ilość sąsiadów k='))

number_neigbours = []
absolute_error = []
for i in range(1,k+1):
    train_predicted_values = []
    train_actual_values = []
    number_neigbours.append(i)
    for _id, row in df_train.iterrows():
        prediction = predict(row, i)
        train_predicted_values.append(prediction)
        train_actual_values.append(train_values[_id])
    mae = mean_absolute_error(train_actual_values, train_predicted_values)
    absolute_error.append(mae)
    

    

Podaj ilość sąsiadów k=5


In [81]:
number_neigbours

[1, 2, 3, 4, 5]

In [82]:
for i in range(len(absolute_error)):
    print(f'Dla {i+1} sąsiadów błąd wynosi {absolute_error[i]}') 

Dla 1 sąsiadów błąd wynosi 0.01085383502170767
Dla 2 sąsiadów błąd wynosi 0.030390738060781478
Dla 3 sąsiadów błąd wynosi 0.06512301013024602
Dla 4 sąsiadów błąd wynosi 0.09460926193921852
Dla 5 sąsiadów błąd wynosi 0.09913169319826338


## Klasyfikacja przy użyciu metody KNN

In [83]:
def classify(query_point, k):
    _, idx = kdtree.query(query_point, k)
    return np.argmax(np.bincount(values.iloc[idx]))

###  Tu przykładowo sklasyfikowany element o indeksie 1722 z "5" sąsiadami

In [84]:
classify(df.iloc[1722],5)

3