In [None]:
import pandas as pd
# Red-wine quality table from the UCI repository; the file is ';'-delimited.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# Read the CSV and discard any row containing a missing value in one step.
wine_data = pd.read_csv(url, sep=";").dropna()

# Preview the first six rows.
wine_data.head(6)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is 'quality'.
X = wine_data.drop(columns='quality')
y = wine_data['quality']

# First hold out 30% of the rows as the test split ...
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# ... then carve 20% of the remaining training rows off as a validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=21,
)

# Without scaling

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Baseline: k-NN with default hyperparameters on the raw (unscaled) features,
# scored on the test split.
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Standard scaler

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both the training and the test features.
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)
X_train_standard = standard_scaler.transform(X_train)
X_test_standard = standard_scaler.transform(X_test)

# k-NN with default hyperparameters on the standardised features.
knn_standard = KNeighborsClassifier().fit(X_train_standard, y_train)
y_pred_standard = knn_standard.predict(X_test_standard)
accuracy_standard = accuracy_score(y_true=y_test, y_pred=y_pred_standard)

# Min-max scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both the training and the test features.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)
X_train_min_max = min_max_scaler.transform(X_train)
X_test_min_max = min_max_scaler.transform(X_test)

# k-NN with default hyperparameters on the min-max scaled features.
knn_min_max = KNeighborsClassifier().fit(X_train_min_max, y_train)
y_pred_min_max = knn_min_max.predict(X_test_min_max)
accuracy_min_max = accuracy_score(y_true=y_test, y_pred=y_pred_min_max)

In [None]:
# Report the test accuracy of the three preprocessing variants, one per line.
print(
    f'without any scaling the accuracy is {accuracy}',
    f'with standart scaler the accuracy is {accuracy_standard}',
    f'with min-max scaler the accuracy is {accuracy_min_max}',
    sep='\n',
)

without any scaling the accuracy is 0.5020833333333333
with standart scaler the accuracy is 0.55
with min-max scaler the accuracy is 0.5645833333333333


# Validation

In [None]:
import numpy as np
# Scale the validation split with the min-max scaler already fitted on X_train.
X_val_min_max = min_max_scaler.transform(X_val)

# Sweep odd neighbour counts 1, 3, ..., 99 and report validation accuracy.
# The model must be trained on features in the SAME scaling as the validation
# features, i.e. the min-max scaled training matrix (not the standardised one).
# Results are kept in validation-specific names so the standard-scaler test
# predictions/accuracy computed in an earlier cell are not overwritten.
for k in np.arange(1,101,2):
  knn_min_max = KNeighborsClassifier(n_neighbors=k)
  knn_min_max.fit(X_train_min_max, y_train)
  y_pred_val = knn_min_max.predict(X_val_min_max)
  accuracy_val = accuracy_score(y_val, y_pred_val)
  print(f'{k}={accuracy_val}')

1=0.25892857142857145
3=0.35714285714285715
5=0.44642857142857145
7=0.48214285714285715
9=0.4732142857142857
11=0.47767857142857145
13=0.46875
15=0.45089285714285715
17=0.45089285714285715
19=0.46875
21=0.4732142857142857
23=0.4732142857142857
25=0.45535714285714285
27=0.4419642857142857
29=0.45982142857142855
31=0.4375
33=0.45982142857142855
35=0.44642857142857145
37=0.45982142857142855
39=0.46875
41=0.48214285714285715
43=0.4732142857142857
45=0.4642857142857143
47=0.4642857142857143
49=0.46875
51=0.45535714285714285
53=0.45982142857142855
55=0.4732142857142857
57=0.48660714285714285
59=0.5
61=0.4955357142857143
63=0.48660714285714285
65=0.5
67=0.4955357142857143
69=0.48660714285714285
71=0.48660714285714285
73=0.4955357142857143
75=0.5044642857142857
77=0.4955357142857143
79=0.49107142857142855
81=0.4955357142857143
83=0.4955357142857143
85=0.47767857142857145
87=0.5044642857142857
89=0.5133928571428571
91=0.5089285714285714
93=0.5133928571428571
95=0.5535714285714286
97=0.526785714

# Final test error

In [None]:
# Final model: min-max scaled features with a fixed neighbour count.
# NOTE(review): n_neighbors=95 is hard-coded from the validation sweep;
# re-check it whenever that sweep is re-run.
knn_min_max = KNeighborsClassifier(n_neighbors=95).fit(X_train_min_max, y_train)
y_pred_min_max = knn_min_max.predict(X_test_min_max)
accuracy_min_max = accuracy_score(y_true=y_test, y_pred=y_pred_min_max)
print(accuracy_min_max)

0.5541666666666667
