In [1]:
# import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# 1. Load the raw CSV and discard the exported "Unnamed: 0" column
sparklingwine_data = (
    pd.read_csv('sparklingwine.csv')
    .drop(columns=["Unnamed: 0"])
)
sparklingwine_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,5.4,0.29,0.38,1.2,0.029,31.0,132.0,0.98895,3.28,0.36,12.40,6
1,6.7,0.24,0.29,14.9,0.053,55.0,136.0,0.99839,3.03,0.52,9.00,5
2,6.8,0.33,0.31,7.4,0.045,34.0,143.0,0.99226,3.06,0.55,12.20,6
3,6.4,0.27,0.19,2.0,0.084,21.0,191.0,0.99516,3.49,0.63,9.60,4
4,6.1,0.30,0.30,2.1,0.031,50.0,163.0,0.98950,3.39,0.43,12.70,7
...,...,...,...,...,...,...,...,...,...,...,...,...
1595,7.1,0.33,0.64,13.2,0.056,12.0,105.0,0.99720,3.05,0.39,9.20,5
1596,7.8,0.21,0.39,1.8,0.034,62.0,180.0,0.99100,3.09,0.75,12.60,8
1597,6.4,0.42,0.19,9.3,0.043,28.0,145.0,0.99433,3.23,0.53,10.98,5
1598,8.4,0.35,0.71,12.2,0.046,22.0,160.0,0.99820,2.98,0.65,9.40,5


Note that the first column (`Unnamed: 0`) merely duplicated the built-in pandas index, so we dropped it.

In [3]:
# 2. Derive the binary target column: 1 when quality is at least 6, otherwise 0
sparklingwine_data["good wine"] = sparklingwine_data["quality"].ge(6).astype("int64")

In [5]:
# Display only the rows labelled as good wine
sparklingwine_data.loc[sparklingwine_data["good wine"].eq(1)]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,good wine
0,5.4,0.29,0.38,1.20,0.029,31.0,132.0,0.98895,3.28,0.36,12.4,6,1
2,6.8,0.33,0.31,7.40,0.045,34.0,143.0,0.99226,3.06,0.55,12.2,6,1
4,6.1,0.30,0.30,2.10,0.031,50.0,163.0,0.98950,3.39,0.43,12.7,7,1
5,6.2,0.10,0.41,1.00,0.040,17.0,76.0,0.98988,3.14,0.56,11.4,7,1
6,7.3,0.20,0.37,1.20,0.037,48.0,119.0,0.99200,3.32,0.49,10.9,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1591,5.8,0.13,0.22,12.70,0.058,24.0,183.0,0.99560,3.32,0.42,11.7,6,1
1592,6.9,0.35,0.39,2.40,0.048,25.0,157.0,0.99133,3.20,0.54,11.1,7,1
1593,7.8,0.30,0.36,4.60,0.024,20.0,198.0,0.99222,3.06,0.66,11.9,6,1
1594,7.0,0.26,0.46,15.55,0.037,61.0,171.0,0.99860,2.94,0.35,8.8,6,1


In [6]:
# 3. Positional split of the frame: rows before TRAIN_END train the model,
#    rows up to VAL_END are used for validation, everything after is the test set
TRAIN_END, VAL_END = 900, 1200
train_data = sparklingwine_data.iloc[:TRAIN_END]
validation_data = sparklingwine_data.iloc[TRAIN_END:VAL_END]
test_data = sparklingwine_data.iloc[VAL_END:]

# Features are every column except the raw score and the label derived from it
NON_FEATURE_COLUMNS = ["quality", "good wine"]

X_train = train_data.drop(columns=NON_FEATURE_COLUMNS)
y_train = train_data["good wine"]

X_val = validation_data.drop(columns=NON_FEATURE_COLUMNS)
y_val = validation_data["good wine"]

X_test = test_data.drop(columns=NON_FEATURE_COLUMNS)
y_test = test_data["good wine"]

In [9]:
# 4. Z-score normalisation: the scaler is fitted on the training features only
#    and then applied unchanged to all three partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_val, X_test)
)

In [35]:
# 5. Train k-NN for k = 1..100 and keep the k with the best validation accuracy
best_k = 0
best_model = 0  # best validation score seen so far (a metric value, not an estimator)

for k in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    y_val_pred = knn.predict(X_val_scaled)

    # Pin the label order so ravel() always yields exactly (tn, fp, fn, tp),
    # even if one class is missing from both y_val and the predictions.
    cm = confusion_matrix(y_val, y_val_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    metric = (tp + tn) / (tp + tn + fp + fn)  # accuracy
    # Alternative selection metrics, kept for reference:
    #   specificity: tn / (tn + fp)
    #   precision:   tp / (tp + fp)
    #   recall:      tp / (tp + fn)
    #   f1 score:    2 * tp / (2 * tp + fp + fn)

    # k is visited in ascending order, so a strict ">" already keeps the
    # smallest k among tied scores; a separate tie-break can never fire.
    if metric > best_model:
        best_model = metric
        best_k = k

Note that the selection loop above uses validation accuracy as its evaluating metric. Specificity is a natural alternative, since we don't want low-quality wines to be predicted as high-quality.
Also, in case of a tie in the best score we choose the smaller k, since it makes the classifier more responsive to local patterns in the data.

In [26]:
# 6. Refit k-NN with the selected k and report its performance on the validation set
best_knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_train)
y_val_pred = best_knn.predict(X_val_scaled)

val_report = classification_report(y_val, y_val_pred)
print("Best k:", best_k)
print("Classification Report:")
print(val_report)


Best k: 17
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.52      0.58        97
           1       0.79      0.87      0.83       203

    accuracy                           0.76       300
   macro avg       0.72      0.69      0.70       300
weighted avg       0.75      0.76      0.75       300



In [28]:
# 7. Evaluate the selected classifier on the test set
best_knn.fit(X_train_scaled, y_train)  # Re-train on the training set
y_test_pred = best_knn.predict(X_test_scaled)

# Pin the label order so ravel() always unpacks to (tn, fp, fn, tp).
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
tn_test, fp_test, fn_test, tp_test = cm_test.ravel()
test_specificity = tn_test / (tn_test + fp_test)

# print best k
print("Best k:", best_k)
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
# Specificity is the recall of class 0 in the report above; print it unrounded.
print("Test specificity:", test_specificity)


Best k: 17
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.51      0.59       132
           1       0.79      0.89      0.84       268

    accuracy                           0.77       400
   macro avg       0.74      0.70      0.71       400
weighted avg       0.76      0.77      0.75       400



In [34]:
# 8. Repeat the experiment with a different split:
#    first 400 rows for training, next 400 for validation, the remainder for testing
train_data_new = sparklingwine_data.iloc[:400]
validation_data_new = sparklingwine_data.iloc[400:800]
test_data_new = sparklingwine_data.iloc[800:]


# Separate features and labels
X_train_new, y_train_new = train_data_new.drop(columns=["quality", "good wine"]), train_data_new["good wine"]
X_val_new, y_val_new = validation_data_new.drop(columns=["quality", "good wine"]), validation_data_new["good wine"]
X_test_new, y_test_new = test_data_new.drop(columns=["quality", "good wine"]), test_data_new["good wine"]


# Normalize the data using Z-score transformation (fit only on new training data).
# A dedicated scaler is used so the `scaler` fitted for the original split is
# not silently refitted by this cell.
scaler_new = StandardScaler().fit(X_train_new)
X_train_scaled_new = scaler_new.transform(X_train_new)
X_val_scaled_new = scaler_new.transform(X_val_new)
X_test_scaled_new = scaler_new.transform(X_test_new)


# Train k-NN classifiers for k = 1 to 100 and evaluate on the validation set
best_k_new = 0
best_model_new = 0  # best validation score seen so far (a metric value, not an estimator)

for k in range(1, 101):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled_new, y_train_new)
    y_val_pred_new = knn.predict(X_val_scaled_new)

    # Pin the label order so ravel() always yields exactly (tn, fp, fn, tp).
    cm_new = confusion_matrix(y_val_new, y_val_pred_new, labels=[0, 1])
    tn_new, fp_new, fn_new, tp_new = cm_new.ravel()

    metric_new = (tp_new + tn_new) / (tp_new + tn_new + fp_new + fn_new)  # accuracy
    # Alternative selection metrics, kept for reference:
    #   specificity: tn_new / (tn_new + fp_new)
    #   precision:   tp_new / (tp_new + fp_new)
    #   recall:      tp_new / (tp_new + fn_new)
    #   f1 score:    2 * tp_new / (2 * tp_new + fp_new + fn_new)

    # k is visited in ascending order, so a strict ">" already keeps the
    # smallest k among tied scores; a separate tie-break can never fire.
    if metric_new > best_model_new:
        best_model_new = metric_new
        best_k_new = k

# Evaluate the selected classifier on the test set
best_knn_new = KNeighborsClassifier(n_neighbors=best_k_new)
best_knn_new.fit(X_train_scaled_new, y_train_new)
y_test_pred_new = best_knn_new.predict(X_test_scaled_new)


# print best k
print("Best k:", best_k_new)
print("Classification Report:")
print(classification_report(y_test_new, y_test_pred_new))


Best k: 5
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.58      0.60       259
           1       0.81      0.83      0.82       541

    accuracy                           0.75       800
   macro avg       0.71      0.70      0.71       800
weighted avg       0.74      0.75      0.75       800

