# KNN Classifier

**Import Lib**

In [248]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from pandas.plotting import parallel_coordinates
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

**Load the dataset**

In [249]:
data = pd.read_csv("./removed_outlier.csv")

**Overview**

In [250]:
data.head()

Unnamed: 0.1,Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [251]:
data.value_counts(['quality'])

quality
6          1860
5          1101
7           792
8           145
4            88
3             8
9             4
Name: count, dtype: int64

In [252]:


# Separate the features (X) and the target (y)
X = data.drop('quality', axis=1)
y = data['quality']

# Apply the SelectKBest feature selection method with chi-squared test as an example
k = 5  # You can adjust this value based on your dataset and problem
best_features = SelectKBest(score_func=chi2, k=k)
fit = best_features.fit(X, y)

# Get the scores and feature names
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
feature_scores = feature_scores.sort_values(by='Score', ascending=False)
print(feature_scores)

# Select the top k features
selected_features = feature_scores.head(k)['Feature'].tolist()
X_selected = X[selected_features]


                 Feature         Score
0             Unnamed: 0  20604.211731
7   total sulfur dioxide   2762.772314
6    free sulfur dioxide    642.532669
4         residual sugar    397.917432
11               alcohol    113.269678
2       volatile acidity      2.649899
1          fixed acidity      1.189336
5              chlorides      0.848352
10             sulphates      0.272781
9                     pH      0.173861
3            citric acid      0.099409
8                density      0.003784


In [253]:
# data = data[['residual sugar', 'alcohol','density','chlorides', 'quality']]
data.value_counts([ 'quality'])

quality
6          1860
5          1101
7           792
8           145
4            88
3             8
9             4
Name: count, dtype: int64

**Split Data**

In [254]:

# Define features and target
X = data.drop(columns=["quality"])
y = data["quality"]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)


In [255]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [256]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.068985,-0.146046,0.226083,0.172066,0.506056,0.567234,-0.186112,-0.636604,0.232655,0.448504,0.5862,0.350895
1,-0.741965,-0.97355,-0.570042,1.910482,-0.978279,0.869553,-1.112885,-1.218202,-0.368717,0.807766,-0.224248,-0.803672
2,-1.745149,0.267705,0.093396,0.403855,2.872968,0.264915,0.674463,0.793158,2.449138,-1.347804,-0.325554,-1.463425
3,0.692262,-0.008129,-1.233479,0.172066,-0.877986,-0.23895,0.211076,-0.636604,-1.330911,0.376652,-0.325554,1.917808
4,-0.975477,-0.421881,0.358771,-0.870984,0.847051,-0.339723,-0.186112,-0.418505,0.799662,0.879618,-1.439921,-0.391327


**Train And Evaluate Model**

In [257]:


# Initialize and train the model
model = KNeighborsClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.57375


**Train And Evaluate Model Using Repeated hold-out**

In [258]:
# n_repeats = 10
# test_size = 0.3
# # Define features and target
# X = data.drop(columns=["quality"])
# y = data["quality"]
# # Initialize a list to store performance metrics
# accuracies = []
# for _ in range(n_repeats):
#     # Split the data into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,stratify=y, random_state=None)
    
#     # Create and train your machine learning model
#     model = RandomForestClassifier()
#     model.fit(X_train, y_train)
    
#     # Make predictions on the test set
#     y_pred = model.predict(X_test)
    
#     # Calculate and store the accuracy
#     accuracy = accuracy_score(y_test, y_pred)
#     accuracies.append(accuracy)
#     print(f"Accuracy: {accuracy:.2f}")

# # Calculate and print the average accuracy and its standard deviation
# avg_accuracy = sum(accuracies) / n_repeats
# std_accuracy = np.std(accuracies)
# print(f"Average Accuracy: {avg_accuracy:.2f}")
# print(f"Standard Deviation: {std_accuracy:.2f}")

