In [3]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [1]:
!pip install gdown
!gdown 1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2

Downloading...
From: https://drive.google.com/uc?id=1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2
To: /content/multiclass.csv
100% 14.6k/14.6k [00:00<00:00, 52.3MB/s]


In [5]:
# Load the downloaded CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="multiclass.csv")


In [6]:
# Quick orientation: dimensions first, then the leading rows.
dataset_shape = df.shape
print("Shape of dataset:", dataset_shape)
first_rows = df.head()
print("First 5 rows:\n", first_rows)

Shape of dataset: (440, 8)
First 5 rows:
    Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen  class
0       3  12669  9656     7561     214              2674        1338      2
1       3   7057  9810     9568    1762              3293        1776      2
2       3   6353  8808     7684    2405              3516        7844      2
3       3  13265  1196     4221    6404               507        1788      1
4       3  22615  5410     7198    3915              1777        5185      1


In [7]:
# Per-column count of null entries.
missing_counts = df.isna().sum()
print("\nMissing values:\n", missing_counts)


Missing values:
 Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
class               0
dtype: int64


In [8]:
# Number of rows that exactly repeat an earlier row.
n_duplicates = df.duplicated().sum()
print("\nDuplicate rows:", n_duplicates)


Duplicate rows: 0


In [9]:
# How many samples fall into each target class.
class_counts = df['class'].value_counts()
print("\nClass distribution:\n", class_counts)



Class distribution:
 class
2    180
3    173
1     87
Name: count, dtype: int64


In [14]:
# Separate the target column from the feature columns.
y = df["class"]
X = df.drop(columns="class")

In [15]:
# Standardize the features; X is rebound from a DataFrame to the scaled NumPy array.
# NOTE(review): the scaler is fit on the full dataset, before the train/test split
# performed in the next cell, so test-set statistics influence the training features.
# Consider splitting first and fitting the scaler on the training portion only.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
print(X)

[[ 0.59066829  0.05293319  0.52356777 ... -0.58936716 -0.04356873
  -0.06633906]
 [ 0.59066829 -0.39130197  0.54445767 ... -0.27013618  0.08640684
   0.08915105]
 [ 0.59066829 -0.44702926  0.40853771 ... -0.13753572  0.13323164
   2.24329255]
 ...
 [ 0.59066829  0.20032554  1.31467078 ... -0.54337975  2.51121768
   0.12145607]
 [ 0.59066829 -0.13538389 -0.51753572 ... -0.41944059 -0.56977032
   0.21304614]
 [ 0.59066829 -0.72930698 -0.5559243  ... -0.62009417 -0.50488752
  -0.52286938]]


In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [17]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinates of the two points. Anything ``np.asarray`` can turn into a
        numeric array is accepted (NumPy arrays, lists, tuples). The two inputs
        must have broadcast-compatible shapes.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Convert to float64 up front: this lets plain Python sequences through
    # (list - list is a TypeError) and prevents wrap-around when the inputs
    # are unsigned or narrow integer arrays.
    a = np.asarray(x1, dtype=np.float64)
    b = np.asarray(x2, dtype=np.float64)
    diff = a - b
    return np.sqrt(np.sum(diff * diff))

In [18]:
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Training only stores the data. Prediction computes the distance from each
    query row to every stored row and takes a majority vote among the ``k``
    closest labels. When several labels tie for the highest count, the label
    of the nearest neighbour among the tied ones wins.
    """

    def __init__(self, k=5):
        """Create the classifier.

        Parameters
        ----------
        k : int, default 5
            Number of neighbours that vote. Must be a positive integer.

        Raises
        ------
        ValueError
            If ``k`` is not an integer >= 1.
        """
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = int(k)

    def fit(self, X, y):
        """Store the training samples and their labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array, DataFrame, nested list, ...).
        y : array-like of shape (n_samples,)
            Training labels (NumPy array, Series, list, ...).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D, the sample counts differ,
            or there are no samples.
        """
        # np.asarray makes DataFrame/Series/list inputs behave like arrays;
        # iterating a DataFrame directly would yield column names, not rows,
        # and a Series would be indexed by label rather than by position.
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got ndim={X.ndim}")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-D, got ndim={y.ndim}")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y disagree on sample count: {X.shape[0]} vs {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Query samples; the feature count must match the training data.

        Returns
        -------
        numpy.ndarray
            One predicted label per query row (empty array for empty input).

        Raises
        ------
        ValueError
            If ``X`` is non-empty but not 2-D, or its feature count differs
            from the training data.
        """
        X = np.asarray(X, dtype=np.float64)
        if X.size == 0:
            return np.array([])
        # Reject 1-D input explicitly: a bare feature vector would otherwise be
        # iterated scalar by scalar and broadcast against the training rows.
        if X.ndim != 2 or X.shape[1] != self.X_train.shape[1]:
            raise ValueError(
                "X must be 2-D with "
                f"{self.X_train.shape[1]} feature(s), got shape {X.shape}"
            )
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Return the majority label among the ``k`` training rows nearest to ``x``."""
        # Distances to all training rows in one vectorised pass.
        deltas = self.X_train - x
        distances = np.sqrt(np.sum(deltas * deltas, axis=1))
        # Stable sort: equidistant rows keep their training order, so results
        # are deterministic. Slicing silently caps k at the training-set size.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_labels = [self.y_train[i] for i in k_indices]
        # most_common(1) resolves count ties in favour of the first label seen,
        # i.e. the nearer neighbour.
        most_common = Counter(k_labels).most_common(1)
        return most_common[0][0]


In [20]:
# Fit the from-scratch classifier on the training split, then label the held-out rows.
knn_scratch = KNN(k=5)
knn_scratch.fit(X=X_train, y=y_train)
y_pred_scratch = knn_scratch.predict(X=X_test)
print(y_pred_scratch)

[2 3 2 1 3 3 3 2 3 3 3 1 2 3 2 3 2 2 1 3 2 2 2 3 1 1 1 1 3 3 2 2 1 2 1 3 3
 2 2 3 3 3 3 3 2 1 2 2 2 1 2 3 3 1 2 3 2 3 2 2 1 3 2 2 2 2 3 2 3 1 2 3 3 3
 3 2 2 1 2 3 3 3 1 2 3 2 3 3]


In [21]:
# Fraction of held-out rows the from-scratch model labelled correctly.
scratch_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_scratch)
print("\nScratch KNN Accuracy:", scratch_accuracy)


Scratch KNN Accuracy: 0.8636363636363636


In [22]:
# Reference run: scikit-learn's KNN with the same k on the same split.
knn_sklearn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_sklearn = knn_sklearn.predict(X_test)
sklearn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_sklearn)
print("Sklearn KNN Accuracy:", sklearn_accuracy)

Sklearn KNN Accuracy: 0.8636363636363636
