In [2]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [3]:
!gdown 1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2

Downloading...
From: https://drive.google.com/uc?id=1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2
To: /content/multiclass.csv
  0% 0.00/14.6k [00:00<?, ?B/s]100% 14.6k/14.6k [00:00<00:00, 28.0MB/s]


In [4]:
df = pd.read_csv('/content/multiclass.csv')
display(df.head())

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
0,3,12669,9656,7561,214,2674,1338,2
1,3,7057,9810,9568,1762,3293,1776,2
2,3,6353,8808,7684,2405,3516,7844,2
3,3,13265,1196,4221,6404,507,1788,1
4,3,22615,5410,7198,3915,1777,5185,1


In [5]:
imputer = KNNImputer(n_neighbors=5)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
display(df_imputed.head())

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
0,3.0,12669.0,9656.0,7561.0,214.0,2674.0,1338.0,2.0
1,3.0,7057.0,9810.0,9568.0,1762.0,3293.0,1776.0,2.0
2,3.0,6353.0,8808.0,7684.0,2405.0,3516.0,7844.0,2.0
3,3.0,13265.0,1196.0,4221.0,6404.0,507.0,1788.0,1.0
4,3.0,22615.0,5410.0,7198.0,3915.0,1777.0,5185.0,1.0


In [8]:
label_encoder = LabelEncoder()
df_encoded = df_imputed.copy()
df_encoded['class'] = label_encoder.fit_transform(df_encoded['class'])
display(df_encoded.head())
display(df_encoded['class'].value_counts())

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
0,3.0,12669.0,9656.0,7561.0,214.0,2674.0,1338.0,1
1,3.0,7057.0,9810.0,9568.0,1762.0,3293.0,1776.0,1
2,3.0,6353.0,8808.0,7684.0,2405.0,3516.0,7844.0,1
3,3.0,13265.0,1196.0,4221.0,6404.0,507.0,1788.0,0
4,3.0,22615.0,5410.0,7198.0,3915.0,1777.0,5185.0,0


Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,180
2,173
0,87


In [9]:
scaler = StandardScaler()
numerical_cols = ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen']
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])
display(df_encoded.head())

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
0,3.0,0.052933,0.523568,-0.041115,-0.589367,-0.043569,-0.066339,1
1,3.0,-0.391302,0.544458,0.170318,-0.270136,0.086407,0.089151,1
2,3.0,-0.447029,0.408538,-0.028157,-0.137536,0.133232,2.243293,1
3,3.0,0.100111,-0.62402,-0.392977,0.687144,-0.498588,0.093411,0
4,3.0,0.840239,-0.052396,-0.079356,0.173859,-0.231918,1.299347,0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (432, 7)
Shape of X_test: (108, 7)
Shape of y_train: (432,)
Shape of y_test: (108,)


In [11]:
# Separate features (X) and target (y)
X = df_encoded.drop('class', axis=1)
y = df_encoded['class']

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Shape of X after SMOTE:", X_resampled.shape)
print("Shape of y after SMOTE:", y_resampled.shape)

Shape of X after SMOTE: (540, 7)
Shape of y after SMOTE: (540,)


In [13]:
X = df_encoded.drop('class', axis=1)
y = df_encoded['class']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Shape of original data:", X.shape)
print("Shape of resampled data:", X_resampled.shape)
print("\nValue counts of original target variable:")
display(y.value_counts())
print("\nValue counts of resampled target variable:")
display(y_resampled.value_counts())

Shape of original data: (440, 7)
Shape of resampled data: (540, 7)

Value counts of original target variable:


Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,180
2,173
0,87



Value counts of resampled target variable:


Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,180
0,180
2,180


In [15]:
display(X.var())

Unnamed: 0,0
Region,0.599498
Fresh,1.002278
Milk,1.002278
Grocery,1.002278
Frozen,1.002278
Detergents_Paper,1.002278
Delicassen,1.002278


In [17]:
correlation_matrix = X.corr(method='pearson')
display(correlation_matrix)

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
Region,1.0,0.055287,0.032288,0.007696,-0.021044,-0.001483,0.045212
Fresh,0.055287,1.0,0.10051,-0.011854,0.345881,-0.101953,0.24469
Milk,0.032288,0.10051,1.0,0.728335,0.123994,0.661816,0.406368
Grocery,0.007696,-0.011854,0.728335,1.0,-0.040193,0.924641,0.205497
Frozen,-0.021044,0.345881,0.123994,-0.040193,1.0,-0.131525,0.390947
Detergents_Paper,-0.001483,-0.101953,0.661816,0.924641,-0.131525,1.0,0.069291
Delicassen,0.045212,0.24469,0.406368,0.205497,0.390947,0.069291,1.0


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (432, 7)
Shape of X_test: (108, 7)
Shape of y_train: (432,)
Shape of y_test: (108,)


In [24]:
def knn(X, Y, queryPoint, k):
    """Predict the class label for the query point"""

    # Euclidean Distance
    dist = np.sqrt(np.sum((queryPoint - X)**2, axis=1))

    # Storing distance and class labels together
    distances = [(dist[i], Y[i]) for i in range(len(dist))]

    # sort the distances
    distances = sorted(distances)

    # Nearest/First K points
    distances = distances[:k]

    distances = np.array(distances)

    classes_counts = np.unique(distances[:,1], return_counts=True)

    index = classes_counts[1].argmax()
    pred = classes_counts[0][index]

    return int(pred), distances

In [35]:
pred,neighbors = knn(X_resampled, y_resampled, X_test.iloc[40],7)

print(f'k nearest neighbors with the distnace and class label : {neighbors}')

print(f'The predicted class labels: {pred}')

k nearest neighbors with the distnace and class label : [[0.         1.        ]
 [1.67682541 2.        ]
 [1.70123467 1.        ]
 [1.78063151 1.        ]
 [1.81021488 1.        ]
 [1.84153681 2.        ]
 [1.95573236 2.        ]]
The predicted class labels: 1
