In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the customer dataset from the working directory and preview the first rows.
df = pd.read_csv("customers.csv")
df.head(n=5)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [5]:
# Name of the label column; every other column is used as a feature.
# Change this if the dataset's metadata specifies a different target.
target = 'Channel'
y = df[target]
X = df.drop(columns=target)

In [8]:
# Encode categorical (object-dtype) feature columns as integer codes.
# Integer codes impose an arbitrary order on the categories: this works well
# for tree-based models and is acceptable for a basic classification task, but
# it can distort distance-based/linear models such as KNN and logistic regression.
cat_cols = X.select_dtypes(include='object').columns

le = LabelEncoder()
for col in cat_cols:
    # Fix: the loop variable used to be `cols` while the body referenced `col`,
    # which raised NameError as soon as X contained an object column.
    # fit_transform refits the encoder, so each column gets its own mapping.
    X[col] = le.fit_transform(X[col])

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Feature scaling (important for logistic regression and KNN).
# The scaler's statistics are learned from the training split only and then
# reused on the test split, so no test information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Logistic regression
# - suitable for binary classification
# - learns a linear decision boundary
# - sensitive to feature scaling, hence the scaled inputs
lr = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

lr_acc = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print("Logistic_regressio accuracy", lr_acc)

Logistic_regressio accuracy 0.9204545454545454


In [17]:
# Decision tree classifier
# - tree-based model that handles non-linear relationships
# - needs no feature scaling, so it is trained on the unscaled split
# - max_depth caps tree growth to help prevent overfitting
dt = DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

dt_acc = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print('decision tree classifier', dt_acc)

decision tree classifier 0.8977272727272727


In [19]:
# K-nearest neighbours (KNN)
# - classifies by majority vote of the nearest neighbours
# - highly sensitive to scaling, hence the scaled inputs
# - performance depends on the chosen value of k
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

knn_acc = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print('KNN accuracy', knn_acc)

KNN accuracy 0.8863636363636364


In [20]:
# Accuracy comparison: one row per model, in the order the models were trained.
model_scores = [
    ('Logistic Regression', lr_acc),
    ('Decision tree', dt_acc),
    ('KNN', knn_acc),
]
accuracy_df = pd.DataFrame(model_scores, columns=['model', 'accuracy'])
accuracy_df

Unnamed: 0,model,accuracy
0,Logistic Regression,0.920455
1,Decision tree,0.897727
2,KNN,0.886364
