In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
dataset = pd.read_csv('Wholesale customers data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [6]:
# dataset = pd.read_csv('/content/Wholesale customers data.csv')
# X = dataset.iloc[:, :-1].values
# y = dataset.iloc[:, -1].values

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [9]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.5)
X_train = binarizer.fit_transform(X_train)
X_test = binarizer.transform(X_test)
print(X_train[:5])

[[1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]


In [11]:
print(X_train)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 1. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Fresh             440 non-null    int64
 2   Milk              440 non-null    int64
 3   Grocery           440 non-null    int64
 4   Frozen            440 non-null    int64
 5   Detergents_Paper  440 non-null    int64
 6   Delicassen        440 non-null    int64
 7   Region            440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


In [13]:
dataset.describe()

Unnamed: 0,Channel,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,Region
count,440.0,440.0,440.0,440.0,440.0,440.0,440.0,440.0
mean,1.322727,12000.297727,5796.265909,7951.277273,3071.931818,2881.493182,1524.870455,2.543182
std,0.468052,12647.328865,7380.377175,9503.162829,4854.673333,4767.854448,2820.105937,0.774272
min,1.0,3.0,55.0,3.0,25.0,3.0,3.0,1.0
25%,1.0,3127.75,1533.0,2153.0,742.25,256.75,408.25,2.0
50%,1.0,8504.0,3627.0,4755.5,1526.0,816.5,965.5,3.0
75%,2.0,16933.75,7190.25,10655.75,3554.25,3922.0,1820.25,3.0
max,2.0,112151.0,73498.0,92780.0,60869.0,40827.0,47943.0,3.0


Feature selection for machine

Univariate learning

Recursive Feature Elimination

In [14]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

model = SVC(kernel="linear")
rfe = RFE(model, n_features_to_select=2)
X_new = rfe.fit_transform(X_train, y_train)

print("Feature Ranking:", rfe.ranking_)

print("Selected Features Shape:", X_new.shape)


Feature Ranking: [4 3 2 1 5 1 6]
Selected Features Shape: (330, 2)


Principal Component Analysis

In [15]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_new = pca.fit_transform(X_train)

print("Explained Variance Ratio:", pca.explained_variance_ratio_)
print("Selected Features Shape:", X_new.shape)

Explained Variance Ratio: [0.76877892 0.11460853]
Selected Features Shape: (330, 2)


Feature Importance

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X_train, y_train)

print("Feature Importances:", model.feature_importances_)


Feature Importances: [0.13033246 0.03034962 0.46976932 0.11940151 0.10277989 0.1180291
 0.02933811]


Evaluate Performance by Resampling

K - Fold Cross Validation

In [17]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold, LeaveOneOut, RepeatedStratifiedKFold

Leave One Out cross validation

In [18]:
loo = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=loo)

print(f"LOOCV Accuracy: {np.mean(scores):.4f}")

LOOCV Accuracy: 0.7023


Repeated Random Train Test Splits

In [19]:
repeated_kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

scores = cross_val_score(model, X, y, cv=repeated_kf)

print(f"Repeated Holdout Accuracy Scores: {scores}")
print(f"Mean Accuracy: {np.mean(scores):.4f}")


Repeated Holdout Accuracy Scores: [0.72727273 0.70454545 0.68181818 0.72727273 0.68181818 0.70454545
 0.68181818 0.68181818 0.70454545 0.68181818 0.70454545 0.70454545
 0.70454545 0.68181818 0.72727273 0.72727273 0.68181818 0.63636364
 0.65909091 0.70454545 0.68181818 0.65909091 0.72727273 0.65909091
 0.70454545 0.70454545 0.72727273 0.70454545 0.70454545 0.68181818]
Mean Accuracy: 0.6955


Machine Learning Algorithm Performance Metrics

In [20]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, confusion_matrix, classification_report

model = SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
logloss = log_loss(y_test, y_pred_prob)
# roc_auc = roc_auc_score(y_test, y_pred_prob[:, 1])
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Classification Accuracy: {accuracy:.4f}")
print(f"Logarithmic Loss: {logloss:.4f}")
print(f"Area Under ROC Curve: {roc_auc:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Classification Accuracy: 0.7273
Logarithmic Loss: 0.7821
Area Under ROC Curve: 0.4882
Confusion Matrix:
[[ 0  0 18]
 [ 0  0 12]
 [ 0  0 80]]
Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        18
           2       0.00      0.00      0.00        12
           3       0.73      1.00      0.84        80

    accuracy                           0.73       110
   macro avg       0.24      0.33      0.28       110
weighted avg       0.53      0.73      0.61       110



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Regression Tasks

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R2 Score: {r2:.4f}")

Mean Absolute Error: 0.4364
Mean Squared Error: 0.7636
R2 Score: -0.3322
