# Customer Basket Size Prediction


## Problem Statement

Simple problem to illustrate a classification problem using several algorithms

Objective is to illustrate certain key concepts for several classification algorithms.

<font color="red">We shall not be focusing on accuracy or improving perfomance of the model</font>

### Workbench

#### Importing the required libraries

In [27]:
# Import the numpy and pandas package
import numpy as np
import pandas as pd

# Data Visualisation
import matplotlib.pyplot as plt 
import seaborn as sns

# Import the warnings
import warnings

# Import the standard scalar
from sklearn.preprocessing import StandardScaler

# Import train test split
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict, train_test_split

# Imort Linear Model
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Import the metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score,roc_curve, auc


# Import necessary library to draw multiclass ROC Curve
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

# configuration settings
%matplotlib inline 
sns.set(color_codes=True)
warnings.filterwarnings('ignore') ## Surpress the warnings

#### Load the data into a dataframe

In [16]:
# load the data into a dataframe called supermarket_till_transactions_df
supermarket_till_transactions_df = pd.read_csv("./supermarket_till_transactions.csv")

In [17]:
# view the top five records
supermarket_till_transactions_df.head(5)

Unnamed: 0,SHOP_WEEK,SHOP_DATE,SHOP_WEEKDAY,SHOP_HOUR,QUANTITY,SPEND,PROD_CODE,PROD_CODE_10,PROD_CODE_20,PROD_CODE_30,...,CUST_PRICE_SENSITIVITY,CUST_LIFESTAGE,BASKET_ID,BASKET_SIZE,BASKET_PRICE_SENSITIVITY,BASKET_TYPE,BASKET_DOMINANT_MISSION,STORE_CODE,STORE_FORMAT,STORE_REGION
0,200607,20060413,5,20,1,103,PRD0900097,CL00001,DEP00001,G00001,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02
1,200607,20060412,4,19,1,28,PRD0900353,CL00070,DEP00020,G00007,...,LA,YF,994100100532897,M,MM,Small Shop,Fresh,STORE00001,LS,E02
2,200607,20060413,5,20,3,84,PRD0900550,CL00167,DEP00055,G00016,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02
3,200607,20060412,4,19,1,221,PRD0901647,CL00010,DEP00003,G00002,...,LA,YF,994100100532897,M,MM,Small Shop,Fresh,STORE00001,LS,E02
4,200607,20060413,5,20,1,334,PRD0902064,CL00073,DEP00021,G00007,...,LA,YF,994100100532898,L,LA,Top Up,Fresh,STORE00001,LS,E02


In [18]:
supermarket_till_transactions_df = supermarket_till_transactions_df[["SHOP_WEEKDAY","SHOP_HOUR","QUANTITY","SPEND","BASKET_SIZE"]]
supermarket_till_transactions_df.head(5)

Unnamed: 0,SHOP_WEEKDAY,SHOP_HOUR,QUANTITY,SPEND,BASKET_SIZE
0,5,20,1,103,L
1,4,19,1,28,M
2,5,20,3,84,L
3,4,19,1,221,M
4,5,20,1,334,L


In [19]:
supermarket_till_transactions_df["BASKET_SIZE"].value_counts()

L    83
M    30
S     6
Name: BASKET_SIZE, dtype: int64

In [20]:
X = supermarket_till_transactions_df[["SHOP_WEEKDAY","SHOP_HOUR","QUANTITY","SPEND"]]
y = supermarket_till_transactions_df["BASKET_SIZE"]

**Splitting the data using train_test_split**
1. Standard Scalar
2. One Vs Rest Classifier

In [21]:
# Standarize features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [22]:
# Using sklearn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)



**Rnadom Forest**

In [23]:
# Create the model with 100 trees
random_forest_model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')

random_forest_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
random_forest_pred = random_forest_model.predict(X_test)
# Review the predictions
random_forest_pred

array(['L', 'L', 'L', 'M', 'L', 'L', 'M', 'L', 'L', 'L', 'L', 'L', 'L',
       'L', 'M', 'M', 'L', 'L', 'M', 'M', 'L', 'L', 'M', 'M', 'M', 'L',
       'L', 'L', 'M', 'M', 'L', 'L', 'L', 'L', 'M', 'L'], dtype=object)

In [11]:
print("Accuracy:",metrics.accuracy_score(y_test, random_forest_pred))

Accuracy: 0.8888888888888888


**Precision, Recall,F-Measure and support**

In [12]:
print(classification_report(y_test, random_forest_pred))

              precision    recall  f1-score   support

           L       0.88      0.96      0.92        23
           M       0.91      1.00      0.95        10
           S       0.00      0.00      0.00         3

    accuracy                           0.89        36
   macro avg       0.60      0.65      0.62        36
weighted avg       0.81      0.89      0.85        36



In [25]:
random_forest_confusion_matrix = confusion_matrix(y_test, random_forest_pred)
print(random_forest_confusion_matrix)

[[22  1  0]
 [ 0 10  0]
 [ 2  1  0]]


**KNN**

In [30]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [31]:
knn_pred = knn_model.predict(X_test)
knn_pred

array(['M', 'L', 'L', 'M', 'L', 'L', 'L', 'L', 'M', 'L', 'L', 'L', 'S',
       'L', 'M', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'M',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L'], dtype=object)

In [32]:
print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

           L       0.60      0.78      0.68        23
           M       0.20      0.10      0.13        10
           S       0.00      0.00      0.00         3

    accuracy                           0.53        36
   macro avg       0.27      0.29      0.27        36
weighted avg       0.44      0.53      0.47        36



In [33]:
knn_confusion_matrix = confusion_matrix(y_test, knn_pred)
print(knn_confusion_matrix)

[[18  4  1]
 [ 9  1  0]
 [ 3  0  0]]
