# Mount Google Drive

In [None]:
from google.colab import files
from google.colab import drive

#Mount Google Drive
# Later cells read the helper module and the dataset CSV files from paths under
# drive/MyDrive, so the Drive must be mounted before anything else runs.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Libraries

In [None]:
import pandas as pd
import csv
from matplotlib import pyplot as plt
import numpy
import datetime
import seaborn as sb
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from datetime import timedelta

from sklearn.cluster import MiniBatchKMeans

!cp drive/MyDrive/base/base.py .
import base

from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

# Initialize The CSV Location

In [None]:
# Folder (relative to the working directory) that holds every source CSV.
# The trailing slash is required because the file names are appended directly.
dataset_directory = "drive/MyDrive/Dataset Bakery XYZ/"

# Full path of each table's CSV file.
directory_area = f"{dataset_directory}AREA.csv"
directory_customer = f"{dataset_directory}CUST.csv"
directory_inventory = f"{dataset_directory}INVENTORY.csv"
directory_sales_header = f"{dataset_directory}SALESHEADER.csv"
directory_sales_detail = f"{dataset_directory}SALESDETAIL.csv"

# Data Preprocessing

## Perform Data Preprocessing on All Tables

In [None]:
#Data Preprocessing The "Area" Table
# The delimiter is passed by keyword: pandas 2.0 made every read_csv argument
# after the file path keyword-only, so a positional ";" is rejected there (and
# only produced a FutureWarning on the 1.x releases that still accepted it).
area = pd.read_csv(directory_area, sep=";")

# UPDDATE / UPDTIME are not used by any later step of the visible notebook.
area = area.drop(columns=['UPDDATE', 'UPDTIME'])

# Label-encode DESC with integers in order of first appearance.
# Only the DESC column is mapped. A frame-wide applymap would also rewrite any
# other cell (for example the CODE join key) whose value happens to equal one of
# the DESC values.
desc = area['DESC'].unique()
desc_dict = dict(zip(desc, range(len(desc))))
area['DESC'] = area['DESC'].map(desc_dict)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
#Data Preprocessing The "Customer" Table
# The delimiter is passed by keyword: pandas 2.0 made every read_csv argument
# after the file path keyword-only, so a positional ";" is rejected there.
customer = pd.read_csv(directory_customer, sep=";")

# UPDDATE / UPDTIME are not used by any later step of the visible notebook.
customer = customer.drop(columns=['UPDDATE', 'UPDTIME'])

# Label-encode SALTYPE with integers in order of first appearance.
# Only the SALTYPE column is mapped so that CODE / AREACD (the join keys used
# later) and INACTIVE can never be rewritten by a coincidental value match.
salestypes = customer['SALTYPE'].unique()
salestypes_dict = dict(zip(salestypes, range(len(salestypes))))
customer['SALTYPE'] = customer['SALTYPE'].map(salestypes_dict)

# Value counts of INACTIVE as (value, count) rows, kept for inspection.
(unique, counts) = numpy.unique(customer['INACTIVE'], return_counts=True)
frequencies = numpy.asarray((unique, counts)).T

# Discard customers whose INACTIVE flag equals 2. The comparison now runs on the
# raw flag because the SALTYPE encoding above no longer touches this column.
customer = customer.loc[customer['INACTIVE'] != 2].copy()

# Label-encode the remaining INACTIVE values, again restricted to that column.
# A frame-wide mapping such as {1: 0, 0: 1} would otherwise also flip matching
# values in SALTYPE, AREACD and CODE.
inactive = customer['INACTIVE'].unique()
inactive_dict = dict(zip(inactive, range(len(inactive))))
customer['INACTIVE'] = customer['INACTIVE'].map(inactive_dict)

In [None]:
#Data Preprocessing The "Inventory" Table
# The delimiter is passed by keyword: pandas 2.0 made every read_csv argument
# after the file path keyword-only, so a positional ";" is rejected there.
inventory = pd.read_csv(directory_inventory, sep=";")

# UPDDATE / UPDTIME are not used by any later step of the visible notebook.
inventory = inventory.drop(columns=['UPDDATE', 'UPDTIME'])

# These columns are read as text (the .str accessor requires it). Remove the
# literal ",00" and parse what is left as a number. regex=False makes the
# literal match explicit regardless of the pandas default.
# NOTE(review): anything that is still not a plain number afterwards (e.g. a
# value with a different decimal part) silently becomes NaN because of
# errors='coerce' - confirm that this is intended.
for numeric_column in ['SPRICE', 'UCOST', 'WEIGHT']:
  inventory[numeric_column] = pd.to_numeric(
      inventory[numeric_column].str.replace(',00', '', regex=False),
      errors='coerce')

In [None]:
#Data Preprocessing The "Sales Detail" Table
# The delimiter is passed by keyword: pandas 2.0 made every read_csv argument
# after the file path keyword-only, so a positional ";" is rejected there.
salesDetail = pd.read_csv(directory_sales_detail, sep=";")

# UPDDATE / UPDTIME are not used by any later step of the visible notebook.
salesDetail = salesDetail.drop(columns=['UPDDATE', 'UPDTIME'])

# These columns are read as text (the .str accessor requires it). Remove the
# literal ",00" and parse what is left as a number. regex=False makes the
# literal match explicit regardless of the pandas default.
# NOTE(review): anything that is still not a plain number afterwards (e.g. a
# value with a different decimal part) silently becomes NaN because of
# errors='coerce' - confirm that this is intended, since AMOUNT feeds the
# monetary value later on.
for numeric_column in ['UPRICE', 'UCOST', 'AMOUNT', 'DISCAMT']:
  salesDetail[numeric_column] = pd.to_numeric(
      salesDetail[numeric_column].str.replace(',00', '', regex=False),
      errors='coerce')


In [None]:
#Data Preprocessing The "Sales Header" Table
# The delimiter is passed by keyword: pandas 2.0 made every read_csv argument
# after the file path keyword-only, so a positional ";" is rejected there.
salesHeader = pd.read_csv(directory_sales_header, sep=";")

# UPDDATE / UPDTIME are not used by any later step of the visible notebook.
salesHeader = salesHeader.drop(columns=['UPDDATE', 'UPDTIME'])

# TOTAL is read as text: remove the literal ",00" and parse the rest as a number.
# Values that still fail to parse become NaN (errors='coerce').
salesHeader['TOTAL'] = pd.to_numeric(
    salesHeader['TOTAL'].str.replace(',00', '', regex=False),
    errors='coerce')

# Label-encode STYPE with integers in order of first appearance.
# Only the STYPE column is mapped. A frame-wide applymap would also rewrite
# matching values in TRNO, CUSTCODE (both used as join keys later) and TOTAL.
STYPE = salesHeader['STYPE'].unique()
STYPE_dict = dict(zip(STYPE, range(len(STYPE))))
salesHeader['STYPE'] = salesHeader['STYPE'].map(STYPE_dict)

# NOTE(review): no explicit format is given, so pandas infers the date layout -
# confirm the day/month order of TRDATE in the source file is parsed correctly.
salesHeader['TRDATE'] = pd.to_datetime(salesHeader['TRDATE'])

## Performing JOIN on All Tables

In [None]:
#JOIN The "Customer" Table and "Area" Table
# Align the key names first: the area code becomes "AreaID" in both tables and
# the customer code becomes "CustomerID".
area = area.rename(columns={"CODE": "AreaID"})
customer = customer.rename(columns={"AREACD": "AreaID", "CODE": "CustomerID"})

# Number of customers per area as (AreaID, count) rows, kept for inspection.
unique, counts = numpy.unique(customer['AreaID'], return_counts=True)
frequencies = numpy.array([unique, counts]).transpose()

# Outer join: keeps customers without a matching area and areas without customers.
testJoin = pd.merge(customer, area, on='AreaID', how='outer')

In [None]:
#JOIN The "Sales Header" Table with The "Customer and Area" Table
# The sales header refers to the customer through CUSTCODE; give it the shared key name.
salesHeader = salesHeader.rename(columns={"CUSTCODE": "CustomerID"})

# Number of sales headers per customer as (CustomerID, count) rows, kept for inspection.
unique, counts = numpy.unique(salesHeader['CustomerID'], return_counts=True)
frequencies = numpy.array([unique, counts]).transpose()

# Both CODE columns were already renamed in the Customer/Area join cell, so this
# rename normally finds nothing left to change.
testJoin = testJoin.rename(columns={"CODE": "CustomerID"})

# Right join: every sales header is kept, with or without a matching customer row.
testMultiJoin = pd.merge(testJoin, salesHeader, on='CustomerID', how='right')

In [None]:
#JOIN The "Sales Detail" Table with The "Sales Header, Customer, and Area" Table
# TRNO identifies the sales header on both sides; expose it under one shared name.
transaction_key = {"TRNO": "SalesHeaderID"}
salesDetail = salesDetail.rename(columns=transaction_key)
testMultiJoin = testMultiJoin.rename(columns=transaction_key)

# Right join: the result has one row per sales-detail line.
saleHeaderSalesDetailJoin = pd.merge(
    testMultiJoin,
    salesDetail,
    on='SalesHeaderID',
    how='right',
)

In [None]:
#JOIN The "Inventory" Table with The "Sales Detail, Sales Header, Customer, and Area" Table
# ITEMNO identifies the product on both sides; expose it under one shared name.
product_key = {"ITEMNO": "ProductID"}
saleHeaderSalesDetailJoin = saleHeaderSalesDetailJoin.rename(columns=product_key)
inventory = inventory.rename(columns=product_key)

# Left join: every sales-detail row is kept and enriched with inventory columns
# where the product exists.
MergedData = pd.merge(
    saleHeaderSalesDetailJoin,
    inventory,
    on='ProductID',
    how='left',
)

# Creating The Recency, Frequency, and Monetary Value (RFM) Table

In [None]:
#Removing Unused Columns
# Everything listed here is discarded before the RFM aggregation.
# UCOST_x / UCOST_y are the suffixed copies produced by the inventory merge.
unused_columns = [
    'STYPE', 'SALTYPE', 'ARBAL', 'LINENO', 'DISCAMT', 'ITEMNAME', 'WEIGHT',
    'UCOST_y', 'UCOST_x', 'INACTIVE', 'SPRICE', 'DESC', 'SALPERSON', 'TOTAL',
    'ProductID', 'UPRICE', 'AreaID', 'QTY', 'QTYRET', 'CUSTNAME',
]
MergedData = MergedData.drop(columns=unused_columns)

In [None]:
#Calculating the Recency, Frequency, and Monetary Value of Each Customer
# Reference date: one day after the latest transaction, so the most recent
# customer gets a recency of 1 rather than 0.
max_date = MergedData['TRDATE'].max() + pd.Timedelta(days=1)

# One row per customer:
#   recency        - days between the reference date and the customer's last TRDATE
#   frequency      - number of non-null SalesHeaderID rows
#   monetary_value - sum of AMOUNT
# NOTE(review): MergedData has one row per sales-detail line, so frequency counts
# line items rather than distinct transactions - confirm that this is intended.
rfm_data = MergedData.groupby('CustomerID').agg(
    recency=('TRDATE', lambda dates: (max_date - dates.max()).days),
    frequency=('SalesHeaderID', 'count'),
    monetary_value=('AMOUNT', 'sum'),
)
rfm_data.index.name = 'customer_id'

# Converting The RFM Values To A 1-6 Scale

In [None]:
#Creating a "set_quantile" Function

def set_quantile(rfm_dataframe_without_quantile , number_of_quantile):
  """Replace raw RFM values with quantile scores from 1 to number_of_quantile.

  Each of the three columns is cut into `number_of_quantile` equal-frequency
  bins with pd.qcut. Frequency and monetary value are scored ascending (largest
  values get the highest score). Recency is scored descending (smallest values
  get the highest score).

  Parameters:
    rfm_dataframe_without_quantile: DataFrame with the columns 'recency',
      'frequency' and 'monetary_value'. It is not modified.
    number_of_quantile: Number of bins, which is also the highest score.

  Returns:
    A new DataFrame with the same index and the numeric columns 'R', 'F' and
    'M'. The three raw columns are removed.
  """
  ascending_scores = range(1, number_of_quantile + 1)
  descending_scores = range(number_of_quantile, 0, -1)

  # (score column, raw column, labels assigned from the lowest to the highest bin)
  scoring_plan = (
      ('R', 'recency', descending_scores),
      ('F', 'frequency', ascending_scores),
      ('M', 'monetary_value', ascending_scores),
  )

  quantile_scores = {}
  for score_name, raw_column, bin_labels in scoring_plan:
    binned = pd.qcut(rfm_dataframe_without_quantile[raw_column],
                     q = number_of_quantile,
                     labels = bin_labels)
    # .values attaches the scores by position
    quantile_scores[score_name] = binned.values

  scored = rfm_dataframe_without_quantile.assign(**quantile_scores)

  # qcut yields categorical columns; turn them into plain numbers.
  score_columns = ['R', 'F', 'M']
  scored[score_columns] = scored[score_columns].apply(pd.to_numeric)

  return scored.drop(columns=['recency', 'frequency', 'monetary_value'])

In [None]:
#Score every customer's recency, frequency and monetary value on a 1-6 scale.
rfm_quantile_dataframe = set_quantile(rfm_data, number_of_quantile=6)

# Creating The Mini Batch K-Means Model

In [None]:
#Creating The "mini_batch_kmeans" Function

def mini_batch_kmeans(df_rfm, clusters_number=3):
  """Cluster the rows of df_rfm with a Mini Batch K-Means model.

  The model is built with a fixed set of hyperparameters (only the number of
  clusters is configurable), fitted on df_rfm, and then asked to predict the
  cluster of every row of the same frame.

  Parameters:
    df_rfm: DataFrame used both as the training data and as the data to label.
      It is not modified.
    clusters_number: Number of clusters to create (default 3).

  Returns:
    A tuple (df_new, y_pred):
      df_new: copy of df_rfm with an extra 'Cluster' column holding the fitted
        model's labels_.
      y_pred: the result of calling predict on df_rfm.
  """
  hyperparameters = dict(
      n_clusters=clusters_number,
      random_state=1,  # fixed seed so repeated runs give the same clusters
      max_iter=30,
      init='k-means++',
      tol=0,
      max_no_improvement=10,
      init_size=140,
      n_init=3,
      reassignment_ratio=0.01,
      batch_size=1536,
  )
  clusterer = MiniBatchKMeans(**hyperparameters)
  clusterer.fit(df_rfm)
  y_pred = clusterer.predict(df_rfm)
  return df_rfm.assign(Cluster = clusterer.labels_), y_pred

In [None]:
#Initialize The Number of Clusters
# This single value is passed to mini_batch_kmeans as the cluster count and to
# get_cluster_list when the labelled rows are grouped, so the two always agree.
number_of_clusters = 6

In [None]:
#Cluster the quantile-scored customers; keep both the labelled frame and the predictions.
rfm_df_post_mbkm, mbkm_prediction = mini_batch_kmeans(
    df_rfm=rfm_quantile_dataframe,
    clusters_number=number_of_clusters,
)

# Model Evaluation

In [None]:
#Creating The "get_cluster_list" Function

def get_cluster_list(rfm_df_post_model, number_of_clusters):
  """Group the data points of a labelled frame by cluster.

  Parameters:
    rfm_df_post_model: DataFrame whose 'Cluster' column holds the cluster number
      of each row. Every other column is treated as a feature.
    number_of_clusters: Number of clusters. Labels 0 .. number_of_clusters - 1
      are looked up.

  Returns:
    cluster_list: A list with one 2-D array per cluster, in label order. Each
      array holds the feature values of the rows assigned to that cluster. A
      label with no rows yields an array with zero rows.

  Raises:
    KeyError: if the frame has no 'Cluster' column.
  """
  # The cluster label is bookkeeping, not a coordinate. Leaving it in the arrays
  # would add an extra dimension to every point and distort any distance-based
  # index computed from this list. The silhouette and Davies-Bouldin cells also
  # score the label-free frame.
  feature_columns = rfm_df_post_model.columns.drop('Cluster')
  cluster_labels = rfm_df_post_model['Cluster']
  return [
      rfm_df_post_model.loc[cluster_labels == label, feature_columns].values
      for label in range(number_of_clusters)
  ]

In [None]:
#Dunn Index
#The higher the better

# Group the labelled customers per cluster, then score the grouping with the
# dunn helper from the base module copied from Drive.
cluster_list_mbkm = get_cluster_list(rfm_df_post_mbkm, number_of_clusters)
dunn_index = base.dunn(cluster_list_mbkm)
print(f"Dunn Index: {dunn_index}")

Dunn Index: 0.4264014327112209


In [None]:
#Silhouette Score
#The higher the better

# Scored on the quantile frame (no Cluster column) against the predicted labels.
silhouette = silhouette_score(rfm_quantile_dataframe, mbkm_prediction)
print(f"Silhouette Score: {silhouette}")

Silhouette Score: 0.42822601036824676


In [None]:
#Davies Bouldin Index
#The lower the better

# Scored on the quantile frame (no Cluster column) against the predicted labels.
davies_bouldin = davies_bouldin_score(rfm_quantile_dataframe, mbkm_prediction)
print(f"Davies Bouldin Index: {davies_bouldin}")

Davies Bouldin Index: 0.8724920185820473


# Converting The Dataframe to CSV

In [None]:
#Put the clustering result (R, F, M scores and Cluster) next to the raw recency,
#frequency and monetary values of the same customer.
rfm_combined = pd.merge(
    rfm_df_post_mbkm,
    rfm_data,
    on='customer_id',  # name of the index level shared by both frames
    how='inner',
)

In [None]:
#Write the combined per-customer table to a CSV file and download it through the browser.
output_csv_name = "rfm_per_customer.csv"
rfm_combined.to_csv(output_csv_name)
files.download(output_csv_name)