#Import Libraries

In [None]:
import pandas as pd
import csv
from matplotlib import pyplot as plt
import numpy
import datetime
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import isnan
from google.colab import files
from google.colab import drive

#Initialize CSV Location and Mount Google Drive

In [None]:
#Base folder that holds every source CSV file
#NOTE(review): this is a relative path, so it presumably relies on the working directory being the parent of the "drive" mount point - confirm
dataset_directory = "drive/MyDrive/Dataset Bakery XYZ/"

#Path of each CSV file, built from the base folder
directory_area = f"{dataset_directory}AREA.csv"
directory_customer = f"{dataset_directory}CUST.csv"
directory_inventory = f"{dataset_directory}INVENTORY.csv"
directory_sales_header = f"{dataset_directory}SALESHEADER.csv"
directory_sales_detail = f"{dataset_directory}SALESDETAIL.csv"

In [None]:
#Mount Google Drive at /content/drive so the dataset CSV files stored in the drive can be read
drive.mount('/content/drive')

Mounted at /content/drive


#Data Preprocessing

##Perform JOIN on All Tables

In [None]:
#Data Preprocessing The "Area" Table
#The delimiter is passed by keyword: positional arguments after the path are deprecated in pandas 1.3 and rejected in pandas 2.0
area = pd.read_csv(directory_area, sep=";")

#UPDDATE and UPDTIME are not used anywhere else in this notebook
area.drop(['UPDDATE', 'UPDTIME'], inplace=True, axis=1)

#Label-encode DESC: every distinct value gets an integer code, numbered in order of first appearance
desc = area['DESC'].unique()
desc_dict = dict(zip(desc, range(len(desc))))
#Only the DESC column is encoded; mapping the whole frame would also rewrite any cell of another column that happens to equal a DESC value
area['DESC'] = area['DESC'].map(lambda s: desc_dict.get(s, s))

In [None]:
#Data Preprocessing The "Customer" Table
#The delimiter is passed by keyword: positional arguments after the path are deprecated in pandas 1.3 and rejected in pandas 2.0
#The converters force every listed column to be read as str
customer = pd.read_csv(
  directory_customer,
  sep=";",
  converters={'CODE':str, 'AREACD':str, 'SALTYPE':str, 'CUSTNAME':str, 'ARBAL':str, 'INACTIVE':str},
)

#UPDDATE and UPDTIME are not used anywhere else in this notebook
customer.drop(['UPDDATE', 'UPDTIME'], inplace=True, axis=1)

#Label-encode SALTYPE: every distinct value gets an integer code, numbered in order of first appearance
#NOTE(review): applymap runs over every column, so a cell of any other column that equals a SALTYPE value is replaced as well;
#the INACTIVE filter below may depend on that side effect - confirm before narrowing the encoding to a single column
salestypes = customer['SALTYPE'].unique()
salestypes_dict = dict(zip(salestypes, range(len(salestypes))))
customer = customer.applymap(lambda s: salestypes_dict.get(s) if s in salestypes_dict else s)

#Occurrence count of every INACTIVE value, kept as (value, count) rows for inspection
(unique, counts) = numpy.unique(customer['INACTIVE'], return_counts=True)
frequencies = numpy.asarray((unique, counts)).T
#Remove the customers whose INACTIVE value equals 2
customer.drop(customer.loc[customer['INACTIVE']==2].index, inplace=True)

#Label-encode INACTIVE the same way (the same whole-frame caveat applies)
inactive = customer['INACTIVE'].unique()
inactive_dict = dict(zip(inactive, range(len(inactive))))
customer = customer.applymap(lambda s: inactive_dict.get(s) if s in inactive_dict else s)

In [None]:
#Data Preprocessing The "Inventory" Table
#The delimiter is passed by keyword: positional arguments after the path are deprecated in pandas 1.3 and rejected in pandas 2.0
inventory = pd.read_csv(directory_inventory, sep=";")

#UPDDATE and UPDTIME are not used anywhere else in this notebook
inventory.drop(['UPDDATE', 'UPDTIME'], inplace=True, axis=1)

#Remove the literal ",00" suffix from the text values and convert them to numbers;
#any value that is still not numeric afterwards becomes NaN (errors='coerce')
for column in ['SPRICE', 'UCOST', 'WEIGHT']:
  inventory[column] = inventory[column].str.replace(',00', '', regex=False)
  inventory[column] = pd.to_numeric(inventory[column], errors='coerce')

In [None]:
#Data Preprocessing The "Sales Detail" Table
#The delimiter is passed by keyword: positional arguments after the path are deprecated in pandas 1.3 and rejected in pandas 2.0
#on_bad_lines='warn' replaces the deprecated error_bad_lines=False: malformed lines are skipped and reported with a warning
#The converters force the amount columns to be read as str so the ",00" suffix can be removed below
salesDetail = pd.read_csv(
  directory_sales_detail,
  sep=";",
  quoting=csv.QUOTE_NONE,
  on_bad_lines='warn',
  converters={'UPRICE':str, 'UCOST':str, 'AMOUNT':str, 'DISCAMT':str},
)

#UPDDATE and UPDTIME are not used anywhere else in this notebook
salesDetail.drop(['UPDDATE', 'UPDTIME'], inplace=True, axis=1)

#Remove the literal ",00" suffix from the text values and convert them to numbers;
#any value that is still not numeric afterwards becomes NaN (errors='coerce')
for column in ['UPRICE', 'UCOST', 'AMOUNT', 'DISCAMT']:
  salesDetail[column] = salesDetail[column].str.replace(',00', '', regex=False)
  salesDetail[column] = pd.to_numeric(salesDetail[column], errors='coerce')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
#Data Preprocessing The "Sales Header" Table
#The delimiter is passed by keyword: positional arguments after the path are deprecated in pandas 1.3 and rejected in pandas 2.0
#on_bad_lines='warn' replaces the deprecated error_bad_lines=False: malformed lines are skipped and reported with a warning
#The converter forces TOTAL to be read as str so the ",00" suffix can be removed below
salesHeader = pd.read_csv(
  directory_sales_header,
  sep=";",
  quoting=csv.QUOTE_NONE,
  on_bad_lines='warn',
  converters={'TOTAL':str},
)

#UPDDATE and UPDTIME are not used anywhere else in this notebook
salesHeader.drop(['UPDDATE', 'UPDTIME'], inplace=True, axis=1)

#Remove the literal ",00" suffix and convert TOTAL to a number; values that are still not numeric become NaN
salesHeader['TOTAL'] = salesHeader['TOTAL'].str.replace(',00', '', regex=False)
salesHeader['TOTAL'] = pd.to_numeric(salesHeader['TOTAL'], errors='coerce')

#Label-encode STYPE: every distinct value gets an integer code, numbered in order of first appearance
STYPE = salesHeader['STYPE'].unique()
STYPE_dict = dict(zip(STYPE, range(len(STYPE))))
#Only the STYPE column is encoded; mapping the whole frame would also rewrite cells of other columns (e.g. a TOTAL) that happen to equal a STYPE value
salesHeader['STYPE'] = salesHeader['STYPE'].map(lambda s: STYPE_dict.get(s, s))

In [None]:
#JOIN The "Customer" Table and "Area" Table
#Give the area key the same name ("AreaID") in both tables and rename the customer key to "CustomerID"
area.rename(columns={"CODE": "AreaID"}, inplace=True)
customer.rename(columns={"AREACD": "AreaID", "CODE": "CustomerID"}, inplace=True)

#Occurrence count of every AreaID in the customer table, kept as (value, count) rows for inspection
unique, counts = numpy.unique(customer['AreaID'], return_counts=True)
frequencies = numpy.asarray((unique, counts)).transpose()

#Outer join: rows of both tables are kept even when the other side has no matching AreaID
testJoin = pd.merge(customer, area, how="outer", on=['AreaID'])

In [None]:
#JOIN The "Sales Header" Table with The "Customer and Area" Table
#Rename the sales header's customer column so it matches the join key
salesHeader.rename(columns={"CUSTCODE": "CustomerID"}, inplace=True)

#Occurrence count of every CustomerID in the sales header table, kept as (value, count) rows for inspection
unique, counts = numpy.unique(salesHeader['CustomerID'], return_counts=True)
frequencies = numpy.asarray((unique, counts)).transpose()

#Rename any remaining "CODE" column on the joined table to "CustomerID"
testJoin.rename(columns={"CODE": "CustomerID"}, inplace=True)

#Right join: every sales header row is kept, with customer/area data attached where the CustomerID matches
testMultiJoin = pd.merge(testJoin, salesHeader, how="right", on=['CustomerID'])

In [None]:
#JOIN The "Sales Detail" Table with The "Sales Header, Customer, and Area" Table
#Both tables expose the transaction number as "SalesHeaderID" so they can be merged on it
for table in (salesDetail, testMultiJoin):
  table.rename(columns={"TRNO": "SalesHeaderID"}, inplace=True)

#Right join: every sales detail row is kept, with header/customer/area data attached where the SalesHeaderID matches
saleHeaderSalesDetailJoin = pd.merge(testMultiJoin, salesDetail, how="right", on=['SalesHeaderID'])

In [None]:
#JOIN The "Inventory" Table with The "Sales Detail, Sales Header, Customer, and Area" Table
#Both tables expose the item number as "ProductID" so they can be merged on it
for table in (saleHeaderSalesDetailJoin, inventory):
  table.rename(columns={"ITEMNO": "ProductID"}, inplace=True)

#Left join: every joined sales row is kept, with inventory data attached where the ProductID matches
MergedData = pd.merge(saleHeaderSalesDetailJoin, inventory, how="left", on=['ProductID'])

##Deriving Date and Key Columns and Dropping Unused Columns

In [None]:
#Parsing "TRDATE" and deriving the "YEAR", "MONTH", and "DAY" (day of the month) columns from it
#NOTE(review): no explicit format is given, so pandas infers it; confirm the day/month order matches the CSV
MergedData["TRDATE"] = pd.to_datetime(MergedData["TRDATE"])

transaction_date = MergedData['TRDATE'].dt
MergedData['YEAR'] = transaction_date.year
MergedData['MONTH'] = transaction_date.month
MergedData['DAY'] = transaction_date.day

In [None]:
#Building the "SALDCODE" key column: the SalesHeaderID followed by the line number as text
line_numbers = MergedData["LINENO"].astype(str)
MergedData["LINENO"] = line_numbers
MergedData['SALDCODE'] = MergedData['SalesHeaderID'] + line_numbers

In [None]:
#Columns that are not needed for the per-customer modelling below
unused_columns = ['AreaID', 'ProductID', 'CUSTNAME', 'ITEMNAME','TRDATE', 'SALTYPE', 'ARBAL', 'INACTIVE', 'DESC', 'SALPERSON', 'STYPE', 'UCOST_x', 'DISCAMT', 'AMOUNT', 'SPRICE', 'UCOST_y', 'WEIGHT', 'UPRICE', 'SalesHeaderID', 'LINENO']
MergedData.drop(columns=unused_columns, inplace=True)

#Keeping only the rows that have a value in every remaining column
cleanMergedData = MergedData.dropna()

##Splitting The Transaction Data Per Customer

In [None]:
#Listing every distinct CustomerID together with its number of rows

#unique4: sorted array of the distinct CustomerID values (no duplicates).
#counts4: number of rows of each CustomerID, in the same order as unique4.
#frequencies4: 2-dimensional array pairing each CustomerID with its number of rows.

customer_id_column = cleanMergedData['CustomerID']
unique4, counts4 = numpy.unique(customer_id_column, return_counts=True)
frequencies4 = numpy.asarray((unique4, counts4)).transpose()

In [None]:
#"allCustomer" is a plain list holding every unique CustomerID, in the same order as unique4
allCustomer = list(unique4)

In [None]:
#"dataBasedOnAllCustomer": dictionary whose key is a CustomerID and whose value is the transaction data of that customer
#"saldCodeBasedOnAllCustomer": dictionary with the same keys, used to carry the SALDCODE of each customer to simplify the Extract, Transform, and Load (ETL) process to the Data Warehouse
#Both dictionaries receive the same frame here; the column dropping later in the notebook gives each one its own columns

dataBasedOnAllCustomer = {}
saldCodeBasedOnAllCustomer = {}

for customerID in allCustomer:
  #drop() without inplace returns a new frame, so the slice returned by query() is never modified in place
  #(modifying it in place is what raised SettingWithCopyWarning)
  segmentedData = cleanMergedData.query('CustomerID == @customerID').drop(columns=['CustomerID'])
  dataBasedOnAllCustomer[customerID] = segmentedData
  saldCodeBasedOnAllCustomer[customerID] = segmentedData

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
#Trimming each per-customer frame: the modelling data loses SALDCODE, the SALDCODE lookup loses everything else
non_key_columns = ['TOTAL', 'QTY', 'QTYRET', 'YEAR', 'MONTH', 'DAY']

for i in dataBasedOnAllCustomer:
  modelling_frame = dataBasedOnAllCustomer[i]
  key_frame = saldCodeBasedOnAllCustomer[i]
  dataBasedOnAllCustomer[i] = modelling_frame.drop(columns=['SALDCODE'])
  saldCodeBasedOnAllCustomer[i] = key_frame.drop(columns=non_key_columns)

#Data Splitting

In [None]:
#New dictionaries that hold the preprocessed data; they reference the same per-customer frames as the originals
dataBasedOnAllCustomerClean = {i: dataBasedOnAllCustomer[i] for i in dataBasedOnAllCustomer}
saldCodeBasedOnAllCustomerClean = {i: saldCodeBasedOnAllCustomer[i] for i in dataBasedOnAllCustomer}

In [None]:
#Separating each customer's frame into the feature columns (X) and the column to predict, QTYRET (y)
X = {}
y = {}

for customer_id, frame in dataBasedOnAllCustomerClean.items():
  is_target = frame.columns == 'QTYRET'
  X[customer_id] = frame.loc[:, ~is_target]
  y[customer_id] = frame.loc[:, is_target]

#XGBoost Model Development


In [None]:
#XGBoost regressor configured with the parameters from the hyperparameter tuning result
regressor = XGBRegressor(
  learning_rate=0.6,
  max_depth=5,
  reg_lambda=0.9,
  reg_alpha=0.1,
  subsample=1,
)

In [None]:
#Fitting one XGBoost model per customer and storing its predictions
from sklearn.base import clone

model = {}

for i in dataBasedOnAllCustomerClean:
  #fit() returns the estimator itself, so fitting the shared "regressor" in every iteration would make every
  #entry of "model" the same object, trained only on the last customer. A fresh clone (same hyperparameters,
  #no fitted state) is fitted for each customer instead.
  model[i] = clone(regressor).fit(X[i], y[i])

pred_res = {}
predictions = {}

for i in dataBasedOnAllCustomerClean:
  #NOTE(review): the predictions are made on the same rows the model was fitted on (no hold-out set)
  pred_res[i] = model[i].predict(X[i])
  #PREDICTRET: predicted QTYRET rounded to a whole number
  dataBasedOnAllCustomerClean[i]["PREDICTRET"] = [round(value) for value in pred_res[i]]
  #PREDICTQTY: QTY minus the predicted return quantity
  dataBasedOnAllCustomerClean[i]["PREDICTQTY"] = dataBasedOnAllCustomerClean[i]["QTY"] - dataBasedOnAllCustomerClean[i]["PREDICTRET"]



#Model Evaluation

##Root Mean Square Error (RMSE)

In [None]:
#Calculating The RMSE Score for All Models
#RMSE is computed as the square root of the MSE: the "squared=False" argument of mean_squared_error
#is deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on every version
rmse = {}

for i in dataBasedOnAllCustomerClean:
  rmse[i] = numpy.sqrt(mean_squared_error(y[i], dataBasedOnAllCustomerClean[i]["PREDICTRET"]))
  print("RMSE[%s]: %0.2f" % (i,rmse[i]))

RMSE[BGR001]: 2.36
RMSE[BGR002]: 1.78
RMSE[BGR003]: 3.24
RMSE[BGR004]: 1.76
RMSE[BGR005]: 2.46
RMSE[BGR006]: 1.51
RMSE[BGR007]: 3.36
RMSE[BGR008]: 0.93
RMSE[BGR009]: 1.86
RMSE[BGR010]: 1.80
RMSE[BGR011]: 0.76
RMSE[BGR012]: 1.21
RMSE[BGR014]: 2.88
RMSE[BGR015]: 1.53
RMSE[BGR016]: 3.16
RMSE[BGR017]: 2.64
RMSE[BGR018]: 4.07
RMSE[BGR019]: 2.36
RMSE[BGR020]: 2.28
RMSE[BGR021]: 3.05
RMSE[BGR022]: 2.11
RMSE[BGR023]: 2.19
RMSE[BGR026]: 2.75
RMSE[BGR027]: 1.63
RMSE[BGR028]: 2.71
RMSE[BGR029]: 1.82
RMSE[BGR030]: 2.60
RMSE[BGR031]: 2.76
RMSE[BGR032]: 2.64
RMSE[BGR034]: 2.12
RMSE[BGR035]: 3.38
RMSE[BGR036]: 2.05
RMSE[BGR037]: 1.85
RMSE[BGR039]: 2.29
RMSE[BGR040]: 1.67
RMSE[BGR041]: 2.41
RMSE[BGR042]: 2.02
RMSE[BGR043]: 2.01
RMSE[BGR045]: 1.88
RMSE[BGR052]: 1.63
RMSE[BGR053]: 1.10
RMSE[BGR054]: 2.05
RMSE[BGR055]: 2.33
RMSE[BGR059]: 1.43
RMSE[BGR063]: 2.71
RMSE[BGR065]: 3.79
RMSE[BGR066]: 1.31
RMSE[BGR067]: 0.76
RMSE[BGR070]: 2.19
RMSE[BGR071]: 1.16
RMSE[BGR073]: 1.50
RMSE[BGR074]: 2.94
RMSE[BGR075]

In [None]:
#Mean of the per-customer RMSE scores ("totalValueMSE" holds the sum of the RMSE values)
totalValueMSE = sum(rmse[i] for i in dataBasedOnAllCustomerClean)

avgValue = totalValueMSE/len(dataBasedOnAllCustomerClean)
print(avgValue)

1.8884858757587966


##R-Squared Score

In [None]:
#R-Squared score of every per-customer model: actual QTYRET against the rounded prediction (PREDICTRET)
r2 = {}

for customer_id, frame in dataBasedOnAllCustomerClean.items():
  score = r2_score(y[customer_id], frame["PREDICTRET"])
  r2[customer_id] = score
  print("R2 Score[%s]: %0.2f" % (customer_id,score))

R2 Score[BGR001]: -0.34
R2 Score[BGR002]: -0.48
R2 Score[BGR003]: -0.45
R2 Score[BGR004]: -0.25
R2 Score[BGR005]: -0.49
R2 Score[BGR006]: -0.21
R2 Score[BGR007]: -0.58
R2 Score[BGR008]: -0.25
R2 Score[BGR009]: -0.36
R2 Score[BGR010]: -0.44
R2 Score[BGR011]: -0.31
R2 Score[BGR012]: -0.19
R2 Score[BGR014]: -0.29
R2 Score[BGR015]: -0.15
R2 Score[BGR016]: -0.47
R2 Score[BGR017]: -0.32
R2 Score[BGR018]: -0.80
R2 Score[BGR019]: -0.50
R2 Score[BGR020]: -0.44
R2 Score[BGR021]: -0.40
R2 Score[BGR022]: -0.40
R2 Score[BGR023]: -0.22
R2 Score[BGR026]: -0.70
R2 Score[BGR027]: -0.37
R2 Score[BGR028]: -0.66
R2 Score[BGR029]: -0.35
R2 Score[BGR030]: -1.37
R2 Score[BGR031]: -0.37
R2 Score[BGR032]: -0.31
R2 Score[BGR034]: -0.57
R2 Score[BGR035]: -0.83
R2 Score[BGR036]: -0.41
R2 Score[BGR037]: -0.25
R2 Score[BGR039]: -1.11
R2 Score[BGR040]: -1.76
R2 Score[BGR041]: -1.03
R2 Score[BGR042]: -0.33
R2 Score[BGR043]: -0.43
R2 Score[BGR045]: -0.45
R2 Score[BGR052]: -0.39
R2 Score[BGR053]: -0.10
R2 Score[BGR054]



R2 Score[DMG024]: -0.43
R2 Score[DMG025]: -0.43
R2 Score[DMG026]: -0.52
R2 Score[DMG027]: -0.35
R2 Score[DMG028]: -0.36
R2 Score[DMG029]: -0.30
R2 Score[DMG030]: -0.15
R2 Score[DMG031]: -0.40
R2 Score[DMG032]: -0.48
R2 Score[DMG033]: -0.42
R2 Score[DMG034]: -0.48
R2 Score[DMG035]: -0.14
R2 Score[DMG036]: -0.17
R2 Score[DMG037]: -0.44
R2 Score[DMG038]: -0.28
R2 Score[DMG040]: -0.60
R2 Score[DMG041]: -0.27
R2 Score[DMG044]: -0.42
R2 Score[DMG047]: -0.27
R2 Score[DMG048]: -0.78
R2 Score[DMG049]: -0.88
R2 Score[DMG051]: -0.12
R2 Score[DMG053]: -0.63
R2 Score[DMG054]: -2.00
R2 Score[DMG069]: -0.15
R2 Score[DMG075]: -0.20
R2 Score[DMG076]: -0.59
R2 Score[DMG077]: -0.57
R2 Score[DMG078]: -0.32
R2 Score[DMG079]: -0.42
R2 Score[DMG080]: -0.40
R2 Score[DMG081]: 0.00
R2 Score[DMG082]: -0.79
R2 Score[DMG083]: -0.32
R2 Score[DMG084]: -0.19
R2 Score[DMG085]: -0.78
R2 Score[DMG086]: -1.17
R2 Score[DMG087]: nan
R2 Score[DMG088]: -0.83
R2 Score[DMG089]: -0.47
R2 Score[DMG090]: -1.09
R2 Score[DMG091]: 0



R2 Score[SDB065]: -0.25
R2 Score[SDB142]: -3.39
R2 Score[SDB143]: -0.07
R2 Score[SDB144]: -0.41
R2 Score[SDB64]: nan
R2 Score[SMI001]: -0.22
R2 Score[SMI002]: -0.07
R2 Score[SMI003]: -0.06
R2 Score[SMI004]: -0.17
R2 Score[SMI005]: -0.20
R2 Score[SMI006]: -0.21
R2 Score[SMI007]: -0.97
R2 Score[SMI008]: -0.49
R2 Score[SMI009]: -0.18
R2 Score[SMI010]: -0.29
R2 Score[SMI011]: -0.40
R2 Score[SMI012]: -0.16
R2 Score[SMI013]: -0.63
R2 Score[SMI014]: -0.30
R2 Score[SMI015]: -0.63
R2 Score[SMI016]: -0.06
R2 Score[SMI017]: -0.25
R2 Score[SMI018]: -0.22
R2 Score[SMI019]: -0.08
R2 Score[SMI020]: -0.19
R2 Score[SMI021]: -0.12
R2 Score[SMI022]: -0.10
R2 Score[SMI023]: -0.05
R2 Score[SMI024]: -0.35
R2 Score[SMI025]: -0.21
R2 Score[SMI026]: -0.98
R2 Score[SMI027]: -0.13
R2 Score[SMI028]: -0.34
R2 Score[SMI029]: -0.66
R2 Score[SMI030]: -0.65
R2 Score[SMI031]: -0.27
R2 Score[SMI032]: -0.76
R2 Score[SMI033]: -0.17
R2 Score[SMI034]: 0.00
R2 Score[SMI035]: -1.54
R2 Score[SMI036]: -0.26
R2 Score[SMI037]: -0



R2 Score[SMI121]: -0.28
R2 Score[SMI122]: -0.65
R2 Score[SMI123]: -0.52
R2 Score[SMI124]: -0.37
R2 Score[SMI125]: -0.03
R2 Score[SMI126]: -0.33
R2 Score[SMI127]: -0.38
R2 Score[SMI128]: -0.18
R2 Score[SMI129]: -0.51
R2 Score[SMI130]: -0.34
R2 Score[SMI131]: -3.44
R2 Score[SMI132]: -0.14
R2 Score[SMI133]: -0.18
R2 Score[SMI134]: -0.13
R2 Score[SMI135]: -0.42
R2 Score[SMI136]: -0.02
R2 Score[SMI137]: -0.14
R2 Score[SMI138]: -10.43
R2 Score[SMI139]: -0.19
R2 Score[SMI140]: -0.28
R2 Score[SMI141]: -0.92
R2 Score[SMI142]: -1.70
R2 Score[SMI143]: -1.62
R2 Score[SMI144]: 0.04
R2 Score[SMI145]: -0.36
R2 Score[SMI146]: -0.16
R2 Score[SMI147]: -0.38
R2 Score[SMI148]: -0.47
R2 Score[SMI149]: -0.53
R2 Score[SMI150]: -0.56
R2 Score[SMI151]: -0.40
R2 Score[SMI152]: -0.42
R2 Score[SMI153]: -0.21
R2 Score[SMI154]: -0.65
R2 Score[SMI155]: -0.56
R2 Score[SMI156]: -0.33
R2 Score[SMI157]: -0.45
R2 Score[SMI158]: 1.00
R2 Score[SMI159]: -0.30
R2 Score[SMI160]: -0.27
R2 Score[SMI161]: -1.81
R2 Score[SMI162]:



In [None]:
#Averaging The R-Squared Score
#Some per-customer scores are NaN, and a single NaN turns the whole sum (and therefore the average) into NaN,
#so only the scores that are actual numbers are averaged
validR2 = [r2[i] for i in dataBasedOnAllCustomerClean if not isnan(r2[i])]
totalValueR2 = sum(validR2)

#With no valid score at all there is nothing to average; report NaN instead of dividing by zero
avgValue = totalValueR2/len(validR2) if validR2 else float("nan")
print("Average R-Squared Score:",avgValue)

Average R-Squared Score: nan


#Combining The Prediction Results With SALDCODE

In [None]:
#Attaching the SALDCODE of every row to the prediction results of the same customer (column-wise, keeping only the rows present in both frames)
combined_df = {
  i: pd.concat([frame, saldCodeBasedOnAllCustomerClean[i]], axis=1, join='inner')
  for i, frame in dataBasedOnAllCustomerClean.items()
}

#Stacking the per-customer dataframes into one dataframe with a fresh row index
combined_df_2 = combined_df
all_values_concat = pd.concat(combined_df_2.values(), ignore_index=True)

In [None]:
#Removing the input columns and keeping only the key and the two prediction columns, in export order
final_pred_df = all_values_concat
input_columns = ['TOTAL', 'QTY', 'QTYRET', 'YEAR', 'MONTH', 'DAY']
final_pred_df.drop(columns=input_columns, inplace=True)

export_columns = ['SALDCODE','PREDICTQTY','PREDICTRET']
final_pred_df_reordered = final_pred_df[export_columns]

In [None]:
#Writing the XGBoost prediction results to a CSV file (without the row index) and downloading it
output_file = 'sald_predict.csv'
final_pred_df_reordered.to_csv(output_file, index=False)
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>