In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score
import pandas as pd
import sqlite3
from pathlib import Path


In [34]:
dbPath = Path("../../data/data.sqlite")
if not dbPath.exists():
    # Avoid creating an empty database
    raise Exception("Database file does not exist")
con = sqlite3.connect(dbPath)

In [35]:
customerTypeMap = ["Student", "Teenager", "Adult", "Senior"]
customerLocationMap = ["Munich District One","Munich District Three","Munich District Two","Munich District Four","Munich District Five"]
distributionChannelMap =  ["Feedera SE", "Deliver Now Holding", "Deliveruu Inc.", "Orderly SE", "BestOrder Inc.", "TownExpress Inc.", "Heropizza Lmtd."]
weekdayMap = ["Sunday", "Tuesday", "Friday", "Saturday", "Wednesday", "Monday", "Thursday"]
costfactorMap = ["","Chef 2","Chef 1","Ingredients","Delivery Scooters","Phone Bill","Delivery Guy 2","Distribution channel fees","Waiter","Delivery Guy 1"]
sizeMap = ["Medium", "Small", "Large"]
typeMap = ["Funghi", "Salami", "Calzone", "Speciale", "Magherita", "Paprika", "Veggie"]

In [36]:
df = pd.read_sql("SELECT * FROM Pizza_Case", con)
df.drop(["_CASE_KEY"], axis=1, inplace=True)
df.drop("Customer_ID", axis=1, inplace=True)

df["CustomerType"] = df["CustomerType"].map(customerTypeMap.index)
df["CustomerLocation"] = df["CustomerLocation"].map(customerLocationMap.index)
df["DistributionChannel"] = df["DistributionChannel"].map(distributionChannelMap.index)
df["Weekday"] = df["Weekday"].map(weekdayMap.index)
df["CostFactor"] = df["CostFactor"].map(costfactorMap.index)
df["PizzaSize"] = df["PizzaSize"].map(sizeMap.index)
df["PizzaType"] = df["PizzaType"].map(typeMap.index)

df["Profit"] = df["Revenue"] - df["Costs"]
df["IsOrderProfitable"] = df["Profit"] > 0

df

Unnamed: 0,Variant,Revenue,Costs,PizzaType,PizzaSize,CustomerSatisfaction,CostFactor,Weekday,Daytime,DistributionChannel,CustomerLocation,CustomerType,Profit,IsOrderProfitable
0,1,17,13,0,0,3,1,0,14,0,0,0,4,True
1,1,19,10,1,1,3,1,0,13,1,0,0,9,True
2,1,9,16,2,2,0,2,0,12,0,0,1,-7,False
3,1,36,56,2,2,1,2,0,18,2,0,2,-20,False
4,1,5,35,1,1,5,3,0,14,0,0,0,-30,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993,9,11,2,3,1,5,2,3,18,4,4,0,9,True
1994,9,54,23,0,0,3,4,2,11,0,0,0,31,True
1995,9,29,2,4,0,4,9,2,18,6,0,1,27,True
1996,9,12,12,0,2,3,2,1,21,2,1,0,0,False


In [37]:
# Extract the features and target variable
# X = df.drop(['CustomerSatisfaction'], axis=1)
# y = df['CustomerSatisfaction']

def algo(df, column: str):
    df = df.copy()
    X = df.drop([column], axis=1)
    y = df[column]


    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a decision tree classifier on the training data
    clf = RandomForestClassifier(random_state=42) # Going with RandomForest because it is 0.3324 accurate (DecisionTree is 0.24)
    clf.fit(X_train, y_train)

    # Evaluate the performance of the classifier on the testing data
    score = clf.score(X_test, y_test)
    print('Accuracy:', score)
    # recall

    y_pred = clf.predict(X_test)
    recall = recall_score(y_test, y_pred, average='macro')
    print('Recall:', recall)

    # precision
    precision = precision_score(y_test, y_pred, average='macro')
    print('Precision:', precision)


    print()
    # Print the feature importances
    importances = clf.feature_importances_
    for feature, importance in zip(X.columns, importances):
        print(feature, importance)

In [38]:
algo(df, "CustomerSatisfaction")

Accuracy: 0.3075
Recall: 0.26907337343822396
Precision: 0.28381172544775485

Variant 0.12463775903952344
Revenue 0.10418939238418379
Costs 0.10476863316796722
PizzaType 0.07361695290708607
PizzaSize 0.045288544254648046
CostFactor 0.08549576218558577
Weekday 0.06681451328804555
Daytime 0.08940433838302192
DistributionChannel 0.06974924393145625
CustomerLocation 0.059319403006984246
CustomerType 0.04309598709722659
Profit 0.11938490567131797
IsOrderProfitable 0.014234564682953088


In [39]:
algo(df, "Profit")

Accuracy: 0.2725
Recall: 0.16836229146835205
Precision: 0.14393574999635605

Variant 0.07440609393997669
Revenue 0.1589356879733818
Costs 0.14995239997285253
PizzaType 0.07383449882551527
PizzaSize 0.04526921220919884
CustomerSatisfaction 0.0694662724920857
CostFactor 0.08040530863383251
Weekday 0.0660370748134911
Daytime 0.08309653443012968
DistributionChannel 0.06933867117728995
CustomerLocation 0.058126524375202736
CustomerType 0.0486386749374643
IsOrderProfitable 0.022493046219578917


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
# Drop Revenue and Costs because that would train the model to give us a non useful result
dfPredictPrfitablity = df.copy()
dfPredictPrfitablity.drop("Profit", axis=1, inplace=True)
dfPredictPrfitablity.drop("Costs", axis=1, inplace=True)
dfPredictPrfitablity.drop("Revenue", axis=1, inplace=True)
algo(dfPredictPrfitablity, "IsOrderProfitable")

Accuracy: 0.6125
Recall: 0.5018530810930413
Precision: 0.503426934683648

Variant 0.10964711330293164
PizzaType 0.11006925569200485
PizzaSize 0.06653668855193738
CustomerSatisfaction 0.10537396507159086
CostFactor 0.12261791509582229
Weekday 0.09157066543914537
Daytime 0.14198598665654294
DistributionChannel 0.10484428703129169
CustomerLocation 0.07989253609355826
CustomerType 0.06746158706517469


In [42]:
dfPredictCustomerSatisfaction = df.copy()
dfPredictCustomerSatisfaction["IsCustomerSatisfied"] = dfPredictCustomerSatisfaction["CustomerSatisfaction"] >= 3
algo(dfPredictCustomerSatisfaction, "CustomerSatisfaction")

Accuracy: 0.4725
Recall: 0.3911085530804909
Precision: 0.4038843709896341

Variant 0.08878812479572364
Revenue 0.08655495721672427
Costs 0.08851111319320719
PizzaType 0.0582368223096271
PizzaSize 0.036373951673038155
CostFactor 0.06879676544357766
Weekday 0.056322349253149065
Daytime 0.07258665055183512
DistributionChannel 0.056921635244741135
CustomerLocation 0.047504444820978944
CustomerType 0.03529554849284739
Profit 0.10307192410636488
IsOrderProfitable 0.01169209774077104
IsCustomerSatisfied 0.1893436151574145
