In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score
import pandas as pd
import sqlite3
from pathlib import Path


In [2]:
# Location of the SQLite database, relative to the notebook's working directory.
dbPath = Path("../data/data.sqlite")
if not dbPath.exists():
    # sqlite3.connect() creates an empty database when the file is missing, so
    # fail early instead, with a specific exception type that names the path
    # that was actually checked (FileNotFoundError is still an Exception).
    raise FileNotFoundError(f"Database file does not exist: {dbPath.resolve()}")
# Connection used by the pd.read_sql() call that loads the data.
con = sqlite3.connect(dbPath)

In [3]:
# Lookup lists used to turn text labels into integer codes: a label's code is
# its position in the list, so the order of the entries must not be changed.
customerTypeMap = [
    "Student",
    "Teenager",
    "Adult",
    "Senior",
]
customerLocationMap = [
    "Munich District One",
    "Munich District Three",
    "Munich District Two",
    "Munich District Four",
    "Munich District Five",
]
distributionChannelMap = [
    "Feedera SE",
    "Deliver Now Holding",
    "Deliveruu Inc.",
    "Orderly SE",
    "BestOrder Inc.",
    "TownExpress Inc.",
    "Heropizza Lmtd.",
]
weekdayMap = [
    "Sunday",
    "Tuesday",
    "Friday",
    "Saturday",
    "Wednesday",
    "Monday",
    "Thursday",
]
# The first entry is the empty string, which therefore gets code 0.
costfactorMap = [
    "",
    "Chef 2",
    "Chef 1",
    "Ingredients",
    "Delivery Scooters",
    "Phone Bill",
    "Delivery Guy 2",
    "Distribution channel fees",
    "Waiter",
    "Delivery Guy 1",
]
sizeMap = [
    "Medium",
    "Small",
    "Large",
]
# Spellings are kept exactly as written: they are matched against the data.
typeMap = [
    "Funghi",
    "Salami",
    "Calzone",
    "Speciale",
    "Magherita",
    "Paprika",
    "Veggie",
]

In [4]:
# Read the whole Pizza_Case table and discard the two identifier columns.
df = pd.read_sql("SELECT * FROM Pizza_Case", con).drop(
    columns=["_CASE_KEY", "Customer_ID"]
)

# Text column -> lookup list. Every label is replaced by its position in the
# list; list.index raises ValueError for a value that is not in the list.
label_lookups = {
    "CustomerType": customerTypeMap,
    "CustomerLocation": customerLocationMap,
    "DistributionChannel": distributionChannelMap,
    "Weekday": weekdayMap,
    "CostFactor": costfactorMap,
    "PizzaSize": sizeMap,
    "PizzaType": typeMap,
}
for column_name, labels in label_lookups.items():
    df[column_name] = df[column_name].map(labels.index)

# Derived column, appended after the existing ones.
df["Profit"] = df["Revenue"] - df["Costs"]

df

Unnamed: 0,Variant,Revenue,Costs,PizzaType,PizzaSize,CustomerSatisfaction,CostFactor,Weekday,Daytime,DistributionChannel,CustomerLocation,CustomerType,Profit
0,1,17,13,0,0,3,1,0,14,0,0,0,4
1,1,19,10,1,1,3,1,0,13,1,0,0,9
2,1,9,16,2,2,0,2,0,12,0,0,1,-7
3,1,36,56,2,2,1,2,0,18,2,0,2,-20
4,1,5,35,1,1,5,3,0,14,0,0,0,-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993,9,11,2,3,1,5,2,3,18,4,4,0,9
1994,9,54,23,0,0,3,4,2,11,0,0,0,31
1995,9,29,2,4,0,4,9,2,18,6,0,1,27
1996,9,12,12,0,2,3,2,1,21,2,1,0,0


In [5]:
# Extract the features and target variable
# X = df.drop(['CustomerSatisfaction'], axis=1)
# y = df['CustomerSatisfaction']

def algo(df, column: str, test_size: float = 0.2, random_state: int = 42):
    """Fit a random forest that predicts ``column`` from every other column and print a report.

    The frame is split into train and test parts, a ``RandomForestClassifier``
    is fitted on the training part, and accuracy, macro-averaged recall and
    macro-averaged precision on the test part are printed, followed by one
    ``feature importance`` line per feature column.

    Because a classifier is used, every distinct value of ``column`` is
    treated as its own class label.

    Args:
        df: DataFrame holding the target column and the feature columns.
            It is not modified.
        column: Name of the target column; all remaining columns are features.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed used for both the split and the forest.

    Returns:
        None. The report is written to stdout.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    # drop() returns a new frame, so the caller's DataFrame stays untouched
    # without an explicit copy.
    X = df.drop(columns=[column])
    y = df[column]

    # Hold out part of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Original author's note: RandomForest was chosen because it reached
    # 0.3324 accuracy versus 0.24 for a single DecisionTree.
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X_train, y_train)

    # Predict once and reuse the predictions for every metric.
    y_pred = clf.predict(X_test)

    # Same value as clf.score(X_test, y_test), without predicting a second time.
    print('Accuracy:', accuracy_score(y_test, y_pred))

    # zero_division=0 keeps the score of 0 that classes with no predicted or no
    # true samples already received, but without the per-call warning.
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    print('Recall:', recall)

    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    print('Precision:', precision)

    print()
    # One line per feature, in column order, with its importance in the forest.
    for feature, importance in zip(X.columns, clf.feature_importances_):
        print(feature, importance)

In [6]:
# Predict the CustomerSatisfaction column from every other column of df.
algo(df, "CustomerSatisfaction")

Accuracy: 0.305
Recall: 0.27068721605136875
Precision: 0.283407027954357

Variant 0.12425862102471312
Revenue 0.1068530814437064
Costs 0.10579378151707804
PizzaType 0.07374968592939127
PizzaSize 0.04512588559670452
CostFactor 0.08737345023864707
Weekday 0.06721061621961731
Daytime 0.08984793228786286
DistributionChannel 0.07170378214236414
CustomerLocation 0.061901529945659374
CustomerType 0.04394521124230545
Profit 0.12223642241195043


In [7]:
# Predict the Profit column from every other column of df.
# NOTE(review): Profit is computed as Revenue - Costs when df is built, and both
# of those columns remain among the features; algo() also fits a classifier, so
# each distinct Profit value is treated as a class label. Confirm this is intended.
algo(df, "Profit")

Accuracy: 0.285
Recall: 0.17076364539599834
Precision: 0.16010415238356415

Variant 0.07368105449479162
Revenue 0.16465608062672618
Costs 0.16008098349218833
PizzaType 0.07367447881038212
PizzaSize 0.04582983999920402
CustomerSatisfaction 0.07288620117092315
CostFactor 0.08209425764921381
Weekday 0.06695901242673115
Daytime 0.08737969776256066
DistributionChannel 0.06717243779690436
CustomerLocation 0.05876778722176029
CustomerType 0.04681816854861429


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
