# ML Mini-Project

1. Import modules
2. Load the data
3. Transform Data
4. Divide data into training/testing set
5. Create empty model
6. Fit/train the model
7. Evaluate the model

### Problem formulation: 
We want to be able to predict pizza type based on the ingredients it contains \
We may also want to be able to predict a pizza price based on ingredients and possibly some other metric

In [55]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Load Files

In [22]:
# Create path
DATA_PATH = f'{os.path.abspath("")}/Data files'

# Load data from files
data_dictionary = pd.read_csv(f'{DATA_PATH}/data_dictionary.csv')
order_details = pd.read_csv(f'{DATA_PATH}/pizza_sales/order_details.csv')
orders = pd.read_csv(f'{DATA_PATH}/pizza_sales/orders.csv')
pizza_types = pd.read_csv(f'{DATA_PATH}/pizza_sales/pizza_types.csv')
pizzas = pd.read_csv(f'{DATA_PATH}/pizza_sales/pizzas.csv')

### Transform DataFrames

Create a dataframe of each unique ingredient to be used for our classifications.

In [23]:
ingredient_list = []
for i in range(len(pizza_types["ingredients"])):
    for n in range(len(pizza_types["ingredients"][i].split(','))):
        if n == 0: continue
        ingredient_list.append(pizza_types["ingredients"][i].split(',')[n].strip())

ingredients = pd.DataFrame(ingredient_list, columns=['ingredient'])["ingredient"].unique()

In [24]:
pizza_ingredient = pizza_types
pizza_ingredient[ingredients] = 0

for i, ingredients in enumerate(pizza_ingredient['ingredients']):
    for ingredient in ingredients:
        if ingredient in pizza_ingredient.columns:
            pizza_ingredient.loc[i, ingredient] = 1


Create a DataFrame of pizza id's and all ingredients as columns to be used for learning. \
If a pizza contains one ingredient, we give it the number 1. if it does not contain an ingredient we assign it a 0.

### Transform
Transform the Data into a viable dataframe.

In [25]:
# Get total count of sold pizza per pizza_type_id, name, and category

# Copying wanted columns into a new DataFrame
pizza_sold_df = pizza_types[['pizza_type_id', 'name', 'category']].copy()

# Create a temporary DataFrame in order to remove sizes from pizza_id,
# and to count number of sold pizzas grouped by pizza_id.
# Adding counted solz pizzas to pizza_sold_df
sold_pizzas = order_details[['pizza_id', 'quantity']].copy()
sold_pizzas['pizza_id'] = sold_pizzas['pizza_id'].str.replace(r"(_s$)|(_m$)|(_l$)|(_xl$)|(_xxl$)", "", regex=True)
pizza_sold_df['quantity_sold'] = sold_pizzas.groupby(['pizza_id'])['quantity'].transform('count')

# Checking DataFrame and controlling contents
pizza_sold_df

Unnamed: 0,pizza_type_id,name,category,quantity_sold
0,bbq_ckn,The Barbecue Chicken Pizza,Chicken,2370
1,cali_ckn,The California Chicken Pizza,Chicken,2416
2,ckn_alfredo,The Chicken Alfredo Pizza,Chicken,1359
3,ckn_pesto,The Chicken Pesto Pizza,Chicken,1849
4,southw_ckn,The Southwest Chicken Pizza,Chicken,1456
5,thai_ckn,The Thai Chicken Pizza,Chicken,2315
6,big_meat,The Big Meat Pizza,Classic,1849
7,classic_dlx,The Classic Deluxe Pizza,Classic,1428
8,hawaiian,The Hawaiian Pizza,Classic,1849
9,ital_cpcllo,The Italian Capocollo Pizza,Classic,1849


In [64]:
# Creating a DataFrame holding the categories and their ingredients per pizza
cat_ingred_df = pizza_ingredient.drop(["ingredients"], axis=1)

new_ingred_df = pizza_ingredient.copy()
cat_ingred_df = pizza_ingredient.drop(["pizza_type_id"], axis=1)

# TODO Check this
# Alternative soloution 1
# cat = pd.Series(cat_ingred_df['category'].unique()).map(lambda x: "Other" if x == 1 else x)
# cat = pd.DataFrame(cat_ingred_df, columns=['category'])

# catdict = {cat["category"].values[i]:range(0,len(cat))[i] for i in range(len(cat["category"].values))}
# cat_ingred_df["category"] = cat_ingred_df["category"].map(lambda x: catdict[x])

# i = 0
# for key in catdict.keys():
#     catdict[key] = i
#     i +=1


# Alternative soloution 2

# cat_ingred_df.loc[cat_ingred_df["category"] == 'Chicken', 'category'] = 0
# cat_ingred_df.loc[cat_ingred_df["category"] == 'Classic', 'category'] = 1
# cat_ingred_df.loc[cat_ingred_df['category'] == 'Supreme', 'category'] = 2
# cat_ingred_df.loc[cat_ingred_df['category'] == 'Veggie', 'category'] = 3


# Alternative solution 3
cat_list = list()
for category in cat_ingred_df['category']:
    if category not in cat_list:
        cat_list.append(category)

for category in cat_list:
    cat_ingred_df=cat_ingred_df.replace(category, cat_list.index(category))

# Checking DataFrame and controlling contents
cat_ingred_df

Unnamed: 0,name,category,ingredients,Red Peppers,Green Peppers,Tomatoes,Red Onions,Barbecue Sauce,Artichoke,Spinach,...,Kalamata Olives,Provolone Cheese,Smoked Gouda Cheese,Romano Cheese,Blue Cheese,Gorgonzola Piccante Cheese,Parmigiano Reggiano Cheese,Zucchini,Sun-dried Tomatoes,Plum Tomatoes
0,The Barbecue Chicken Pizza,0,"Barbecued Chicken, Red Peppers, Green Peppers,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The California Chicken Pizza,0,"Chicken, Artichoke, Spinach, Garlic, Jalapeno ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Chicken Alfredo Pizza,0,"Chicken, Red Onions, Red Peppers, Mushrooms, A...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Chicken Pesto Pizza,0,"Chicken, Tomatoes, Red Peppers, Spinach, Garli...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Southwest Chicken Pizza,0,"Chicken, Tomatoes, Red Peppers, Red Onions, Ja...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,The Thai Chicken Pizza,0,"Chicken, Pineapple, Tomatoes, Red Peppers, Tha...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,The Big Meat Pizza,1,"Bacon, Pepperoni, Italian Sausage, Chorizo Sau...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,The Classic Deluxe Pizza,1,"Pepperoni, Mushrooms, Red Onions, Red Peppers,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,The Hawaiian Pizza,1,"Sliced Ham, Pineapple, Mozzarella Cheese",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,The Italian Capocollo Pizza,1,"Capocollo, Red Peppers, Tomatoes, Goat Cheese,...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
cat_ingred_df.columns

Index(['name', 'category', 'Red Peppers', 'Green Peppers', 'Tomatoes',
       'Red Onions', 'Barbecue Sauce', 'Artichoke', 'Spinach', 'Garlic',
       'Jalapeno Peppers', 'Fontina Cheese', 'Gouda Cheese', 'Mushrooms',
       'Asiago Cheese', 'Alfredo Sauce', 'Pesto Sauce', 'Corn', 'Cilantro',
       'Chipotle Sauce', 'Pineapple', 'Thai Sweet Chilli Sauce', 'Pepperoni',
       'Italian Sausage', 'Chorizo Sausage', 'Bacon', 'Mozzarella Cheese',
       'Goat Cheese', 'Oregano', 'Anchovies', 'Green Olives', 'Feta Cheese',
       'Beef Chuck Roast', 'Prosciutto', 'Caramelized Onions', 'Pears',
       'Thyme', 'Pancetta', 'Friggitello Peppers', 'Capocollo', 'Arugula',
       'Luganega Sausage', 'Onions', 'Artichokes', 'Peperoncini verdi',
       'Kalamata Olives', 'Provolone Cheese', 'Smoked Gouda Cheese',
       'Romano Cheese', 'Blue Cheese', 'Gorgonzola Piccante Cheese',
       'Parmigiano Reggiano Cheese', 'Zucchini', 'Sun-dried Tomatoes',
       'Plum Tomatoes'],
      dtype='object')

#### 4. Divide data into training/testing set

In [56]:
X = cat_ingred_df.drop('name', axis=1) # Skriver in alla rader ifrån instruktionen tidigare i markdown
y = cat_ingred_df["category"]
# TODO Kolla in här
#y = X.pop("name", "category") # Marcus lade till name /Drygt

X_train, X_test, y_train, y_test = train_test_split(X,y)
y_test

6     1
18    2
5     0
0     0
25    3
17    2
13    1
8     1
Name: category, dtype: int64

#### 5. Create empty model

In [119]:
# pizza_model = SVC()

# pizza_model.fit(X_train, y_train)

# y_pred = pizza_model.predict(X_test)

# accuracy = pizza_model.score(X_test, y_test)

# print(y_pred)
# print(accuracy)


[1 2 0 0 3 2 1 1]
Accuracy: 100.00%


#### 6. Fit/train the model

Tables I think we want for classification on label category: \
1. pizza_category - labels for our id's predicted by classification \
2. pizza_ingredients_categories(name?) - columns : pizza-id, pizza category, *features(one column for each ingredient, maybe more to predict on) \
3. pizza id by count(sold_pizza_id)
   1. helen 2. Andreas 3. Marcus


#### 7. Evaluate the model

Transform_data takes in a list of items and translates them into a dataframe in the form of 

In [121]:
def transform_data(ingredient_list):
    df = pizza_ingredient.copy()
    df = df.drop(["pizza_type_id","name","category"], axis=1)
    df = df[df[df.columns[0]] == -12412]
    df.loc[len(df)] = [0] * len(df.columns)
    
    for item in df.columns:
        df[item] = df[item].map(lambda x: 1 if item in ingredient_list else x)  
    return df
transform_data(['Tomatoes', 'Red Peppers', 'Green Peppers', 'Red Onions', 'Artichoke'])

testdict = {"Veggie": ['Tomatoes', 'Red Peppers', 'Green Peppers', 'Red Onions', 'Artichoke'],
            "Classic": ['Tomatoes', 'Italian Sausage', 'Mozzarella Cheese'],
            "Supreme": ['Goat Cheese','Beef Chuck Roast', 'Sun-dried Tomatoes', 'Jalapeno Peppers']}
            
for key,value in testdict.items():
    print(key, ': ', transform_data(value).values)

testdict["Veggie"]= cat_list[pizza_model.predict(transform_data(testdict["Veggie"]))[0]]
testdict["Classic"] = cat_list[pizza_model.predict(transform_data(testdict["Classic"]))[0]]
testdict["Supreme"] = cat_list[pizza_model.predict(transform_data(testdict["Supreme"]))[0]]

print(testdict)

Veggie :  [[0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Classic :  [[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Supreme :  [[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]
{'Veggie': 'Chicken', 'Classic': 'Chicken', 'Supreme': 'Chicken'}


Feature names unseen at fit time:
- ingredients
Feature names seen at fit time, yet now missing:
- category

Feature names unseen at fit time:
- ingredients
Feature names seen at fit time, yet now missing:
- category

Feature names unseen at fit time:
- ingredients
Feature names seen at fit time, yet now missing:
- category



In [145]:

def transform_data(ingredient_list):
    df = pizza_ingredient.copy()
    df = df.drop(["pizza_type_id","name","category"], axis=1)
    df = df[df[df.columns[0]] == -12412]
    df.loc[len(df)] = [0] * len(df.columns)
    
    for item in df.columns:
        df[item] = df[item].map(lambda x: 1 if item in ingredient_list else x)  
    return df

def evaluate_model(ingredient_list):
    X = transform_data(ingredient_list).values
    y_pred= pizza_model.predict(X)
    return cat_list[y_pred[0]]
    
transform_data(['Tomatoes', 'Red Peppers', 'Green Peppers', 'Red Onions', 'Artichoke'])

testdict = {"Veggie": ['Tomatoes', 'Red Peppers', 'Green Peppers', 'Red Onions', 'Artichoke'],
            "Classic": ['Tomatoes', 'Italian Sausage', 'Mozzarella Cheese'],
            "Supreme": ['Goat Cheese','Beef Chuck Roast', 'Sun-dried Tomatoes', 'Jalapeno Peppers']}
            
for key,value in testdict.items():
    prediction = evaluate_model(value)
    print("{}: {}".format(key,prediction))

testdict["Veggie"]= cat_list[pizza_model.predict(transform_data(testdict["Veggie"]))[0]]
testdict["Classic"] = cat_list[pizza_model.predict(transform_data(testdict["Classic"]))[0]]
testdict["Supreme"] = cat_list[pizza_model.predict(transform_data(testdict["Supreme"]))[0]]

print(testdict)

Veggie: Chicken
Classic: Chicken
Supreme: Chicken
{'Veggie': 'Chicken', 'Classic': 'Chicken', 'Supreme': 'Chicken'}


Feature names unseen at fit time:
- ingredients
Feature names seen at fit time, yet now missing:
- category

Feature names unseen at fit time:
- ingredients
Feature names seen at fit time, yet now missing:
- category

Feature names unseen at fit time:
- ingredients
Feature names seen at fit time, yet now missing:
- category



Score and graph the model

# Linear Regression
### Problem formulation
Predict how many orders you'll get based on ingredients?
### 2. Load Data

In [65]:
ingredient_data = new_ingred_df.copy()
ingredient_data
order_details

Unnamed: 0,order_details_id,order_id,pizza_id,quantity
0,1,1,hawaiian_m,1
1,2,2,classic_dlx_m,1
2,3,2,five_cheese_l,1
3,4,2,ital_supr_l,1
4,5,2,mexicana_m,1
...,...,...,...,...
48615,48616,21348,ckn_alfredo_m,1
48616,48617,21348,four_cheese_l,1
48617,48618,21348,napolitana_s,1
48618,48619,21349,mexicana_l,1


### 3. Transform
We need a dataframe in the form pizza_id, quantity and ingredients

## 

In [144]:
#print(order_details["pizza_id"])
#print(ingredient_data["pizza_type_id"])
# Set all to medium
order = order_details.copy()
order = order[order["pizza_id"].str.endswith('_m')]

order["pizza_id"] = order['pizza_id'].str.replace(r'(_m$)', "", regex=True) 

mydf = ingredient_data[ingredient_data.columns.difference(['name', 'category', 'ingredients'])]

# extracted_col1 = order['pizza_id']
# extracted_col2 = order['quantity']
# excol1 = order[["pizza_id", "quantity"]]
mmdf = mydf.join(excol1["pizza_id"])

# mydf = mydf.join(extracted_col1)
# mydf = mydf.join(extracted_col2)
# mydf['pizza_id'], mydf['quantity'] = order['pizza_id'], order['quantity']
tomatoes = 0 
# print(mydf)
mydf

Unnamed: 0,Alfredo Sauce,Anchovies,Artichoke,Artichokes,Arugula,Asiago Cheese,Bacon,Barbecue Sauce,Beef Chuck Roast,Blue Cheese,...,Red Peppers,Romano Cheese,Smoked Gouda Cheese,Spinach,Sun-dried Tomatoes,Thai Sweet Chilli Sauce,Thyme,Tomatoes,Zucchini,pizza_type_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,bbq_ckn
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,cali_ckn
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ckn_alfredo
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ckn_pesto
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,southw_ckn
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,thai_ckn
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,big_meat
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,classic_dlx
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,hawaiian
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ital_cpcllo


In [None]:
def transform_data(df):
    df = df.drop(["pizza_type_id","name","category"], axis=1)
    df = df[df[df.columns[0]] == -12412]
    df.loc[len(df)] = [0] * len(df.columns)
    
    for item in df.columns:
        df[item] = df[item].map(lambda x: 1 if item in ingredient_list else x)  
    return df

def evaluate_model(ingredient_list):
    X = transform_data(ingredient_list).values
    y_pred= pizza_model.predict(X)
    return cat_list[y_pred[0]]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg= LogisticRegression()
logreg.fit(X_train, y_train)

y_pred= logreg.predict(X_test)

accuracy= accuracy_score(y_test, y_pred)
print(y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))