In [None]:
#!pip install openpyxl
import pandas as pd
import openpyxl # to read xls
import urllib.request
from PIL import Image  # image processing library
import numpy as np # import numpy library
import re # import library for regular expression
import random # library for random number generation

from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

from sklearn.tree import export_graphviz

import itertools

In [None]:
pd.set_option('display.max_columns', None)

recipes = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0103EN-SkillsNetwork/labs/Module%202/recipes.csv")

print("Data read into dataframe!") # takes about 30 seconds

In [None]:
recipes.shape # (57691, 384) # each row is a recipe, each column is the ingredient (true/false whether included)

In [None]:
ingredients = list(recipes.columns.values)

In [None]:
#print all specific formats of words including a string

print([match.group(0) for ingredient in ingredients for match in [(re.compile(".*(lingonberry).*")).search(ingredient)] if match])
print([match.group(0) for ingredient in ingredients for match in [(re.compile(".*(rye).*")).search(ingredient)] if match])
print([match.group(0) for ingredient in ingredients for match in [(re.compile(".*(salmon).*")).search(ingredient)] if match])
print([match.group(0) for ingredient in ingredients for match in [(re.compile(".*(oat).*")).search(ingredient)] if match])

In [None]:
recipes["country"].value_counts() # frequency table
# some have only a few recipes, skewed towards American blergh

In [None]:
print(recipes["country"].unique()) # show actual terms

# names are duplicates and lowercase letters, variable name is not consistent with what we want

In [None]:
# fix column name

column_names = recipes.columns.values
column_names[0] = "cuisine"
recipes.columns = column_names

# make all names lowercase
recipes["cuisine"] = recipes["cuisine"].str.lower()

recipes

In [None]:
# make names consistent

recipes.loc[recipes["cuisine"] == "austria", "cuisine"] = "austrian"
recipes.loc[recipes["cuisine"] == "belgium", "cuisine"] = "belgian"
recipes.loc[recipes["cuisine"] == "china", "cuisine"] = "chinese"
recipes.loc[recipes["cuisine"] == "canada", "cuisine"] = "canadian"
recipes.loc[recipes["cuisine"] == "netherlands", "cuisine"] = "dutch"
recipes.loc[recipes["cuisine"] == "france", "cuisine"] = "french"
recipes.loc[recipes["cuisine"] == "germany", "cuisine"] = "german"
recipes.loc[recipes["cuisine"] == "india", "cuisine"] = "indian"
recipes.loc[recipes["cuisine"] == "indonesia", "cuisine"] = "indonesian"
recipes.loc[recipes["cuisine"] == "iran", "cuisine"] = "iranian"
recipes.loc[recipes["cuisine"] == "italy", "cuisine"] = "italian"
recipes.loc[recipes["cuisine"] == "japan", "cuisine"] = "japanese"
recipes.loc[recipes["cuisine"] == "israel", "cuisine"] = "israeli"
recipes.loc[recipes["cuisine"] == "korea", "cuisine"] = "korean"
recipes.loc[recipes["cuisine"] == "lebanon", "cuisine"] = "lebanese"
recipes.loc[recipes["cuisine"] == "malaysia", "cuisine"] = "malaysian"
recipes.loc[recipes["cuisine"] == "mexico", "cuisine"] = "mexican"
recipes.loc[recipes["cuisine"] == "pakistan", "cuisine"] = "pakistani"
recipes.loc[recipes["cuisine"] == "philippines", "cuisine"] = "philippine"
recipes.loc[recipes["cuisine"] == "scandinavia", "cuisine"] = "scandinavian"
recipes.loc[recipes["cuisine"] == "spain", "cuisine"] = "spanish_portuguese"
recipes.loc[recipes["cuisine"] == "portugal", "cuisine"] = "spanish_portuguese"
recipes.loc[recipes["cuisine"] == "switzerland", "cuisine"] = "swiss"
recipes.loc[recipes["cuisine"] == "thailand", "cuisine"] = "thai"
recipes.loc[recipes["cuisine"] == "turkey", "cuisine"] = "turkish"
recipes.loc[recipes["cuisine"] == "vietnam", "cuisine"] = "vietnamese"
recipes.loc[recipes["cuisine"] == "uk-and-ireland", "cuisine"] = "uk-and-irish"
recipes.loc[recipes["cuisine"] == "irish", "cuisine"] = "uk-and-irish"
recipes.loc[recipes["cuisine"] == "south-america", "cuisine"] = "south-american"
recipes.loc[recipes["cuisine"] == "eastern-europe", "cuisine"] = "eastern-european"


In [None]:
print(recipes["cuisine"].unique()) # show actual terms



In [None]:
recipes = recipes.replace(to_replace="Yes", value=1)
recipes = recipes.replace(to_replace="No", value=0)

In [None]:
# get list of cuisines to keep
recipes_counts = recipes["cuisine"].value_counts()
cuisines_indices = recipes_counts > 50
cuisines_to_keep = list(np.array(recipes_counts.index.values)[np.array(cuisines_indices)])
cuisines_to_keep = ["italian", "mexican", "french", "spanish-portuguese", "mediterranean", "scandinavian",\
                   "middleeastern", "central_southamerican", "greek", "caribbean", "cajun_creole",\
                   "moroccan", "african", "south-american", "north-african", "lebanese", "east-african", "west-african"]

print(cuisines_to_keep)

# original one keeps americans that are largest group. it omits only 400 rows.
# my selection of healthy cuisines omits almost 50 000 recipes.

# pick: italian, mexican, french, spanish-portuguese,
# jewish, mediterranean, scandinavian, 
# middleeastern, central-southamerican, greek, 
# caribbean, cajun-creole, easterneuropean_russian,
# moroccan, african, south-american, north-african, 

# omitted: 
# swiss, east-african, west-african, lebanese

In [None]:
# for healthy cuisines only, roughly 9 000 rows accepted

rows_before = recipes.shape[0] # number of rows of original dataframe
print("Number of rows of original dataframe is {}.".format(rows_before))

recipes = recipes.loc[recipes['cuisine'].isin(cuisines_to_keep)]

rows_after = recipes.shape[0] # number of rows of processed dataframe
print("Number of rows of processed dataframe is {}.".format(rows_after))

print("{} rows removed!".format(rows_before - rows_after))

In [None]:
recipes = recipes.replace(to_replace="Yes", value=1)
recipes = recipes.replace(to_replace="No", value=0)

In [None]:
recipes.head()

In [None]:
check_recipes = recipes.loc[
    (recipes["salmon"] == 1) |
    (recipes["rye_bread"] == 1) |
    (recipes["oat"] == 1) |
    (recipes["lingonberry"] == 1)
]

check_recipes.cuisine.unique() 
# 92 recipes have at least one of the Finnish healthy staples (salmon, rye_bread, oat, lingonberry)
# these cuisines are french, central_southamerican, scandinavian, african, 
# italian, mexican, lebanese, south-american, north-african

# notably not greek, cajun, moroccan, spanish, or caribbean

In [None]:
ing = recipes.iloc[:, 1:].sum(axis=0)
ing.sort_values(inplace=True)

#for col in recipes.columns:
#    print(col)
 
print(ing)

In [None]:
# define each column as a pandas series
ingredient = pd.Series(ing.index.values, index = np.arange(len(ing)))
count = pd.Series(list(ing), index = np.arange(len(ing)))

# create the dataframe
ing_df = pd.DataFrame(dict(ingredient = ingredient, count = count))
ing_df = ing_df[["ingredient", "count"]]
print(ing_df.to_string())

In [None]:
ing_df.sort_values(["count"], ascending=False, inplace=True)
ing_df.reset_index(inplace=True, drop=True)

print(ing_df.to_string())

In [None]:
# compute a profile for each category (cuisine)
cuisines = recipes.groupby("cuisine").mean()
cuisines.head()


In [None]:
num_ingredients = 4 # define number of top ingredients to print

# define a function that prints the top ingredients for each cuisine
def print_top_ingredients(row):
    print(row.name.upper())
    row_sorted = row.sort_values(ascending=False)*100
    top_ingredients = list(row_sorted.index.values)[0:num_ingredients]
    row_sorted = list(row_sorted)[0:num_ingredients]

    for ind, ingredient in enumerate(top_ingredients):
        print("%s (%d%%)" % (ingredient, row_sorted[ind]), end=' ')
    print("\n")

# apply function to cuisines dataframe
create_cuisines_profiles = cuisines.apply(print_top_ingredients, axis=1)

# WHAT HAPPENED TO SPANIS-PORTUGUESE?

In [None]:
# select subset of cuisines
#asian_indian_recipes = recipes[recipes.cuisine.isin(["korean", "japanese", "chinese", "thai", "indian"])]
#cuisines = asian_indian_recipes["cuisine"]
#ingredients = asian_indian_recipes.iloc[:,1:]
ingredients = recipes.iloc[:,1:]
cuisines = recipes["cuisine"]

mediterranean_tree = tree.DecisionTreeClassifier(max_depth=3)
mediterranean_tree.fit(ingredients, cuisines)

print("Decision tree model saved to mediterranean_tree!")

In [2]:
# Model evaluation
bamboo = recipes[recipes.cuisine.isin(["italian", "mexican", "french"])]

bamboo["cuisine"].value_counts() # how many recipes exist for each cuisine

# set sample size
sample_n = 30

# take 30 recipes from each cuisine
random.seed(1234) # set random seed
bamboo_test = bamboo.groupby("cuisine", group_keys=False).apply(lambda x: x.sample(sample_n))

bamboo_test_ingredients = bamboo_test.iloc[:,1:] # ingredients
bamboo_test_cuisines = bamboo_test["cuisine"] # corresponding cuisines or labels

# check that we have 30 recipes from each cuisine
bamboo_test["cuisine"].value_counts()

# classify - choose max depth as it affects false positive rates and confusion matrix
# how to plot roc curve to select optimal depth?
bamboo_train_tree = tree.DecisionTreeClassifier(max_depth=7)
bamboo_train_tree.fit(bamboo_train_ingredients, bamboo_train_cuisines)

print("Decision tree model saved to bamboo_train_tree!")

bamboo_test_index = bamboo.index.isin(bamboo_test.index)
bamboo_train = bamboo[~bamboo_test_index]

bamboo_train_ingredients = bamboo_train.iloc[:,1:] # ingredients
bamboo_train_cuisines = bamboo_train["cuisine"] # corresponding cuisines or labels

bamboo_pred_cuisines = bamboo_train_tree.predict(bamboo_test_ingredients)


# plot confusion matrix

test_cuisines = np.unique(bamboo_test_cuisines)
bamboo_confusion_matrix = confusion_matrix(bamboo_test_cuisines, bamboo_pred_cuisines, test_cuisines)
title = 'Bamboo Confusion Matrix'
cmap = plt.cm.Blues

plt.figure(figsize=(8, 6))
bamboo_confusion_matrix = (
    bamboo_confusion_matrix.astype('float') / bamboo_confusion_matrix.sum(axis=1)[:, np.newaxis]
    ) * 100

plt.imshow(bamboo_confusion_matrix, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(test_cuisines))
plt.xticks(tick_marks, test_cuisines)
plt.yticks(tick_marks, test_cuisines)

fmt = '.2f'
thresh = bamboo_confusion_matrix.max() / 2.
for i, j in itertools.product(range(bamboo_confusion_matrix.shape[0]), range(bamboo_confusion_matrix.shape[1])):
    plt.text(j, i, format(bamboo_confusion_matrix[i, j], fmt),
             horizontalalignment="center",
             color="white" if bamboo_confusion_matrix[i, j] > thresh else "black")

plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

plt.show()

NameError: name 'bamboo_train_tree' is not defined

In [None]:
# can't draw this without conda python-graphviz and can't install it for some reason
export_graphviz(bamboo_train_tree,
                feature_names=list(bamboo_train_ingredients.columns.values),
                out_file="bamboo_train_tree.dot",
                class_names=np.unique(bamboo_train_cuisines),
                filled=True,
                node_ids=True,
                special_characters=True,
                impurity=False,
                label="all",
                leaves_parallel=False)

with open("bamboo_train_tree.dot") as bamboo_train_tree_image:
    bamboo_train_tree_graph = bamboo_train_tree_image.read()
    !pip install pydot
    import pydot
    (graph,) = pydot.graph_from_dot_file("bamboo_train_tree.dot")
    graph.write_png('somefile.png')
graphviz.Source(bamboo_train_tree_graph)