In [30]:
'''
Finds association rules between ingredients for a single cuisine
(currently 'italian') and displays them ranked by confidence, highest first.
'''
import numpy as np
import pandas as pd
#market basket analysis
#https://pbpython.com/market-basket-analysis.html
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

Data = pd.read_json('train.json')

cuisine = 'italian'
# Recipes of the selected cuisine only, in their original order.
data = Data[Data['cuisine'] == cuisine]

# Sorted list of the distinct ingredients used by the selected recipes.
ingredientList = sorted({ingredient
                         for ingredients in data['ingredients']
                         for ingredient in ingredients})
# the mapping between ingredient and its (column) index
ingredient2index = {ingredient: index for index, ingredient in enumerate(ingredientList)}

# Binary matrix indicating whether or not an ingredient is in a recipe:
# one row per selected recipe (same order as `data`), one column per ingredient.
# A bool matrix is used instead of float64: apriori only needs True/False
# membership, and bool takes 1/8 of the memory of a float64 matrix.
binaryIngredientsMat = np.zeros((data.shape[0], len(ingredientList)), dtype=bool)
for row, ingredients in enumerate(data['ingredients']):
    columns = [ingredient2index[ingredient] for ingredient in ingredients]
    binaryIngredientsMat[row, columns] = True
dataBinaryIngredients = pd.DataFrame(binaryIngredientsMat, columns=ingredientList)
print(dataBinaryIngredients.shape)

# Itemsets present in at least 0.5% of the selected recipes, then the rules
# derived from them whose lift is at least 1.
frequent_itemsets = apriori(dataBinaryIngredients, min_support = 0.005, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print("shape of rules from training", rules.shape)

#display rules
print("rules from training based on confidence, cuisine: " + cuisine)
rules.sort_values('confidence', ascending=False)

(7838, 2929)
shape of rules from training (10572, 9)
rules from training based on confidence, cuisine: italian


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8240,"(onions, carrots, pepper)",(salt),0.005869,0.440674,0.005869,1.000000,2.269253,0.003283,inf
3334,"(carrots, pepper)",(salt),0.007783,0.440674,0.007655,0.983607,2.232052,0.004225,34.118908
6732,"(ground beef, pepper)",(salt),0.007655,0.440674,0.007527,0.983333,2.231432,0.004154,33.559582
8590,"(dried oregano, dried basil, pepper)",(salt),0.007400,0.440674,0.007272,0.982759,2.230128,0.004011,32.440929
7516,"(baking powder, white sugar, eggs)",(all-purpose flour),0.006379,0.117122,0.006252,0.980000,8.367364,0.005504,44.143914
8884,"(dried oregano, onions, pepper)",(salt),0.006252,0.440674,0.006124,0.979592,2.222942,0.003369,27.406992
7398,"(sugar, pepper)",(salt),0.006124,0.440674,0.005996,0.979167,2.221977,0.003298,26.847665
8128,"(butter, onions, pepper)",(salt),0.005486,0.440674,0.005359,0.976744,2.216480,0.002941,24.051033
6898,"(heavy cream, pepper)",(salt),0.005359,0.440674,0.005231,0.976190,2.215223,0.002870,23.491707
8814,"(dried oregano, grated parmesan cheese, pepper)",(salt),0.005231,0.440674,0.005103,0.975610,2.213905,0.002798,22.932381


In [31]:
# Re-rank the rules from the previous cell by confidence (highest first) and
# drop every rule whose consequent is exactly the single item 'salt'.
salt_only = frozenset({'salt'})
df = rules.sort_values('confidence').iloc[::-1]
consequent_is_not_salt = df['consequents'] != salt_only
df[consequent_is_not_salt]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
7516,"(baking powder, white sugar, eggs)",(all-purpose flour),0.006379,0.117122,0.006252,0.980000,8.367364,0.005504,44.143914
7573,"(baking powder, vanilla extract, salt)",(all-purpose flour),0.005614,0.117122,0.005359,0.954545,8.150030,0.004701,19.423322
2138,"(baking powder, white sugar)",(all-purpose flour),0.007017,0.117122,0.006634,0.945455,8.072410,0.005812,16.186102
7517,"(baking powder, white sugar, all-purpose flour)",(eggs),0.006634,0.079995,0.006252,0.942308,11.779598,0.005721,15.946755
2551,"(white sugar, baking powder)",(eggs),0.007017,0.079995,0.006379,0.909091,11.364361,0.005818,10.120056
2132,"(baking powder, vanilla extract)",(all-purpose flour),0.009696,0.117122,0.008803,0.907895,7.751720,0.007668,9.585536
10333,"(large eggs, sugar, baking powder, salt)",(all-purpose flour),0.006252,0.117122,0.005614,0.897959,7.666889,0.004881,8.652207
7734,"(large eggs, vanilla extract, all-purpose flour)",(sugar),0.005996,0.096964,0.005359,0.893617,9.216013,0.004777,8.488543
7520,"(baking powder, white sugar)","(eggs, all-purpose flour)",0.007017,0.021434,0.006252,0.890909,41.565152,0.006101,8.970188
7544,"(large eggs, sugar, baking powder)",(all-purpose flour),0.008548,0.117122,0.007527,0.880597,7.518649,0.006526,7.394106


In [9]:
'''
Finds association rules between ingredients using the recipes of all cuisines
and displays them ranked by confidence, highest first.
'''

import numpy as np
import pandas as pd
#market basket analysis
#https://pbpython.com/market-basket-analysis.html
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

data = pd.read_json('train.json')

# Sorted list of the distinct ingredients used across all recipes.
ingredientList = sorted({ingredient
                         for ingredients in data['ingredients']
                         for ingredient in ingredients})
# the mapping between ingredient and its (column) index
ingredient2index = {ingredient: index for index, ingredient in enumerate(ingredientList)}

# Binary matrix indicating whether or not an ingredient is in a recipe:
# one row per recipe (same order as `data`), one column per ingredient.
# A bool matrix is used instead of float64: apriori only needs True/False
# membership, and bool takes 1/8 of the memory of a float64 matrix.
binaryIngredientsMat = np.zeros((data.shape[0], len(ingredientList)), dtype=bool)
for row, ingredients in enumerate(data['ingredients']):
    columns = [ingredient2index[ingredient] for ingredient in ingredients]
    binaryIngredientsMat[row, columns] = True
dataBinaryIngredients = pd.DataFrame(binaryIngredientsMat, columns=ingredientList)

# Itemsets present in at least 1% of all recipes, then the rules derived from
# them whose lift is at least 1.
frequent_itemsets = apriori(dataBinaryIngredients, min_support = 0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print("shape of rules from training", rules.shape)
#display rules
print("rules from training based on confidence, allcuisines")
rules.sort_values('confidence', ascending=False)

shape of rules from training (1200, 9)
rules from training based on confidence, allcuisines


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
798,"(eggs, pepper)",(salt),0.013225,0.453764,0.012320,0.931559,2.052960,0.006319,7.981118
740,"(butter, pepper)",(salt),0.017926,0.453764,0.016594,0.925666,2.039974,0.008459,7.348423
1144,"(olive oil, garlic, pepper)",(salt),0.011691,0.453764,0.010685,0.913978,2.014217,0.005380,6.349996
1186,"(onions, pepper, olive oil)",(salt),0.011364,0.453764,0.010359,0.911504,2.008764,0.005202,6.172469
1158,"(onions, garlic, pepper)",(salt),0.015513,0.453764,0.014105,0.909238,2.003770,0.007066,6.018353
1050,"(onions, pepper)",(salt),0.037411,0.453764,0.033741,0.901882,1.987558,0.016765,5.567120
1082,"(pepper, tomatoes)",(salt),0.012496,0.453764,0.011213,0.897384,1.977646,0.005543,5.323126
882,"(garlic, pepper)",(salt),0.032886,0.453764,0.029416,0.894495,1.971280,0.014494,5.177370
1086,"(water, pepper)",(salt),0.021346,0.453764,0.018957,0.888104,1.957194,0.009271,4.881627
1036,"(olive oil, pepper)",(salt),0.033037,0.453764,0.029265,0.885845,1.952216,0.014275,4.785029


In [29]:
# Re-rank the rules from the previous cell by confidence (highest first) and
# drop every rule whose consequent is exactly the single item 'salt'.
salt_only = frozenset({'salt'})
df = rules.sort_values('confidence').iloc[::-1]
consequent_is_not_salt = df['consequents'] != salt_only
df[consequent_is_not_salt]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
913,"(sesame oil, garlic)",(soy sauce),0.016116,0.082868,0.011389,0.706708,8.528099,0.010054,3.127029
1117,"(baking powder, sugar, salt)",(all-purpose flour),0.014457,0.116458,0.010007,0.692174,5.943550,0.008323,2.870264
1110,"(sesame oil, sugar)",(soy sauce),0.014633,0.082868,0.010107,0.690722,8.335183,0.008895,2.965393
573,"(baking powder, salt)",(all-purpose flour),0.032333,0.116458,0.021924,0.678072,5.822456,0.018158,2.744529
399,(ground coriander),(ground cumin),0.019309,0.069065,0.013074,0.677083,9.803536,0.011740,2.882895
585,"(baking soda, salt)",(all-purpose flour),0.017147,0.116458,0.011440,0.667155,5.728722,0.009443,2.654518
0,(baking powder),(all-purpose flour),0.043697,0.116458,0.028939,0.662255,5.686647,0.023850,2.616007
578,"(baking powder, sugar)",(all-purpose flour),0.018303,0.116458,0.011942,0.652473,5.602643,0.009811,2.542366
548,(sesame oil),(soy sauce),0.044577,0.082868,0.028209,0.632826,7.636532,0.024515,2.497811
2,(baking soda),(all-purpose flour),0.023684,0.116458,0.014909,0.629512,5.405483,0.012151,2.384804


In [32]:
'''
Finds association rules between ingredients AND also between ingredients and
the type of cuisine, and displays them ranked by confidence, highest first.
'''

import numpy as np
import pandas as pd
#market basket analysis
#https://pbpython.com/market-basket-analysis.html
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

data = pd.read_json('train.json')

# Sorted list of the distinct ingredients used across all recipes.
ingredientList = sorted({ingredient
                         for ingredients in data['ingredients']
                         for ingredient in ingredients})
# the mapping between ingredient and its (column) index
ingredient2index = {ingredient: index for index, ingredient in enumerate(ingredientList)}

# Binary matrix indicating whether or not an ingredient is in a recipe:
# one row per recipe (same order as `data`), one column per ingredient.
# A bool matrix is used instead of float64: apriori only needs True/False
# membership, and bool takes 1/8 of the memory of a float64 matrix.
binaryIngredientsMat = np.zeros((data.shape[0], len(ingredientList)), dtype=bool)
for row, ingredients in enumerate(data['ingredients']):
    columns = [ingredient2index[ingredient] for ingredient in ingredients]
    binaryIngredientsMat[row, columns] = True
ingredientIndicators = pd.DataFrame(binaryIngredientsMat, columns=ingredientList)

# Everything except the recipe id and the raw ingredient lists, joined
# (on the row index) with the per-ingredient indicator columns.
recipeLabels = data.drop(['id','ingredients'], axis=1)
labelsWithIngredients = recipeLabels.join(ingredientIndicators)

# One-hot encode the cuisine label into 'cuisine_<name>' columns so that a
# cuisine can appear in a rule like any other item. dtype=bool keeps the
# whole frame boolean, matching the ingredient columns.
categorical_columns = ['cuisine']
dataBinaryIngredients = pd.get_dummies(labelsWithIngredients,
                                       columns=categorical_columns,
                                       dtype=bool)

# Itemsets present in at least 1% of all recipes, then the rules derived from
# them whose lift is at least 1.
frequent_itemsets = apriori(dataBinaryIngredients, min_support = 0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print("shape of rules from training", rules.shape)
#display rules
print("rules from training based on confidence, allcuisines")
rules.sort_values('confidence', ascending=False)

shape of rules from training (1906, 9)
rules from training based on confidence, allcuisines


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
220,(corn tortillas),(cuisine_mexican),0.024262,0.161865,0.023784,0.980311,6.056366,0.019857,42.568459
1114,"(corn tortillas, salt)",(cuisine_mexican),0.010711,0.161865,0.010459,0.976526,6.032982,0.008725,35.704571
680,(salsa),(cuisine_mexican),0.024212,0.161865,0.023206,0.958463,5.921391,0.019287,20.178111
314,(flour tortillas),(cuisine_mexican),0.021748,0.161865,0.020566,0.945665,5.842322,0.017046,15.425259
361,(garam masala),(cuisine_indian),0.023256,0.075502,0.021672,0.931892,12.342680,0.019917,13.573985
1136,"(eggs, pepper)",(salt),0.013225,0.453764,0.012320,0.931559,2.052960,0.006319,7.981118
1184,"(garam masala, salt)",(cuisine_indian),0.016770,0.075502,0.015588,0.929535,12.311467,0.014322,13.120009
1026,"(butter, pepper)",(salt),0.017926,0.453764,0.016594,0.925666,2.039974,0.008459,7.348423
1178,"(onions, garam masala)",(cuisine_indian),0.013024,0.075502,0.011968,0.918919,12.170856,0.010984,11.402147
1794,"(olive oil, garlic, pepper)",(salt),0.011691,0.453764,0.010685,0.913978,2.014217,0.005380,6.349996


In [33]:
# Re-rank the rules from the previous cell by confidence (highest first) and
# drop every rule whose consequent is exactly the single item 'salt'.
salt_only = frozenset({'salt'})
df = rules.sort_values('confidence').iloc[::-1]
consequent_is_not_salt = df['consequents'] != salt_only
df[consequent_is_not_salt]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
220,(corn tortillas),(cuisine_mexican),0.024262,0.161865,0.023784,0.980311,6.056366,0.019857,42.568459
1114,"(corn tortillas, salt)",(cuisine_mexican),0.010711,0.161865,0.010459,0.976526,6.032982,0.008725,35.704571
680,(salsa),(cuisine_mexican),0.024212,0.161865,0.023206,0.958463,5.921391,0.019287,20.178111
314,(flour tortillas),(cuisine_mexican),0.021748,0.161865,0.020566,0.945665,5.842322,0.017046,15.425259
361,(garam masala),(cuisine_indian),0.023256,0.075502,0.021672,0.931892,12.342680,0.019917,13.573985
1184,"(garam masala, salt)",(cuisine_indian),0.016770,0.075502,0.015588,0.929535,12.311467,0.014322,13.120009
1178,"(onions, garam masala)",(cuisine_indian),0.013024,0.075502,0.011968,0.918919,12.170856,0.010984,11.402147
66,(black beans),(cuisine_mexican),0.022527,0.161865,0.020390,0.905134,5.591922,0.016744,8.834934
1403,"(olive oil, grated parmesan cheese)",(cuisine_italian),0.020139,0.197063,0.018002,0.893883,4.536015,0.014033,7.566497
933,"(salt, avocado)",(cuisine_mexican),0.014934,0.161865,0.013300,0.890572,5.501961,0.010883,7.659269
