#### Hot Encoding and ML Methods Applying
##### After adding USDA and FSA score and binary labeling, hot encoding will be applied to recipes.
##### Then, different ML methods will be applied on the CSR Matrix.

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import tree
from sklearn.neural_network import MLPClassifier

In [2]:
ing=pd.read_csv('ingredients_v4.csv')  

In [3]:
# aggreagate ingredients and put them into list for each recipe
# new df --> recipeName | ingredientList
df=pd.read_csv('recipes_v4.csv')  
df=df.drop(columns=['Unnamed: 0', 'Unnamed: 1'])
df2=df.groupby('Recipe')['Ingredient'].agg(list).reset_index(name='ingredient')
df2.columns = ['recipe', 'ingredient']
df2

Unnamed: 0,recipe,ingredient
0,1-2-3-cherry-poke-cake,"[cake, water, whipped top, gelatin, chocol]"
1,1-2-3-complete-breakfast-smoothie,"[oat, water, honey, blueberri, protein powd, y..."
2,1-2-3-jambalaya,"[worcestershire sauc, olive oil, parsley, toma..."
3,1-dish-chicken-parmesan,"[chees, olive oil, spice, chicken, all purpose..."
4,1-dish-pepperoni-cheese-pizza-bake,"[all purpose flour, cooking spray, mozzarella ..."
...,...,...
66277,zweiback-cheesecake,"[egg, white sugar, margarin, cinnamon, cottage..."
66278,zwetschekuchen-german-plum-tart,"[cinnamon, lemon peel, butter, egg, white suga..."
66279,zwetschgendatschi-german-plum-sheet-cake,"[white sugar, vanilla sugar, all purpose flour..."
66280,zwieback,"[dry yeast, honey, all purpose flour, butter, ..."


In [4]:
# get the recipe health scores from another csv file then merge with our df
# new df --> recipeName | ingredientList | USDAScore | FSAScore
df3=pd.read_csv('recipes-scores.csv')  
df4=df2.merge(df3,how="left",on='recipe')
df4.drop_duplicates(subset="recipe",
                     keep=False, inplace=True)
df4

Unnamed: 0,recipe,ingredient,USDAscore,FSAscore
0,1-2-3-cherry-poke-cake,"[cake, water, whipped top, gelatin, chocol]",2,4
1,1-2-3-complete-breakfast-smoothie,"[oat, water, honey, blueberri, protein powd, y...",3,5
2,1-2-3-jambalaya,"[worcestershire sauc, olive oil, parsley, toma...",4,3
3,1-dish-chicken-parmesan,"[chees, olive oil, spice, chicken, all purpose...",4,3
4,1-dish-pepperoni-cheese-pizza-bake,"[all purpose flour, cooking spray, mozzarella ...",4,4
...,...,...,...,...
66626,zweiback-cheesecake,"[egg, white sugar, margarin, cinnamon, cottage...",2,2
66627,zwetschekuchen-german-plum-tart,"[cinnamon, lemon peel, butter, egg, white suga...",1,2
66628,zwetschgendatschi-german-plum-sheet-cake,"[white sugar, vanilla sugar, all purpose flour...",1,3
66629,zwieback,"[dry yeast, honey, all purpose flour, butter, ...",5,8


In [5]:
# create binary labels according to health score (USDA)
# x > 4 healthy (1) , x <= 4 unhealthy (0)
USDAlabel=list()
for i in range(len(df4)):
    if df4.iloc[i][2]>4:
        USDAlabel.append(1)
    else:
        USDAlabel.append(0)

In [6]:
# create binary labels according to health score (FSA)
# x > 4 healthy (1) , x <= 4 unhealthy (0)
FSAlabel=list()
for i in range(len(df4)):
    if df4.iloc[i][3]>4:
        FSAlabel.append(1)
    else:
        FSAlabel.append(0)

In [7]:
# Add them to our df 
df4['USDAlabel'] = USDAlabel
df4['FSAlabel'] = FSAlabel
df4

Unnamed: 0,recipe,ingredient,USDAscore,FSAscore,USDAlabel,FSAlabel
0,1-2-3-cherry-poke-cake,"[cake, water, whipped top, gelatin, chocol]",2,4,0,0
1,1-2-3-complete-breakfast-smoothie,"[oat, water, honey, blueberri, protein powd, y...",3,5,0,1
2,1-2-3-jambalaya,"[worcestershire sauc, olive oil, parsley, toma...",4,3,0,0
3,1-dish-chicken-parmesan,"[chees, olive oil, spice, chicken, all purpose...",4,3,0,0
4,1-dish-pepperoni-cheese-pizza-bake,"[all purpose flour, cooking spray, mozzarella ...",4,4,0,0
...,...,...,...,...,...,...
66626,zweiback-cheesecake,"[egg, white sugar, margarin, cinnamon, cottage...",2,2,0,0
66627,zwetschekuchen-german-plum-tart,"[cinnamon, lemon peel, butter, egg, white suga...",1,2,0,0
66628,zwetschgendatschi-german-plum-sheet-cake,"[white sugar, vanilla sugar, all purpose flour...",1,3,0,0
66629,zwieback,"[dry yeast, honey, all purpose flour, butter, ...",5,8,1,1


In [8]:
#creating a matrix recipes x ingredients (66107 recipes, 962 ingredietnts)
#It means 1 x 963 vector for each recipe
mat=np.zeros((66107,963))

In [9]:
ingp=ing['Ingredient'].to_numpy()

In [10]:
ingp.shape

(963,)

In [11]:
df5=df4.drop('USDAscore',axis=1)
df5=df4.drop('FSAscore',axis=1)
df5=df4.drop('USDAlabel',axis=1)
df5=df4.drop('FSAlabel',axis=1)
data=df5.to_numpy()

In [12]:
# searching ingredients in recipes. 
#In our matrix, if a recipe contains a word, corresponding index of the ingredient will be 1 for that recipe.
for i in range(962):
    for w in range(66106):
        if ingp[i] in data[w][1]:
            mat[w][i]=1


In [13]:
#FSA column will be first class label vector.
clss1=df4['FSAlabel'].to_numpy()
clss2=df4['USDAlabel'].to_numpy()

In [14]:
#checking out how many "1" we have in our class
len(clss1)
count=0
for i in range(66107):
    if clss1[i]==1:
        count +=1

print(len(clss1),count)

66107 25610


In [17]:
#I'll try to convert our data to CSR dense matrix and feed it to ML methods
from scipy.sparse import csr_matrix
x=csr_matrix(mat)
print(x)

  (0, 6)	1.0
  (0, 87)	1.0
  (0, 115)	1.0
  (0, 153)	1.0
  (0, 203)	1.0
  (1, 6)	1.0
  (1, 38)	1.0
  (1, 59)	1.0
  (1, 80)	1.0
  (1, 106)	1.0
  (1, 478)	1.0
  (2, 8)	1.0
  (2, 12)	1.0
  (2, 15)	1.0
  (2, 21)	1.0
  (2, 55)	1.0
  (2, 105)	1.0
  (2, 147)	1.0
  (3, 0)	1.0
  (3, 7)	1.0
  (3, 8)	1.0
  (3, 31)	1.0
  (3, 49)	1.0
  (3, 51)	1.0
  (3, 61)	1.0
  :	:
  (66102, 86)	1.0
  (66102, 192)	1.0
  (66103, 2)	1.0
  (66103, 4)	1.0
  (66103, 5)	1.0
  (66103, 6)	1.0
  (66103, 7)	1.0
  (66103, 16)	1.0
  (66103, 472)	1.0
  (66104, 0)	1.0
  (66104, 2)	1.0
  (66104, 5)	1.0
  (66104, 7)	1.0
  (66104, 16)	1.0
  (66104, 18)	1.0
  (66104, 43)	1.0
  (66104, 72)	1.0
  (66104, 515)	1.0
  (66105, 0)	1.0
  (66105, 4)	1.0
  (66105, 6)	1.0
  (66105, 7)	1.0
  (66105, 10)	1.0
  (66105, 38)	1.0
  (66105, 103)	1.0


In [27]:
#Converted CSR matrix will be fed to ML methods for clss1 which is FSA Binary Labeling
#same methods are also applied for clss2 which is USDA Binary Labeling
xtraincsr,xtestcsr,ytraincsr,ytestcsr=train_test_split(x,clss1,test_size=0.3, random_state=15)

In [28]:
def modelfitcsr(mod):
    model=mod
    model.fit(xtraincsr,ytraincsr)
    ypredcsr=model.predict(xtestcsr)
    print(accuracy_score(ytestcsr,ypredcsr))


In [29]:
st=time.time()
modelfitcsr(LogisticRegression(max_iter=1000))
et=time.time()
print(et-st,"seconds")

0.7353905107648868
0.700875997543335 seconds


In [30]:
st=time.time()
modelfitcsr(RandomForestClassifier())
et=time.time()
print(et-st,"seconds")

0.7296929360157314
50.54760813713074 seconds


In [31]:
st=time.time()
modelfitcsr(GradientBoostingClassifier())
et=time.time()
print(et-st,"seconds")

0.6998436948520144
3.7572290897369385 seconds


In [32]:
st=time.time()
modelfit(tree.DecisionTreeClassifier())
et=time.time()
print(et-st,"seconds")

0.6507336257752231
25.518248081207275 seconds


In [33]:
st=time.time()
modelfitcsr(MLPClassifier(alpha=1e-5,hidden_layer_sizes=(5, 2),max_iter=1000))
et=time.time()
print(et-st,"seconds")

0.7129027378611406
60.5793240070343 seconds
