In [2]:
import pandas as pd
import numpy as np

In [41]:
# aggreagate ingredients and put them into list for each recipe
# new df --> recipeName | ingredientList
df=pd.read_csv('recipes_v4.csv')  
df=df.drop(columns=['Unnamed: 0', 'Unnamed: 1'])
df2=df.groupby('Recipe')['Ingredient'].agg(list).reset_index(name='ingredient')
df2.columns = ['recipe', 'ingredient']
df2

Unnamed: 0,recipe,ingredient
0,1-2-3-cherry-poke-cake,"[cake, water, whipped top, gelatin, chocol]"
1,1-2-3-complete-breakfast-smoothie,"[oat, water, honey, blueberri, protein powd, y..."
2,1-2-3-jambalaya,"[worcestershire sauc, olive oil, parsley, toma..."
3,1-dish-chicken-parmesan,"[chees, olive oil, spice, chicken, all purpose..."
4,1-dish-pepperoni-cheese-pizza-bake,"[all purpose flour, cooking spray, mozzarella ..."
...,...,...
66277,zweiback-cheesecake,"[egg, white sugar, margarin, cinnamon, cottage..."
66278,zwetschekuchen-german-plum-tart,"[cinnamon, lemon peel, butter, egg, white suga..."
66279,zwetschgendatschi-german-plum-sheet-cake,"[white sugar, vanilla sugar, all purpose flour..."
66280,zwieback,"[dry yeast, honey, all purpose flour, butter, ..."


In [42]:
# get the recipe health scores from another csv file then merge with our df
# new df --> recipeName | ingredientList | USDAScore | FSAScore
df3=pd.read_csv('recipes-scores.csv')  
df4=df2.merge(df3,how="left",on='recipe')
df4.drop_duplicates(subset="recipe",
                     keep=False, inplace=True)
df4

Unnamed: 0,recipe,ingredient,USDAscore,FSAscore
0,1-2-3-cherry-poke-cake,"[cake, water, whipped top, gelatin, chocol]",2,4
1,1-2-3-complete-breakfast-smoothie,"[oat, water, honey, blueberri, protein powd, y...",3,5
2,1-2-3-jambalaya,"[worcestershire sauc, olive oil, parsley, toma...",4,3
3,1-dish-chicken-parmesan,"[chees, olive oil, spice, chicken, all purpose...",4,3
4,1-dish-pepperoni-cheese-pizza-bake,"[all purpose flour, cooking spray, mozzarella ...",4,4
...,...,...,...,...
66626,zweiback-cheesecake,"[egg, white sugar, margarin, cinnamon, cottage...",2,2
66627,zwetschekuchen-german-plum-tart,"[cinnamon, lemon peel, butter, egg, white suga...",1,2
66628,zwetschgendatschi-german-plum-sheet-cake,"[white sugar, vanilla sugar, all purpose flour...",1,3
66629,zwieback,"[dry yeast, honey, all purpose flour, butter, ...",5,8


In [43]:
# create binary labels according to health score (USDA)
# x > 4 healthy (1) , x <= 4 unhealthy (0)
USDAlabel=list()
for i in range(len(df4)):
    if df4.iloc[i][2]>4:
        USDAlabel.append(1)
    else:
        USDAlabel.append(0)

In [44]:
# create binary labels according to health score (FSA)
# x > 6 healthy (1) , x <= 6 unhealthy (0)
FSAlabel=list()
for i in range(len(df4)):
    if df4.iloc[i][3]>6:
        FSAlabel.append(1)
    else:
        FSAlabel.append(0)

In [46]:
# Add them to our df 
df4['USDAlabel'] = USDAlabel
df4['FSAlabel'] = FSAlabel
df4

Unnamed: 0,recipe,ingredient,USDAscore,FSAscore,USDAlabel,FSAlabel
0,1-2-3-cherry-poke-cake,"[cake, water, whipped top, gelatin, chocol]",2,4,0,0
1,1-2-3-complete-breakfast-smoothie,"[oat, water, honey, blueberri, protein powd, y...",3,5,0,0
2,1-2-3-jambalaya,"[worcestershire sauc, olive oil, parsley, toma...",4,3,0,0
3,1-dish-chicken-parmesan,"[chees, olive oil, spice, chicken, all purpose...",4,3,0,0
4,1-dish-pepperoni-cheese-pizza-bake,"[all purpose flour, cooking spray, mozzarella ...",4,4,0,0
...,...,...,...,...,...,...
66626,zweiback-cheesecake,"[egg, white sugar, margarin, cinnamon, cottage...",2,2,0,0
66627,zwetschekuchen-german-plum-tart,"[cinnamon, lemon peel, butter, egg, white suga...",1,2,0,0
66628,zwetschgendatschi-german-plum-sheet-cake,"[white sugar, vanilla sugar, all purpose flour...",1,3,0,0
66629,zwieback,"[dry yeast, honey, all purpose flour, butter, ...",5,8,1,1


In [10]:
import findspark
findspark.find()
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [15]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler,Word2Vec

In [47]:
#transfrom pandas df to spark df
mainDF=spark.createDataFrame(df4) 

In [48]:
mainDF.printSchema()

root
 |-- recipe: string (nullable = true)
 |-- ingredient: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- USDAscore: long (nullable = true)
 |-- FSAscore: long (nullable = true)
 |-- USDAlabel: long (nullable = true)
 |-- FSAlabel: long (nullable = true)



In [49]:
# train word2vec 
# ingredient lists are now transformed into digitized vectors, ready to feed to an ML pipeline
word2vec = Word2Vec( minCount=1, vectorSize=5, inputCol="ingredient", outputCol="ingredientVectors")
word2vecModel = word2vec.fit(mainDF)
digitizedDf=word2vecModel.transform(mainDF)
digitizedDf.show()

+--------------------+--------------------+---------+--------+---------+--------+--------------------+
|              recipe|          ingredient|USDAscore|FSAscore|USDAlabel|FSAlabel|   ingredientVectors|
+--------------------+--------------------+---------+--------+---------+--------+--------------------+
|1-2-3-cherry-poke...|[cake, water, whi...|        2|       4|        0|       0|[1.26757811270654...|
|1-2-3-complete-br...|[oat, water, hone...|        3|       5|        0|       0|[0.36244145253052...|
|     1-2-3-jambalaya|[worcestershire s...|        4|       3|        0|       0|[-0.1565016232697...|
|1-dish-chicken-pa...|[chees, olive oil...|        4|       3|        0|       0|[0.13101751067572...|
|1-dish-pepperoni-...|[all purpose flou...|        4|       4|        0|       0|[0.13332542823627...|
|    1-dish-taco-bake|[cooking spray, c...|        4|       2|        0|       0|[0.32245931991686...|
|1-pea-salad-most-...|[onion, black pep...|        4|       4|        0| 

In [50]:
# apply Logistic Regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [51]:
#splitting data into train and test
train_data,test_data=digitizedDf.randomSplit([0.7,0.3])

In [52]:
# just using ingredient vector and trying to predict FSALabel
regression=LogisticRegression(featuresCol='ingredientVectors',labelCol='FSAlabel')
#pass train_data to train model
fit_model=regression.fit(train_data)
results = fit_model.transform(test_data)

In [53]:
evaluator = MulticlassClassificationEvaluator(labelCol="FSAlabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))
print("Accuracy = %s" % (accuracy))

Test Error = 0.0809704 
Accuracy = 0.9190295678544351


In [54]:
# apply Random Forest Classifier
# just using ingredient vector and trying to predict FSALabel
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'ingredientVectors', labelCol = 'FSAlabel')
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)

In [55]:
evaluator = MulticlassClassificationEvaluator(labelCol="FSAlabel", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.880252561751854
Test Error = 0.11974743824814604
