In [3]:
import pandas as pd
import findspark
findspark.find()
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [9]:
# aggreagate ingredients and put them into list for each recipe
# new df --> recipeName | ingredientList
df=pd.read_csv('recipes_v4.csv')  
df=df.drop(columns=['Unnamed: 0', 'Unnamed: 1'])
df2=df.groupby('Recipe')['Ingredient'].agg(list).reset_index(name='ingredient')
df2.columns = ['recipe', 'ingredient']
df2

Unnamed: 0,recipe,ingredient
0,1-2-3-cherry-poke-cake,"[cake, water, whipped top, gelatin, chocol]"
1,1-2-3-complete-breakfast-smoothie,"[oat, water, honey, blueberri, protein powd, y..."
2,1-2-3-jambalaya,"[worcestershire sauc, olive oil, parsley, toma..."
3,1-dish-chicken-parmesan,"[chees, olive oil, spice, chicken, all purpose..."
4,1-dish-pepperoni-cheese-pizza-bake,"[all purpose flour, cooking spray, mozzarella ..."
...,...,...
66277,zweiback-cheesecake,"[egg, white sugar, margarin, cinnamon, cottage..."
66278,zwetschekuchen-german-plum-tart,"[cinnamon, lemon peel, butter, egg, white suga..."
66279,zwetschgendatschi-german-plum-sheet-cake,"[white sugar, vanilla sugar, all purpose flour..."
66280,zwieback,"[dry yeast, honey, all purpose flour, butter, ..."


In [10]:
# get the recipe health scores from another csv file then merge with our df
# new df --> recipeName | ingredientList | USDAScore | FSAScore
df3=pd.read_csv('recipes-scores.csv')  
df4=df2.merge(df3,how="left",on='recipe')
df4.drop_duplicates(subset="recipe",
                     keep=False, inplace=True)
df4

Unnamed: 0,recipe,ingredient,USDAscore,FSAscore
0,1-2-3-cherry-poke-cake,"[cake, water, whipped top, gelatin, chocol]",2,4
1,1-2-3-complete-breakfast-smoothie,"[oat, water, honey, blueberri, protein powd, y...",3,5
2,1-2-3-jambalaya,"[worcestershire sauc, olive oil, parsley, toma...",4,3
3,1-dish-chicken-parmesan,"[chees, olive oil, spice, chicken, all purpose...",4,3
4,1-dish-pepperoni-cheese-pizza-bake,"[all purpose flour, cooking spray, mozzarella ...",4,4
...,...,...,...,...
66626,zweiback-cheesecake,"[egg, white sugar, margarin, cinnamon, cottage...",2,2
66627,zwetschekuchen-german-plum-tart,"[cinnamon, lemon peel, butter, egg, white suga...",1,2
66628,zwetschgendatschi-german-plum-sheet-cake,"[white sugar, vanilla sugar, all purpose flour...",1,3
66629,zwieback,"[dry yeast, honey, all purpose flour, butter, ...",5,8


In [11]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler,Word2Vec

In [12]:
#transfrom pandas df to spark df
mainDF=spark.createDataFrame(df4) 
mainDF.printSchema()

root
 |-- recipe: string (nullable = true)
 |-- ingredient: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- USDAscore: long (nullable = true)
 |-- FSAscore: long (nullable = true)



In [13]:
# train word2vec 
# ingredient lists are now transformed into digitized vectors, ready to feed to an ML pipeline
word2vec = Word2Vec( minCount=1, vectorSize=5, inputCol="ingredient", outputCol="ingredientVectors")
word2vecModel = word2vec.fit(mainDF)
digitizedDf=word2vecModel.transform(mainDF)
digitizedDf.show()

+--------------------+--------------------+---------+--------+--------------------+
|              recipe|          ingredient|USDAscore|FSAscore|   ingredientVectors|
+--------------------+--------------------+---------+--------+--------------------+
|1-2-3-cherry-poke...|[cake, water, whi...|        2|       4|[-0.2589531660079...|
|1-2-3-complete-br...|[oat, water, hone...|        3|       5|[0.12949413495759...|
|     1-2-3-jambalaya|[worcestershire s...|        4|       3|[0.55607862983431...|
|1-dish-chicken-pa...|[chees, olive oil...|        4|       3|[0.20679925133784...|
|1-dish-pepperoni-...|[all purpose flou...|        4|       4|[0.05041322857141...|
|    1-dish-taco-bake|[cooking spray, c...|        4|       2|[0.18471308828641...|
|1-pea-salad-most-...|[onion, black pep...|        4|       4|[0.30225087031722...|
|1-pot-3-bean-chic...|[bay leaf, chicke...|        6|       7|[0.63154558198792...|
|1-pumpkin-spice-c...|[all purpose flou...|        5|       7|[0.15285557016

In [25]:
# apply Logistic Regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#splitting data into train and test
train_data,test_data=digitizedDf.randomSplit([0.9,0.1])

# just using ingredient vector and trying to predict FSALabel
regression=LogisticRegression(featuresCol='ingredientVectors',labelCol='FSAscore')
#pass train_data to train model
fit_model=regression.fit(train_data)
results = fit_model.transform(test_data)

In [26]:
evaluator = MulticlassClassificationEvaluator(labelCol="FSAscore", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(results)
print("Test Error = %g " % (1.0 - accuracy))
print("Accuracy = %s" % (accuracy))

Test Error = 0.75 
Accuracy = 0.25


In [27]:
# apply Random Forest Classifier
# just using ingredient vector and trying to predict FSALabel
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'ingredientVectors', labelCol = 'FSAscore')
rfModel = rf.fit(train_data)
predictions = rfModel.transform(test_data)

In [28]:
evaluator = MulticlassClassificationEvaluator(labelCol="FSAscore", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.19893612844190917
Test Error = 0.8010638715580909
