!pip install sparknlp

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import col, udf
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StringType,
    ArrayType,
    DoubleType,
    FloatType,
    StructType,
    StructField,
)

# We could use the embedders provided by spark nlp, but we will use the sentence_transformers library (I just have more experience with it)
# spark nlp embedders: https://www.johnsnowlabs.com/understanding-the-power-of-transformers-a-guide-to-sentence-embeddings-in-spark-nlp/
from sentence_transformers import SentenceTransformer


from tqdm import tqdm

tqdm.pandas()

import numpy as np
import re
import json

SEED = 42  # Set your desired seed for reproducibility

# increase memory and reduce cores to avoid memory errors
# Initialize Spark session
spark = (
    SparkSession.builder.appName("Recipe Recommender System with Embeddings")
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .getOrCreate()
)

  from tqdm.autonotebook import tqdm, trange
24/10/14 18:32:17 WARN Utils: Your hostname, macOS-Maxime-Bonnesoeur.local resolves to a loopback address: 127.0.0.1; using 192.168.2.9 instead (on interface en0)
24/10/14 18:32:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/14 18:32:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load pre-trained sentence embedder
embedder = SentenceTransformer("all-MiniLM-L6-v2")

embedder_bc = spark.sparkContext.broadcast(embedder)

24/10/14 18:32:32 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:
from IPython.display import HTML


def display_spark_dataframe(df, num_rows=100):
    # Convert the Spark DataFrame to Pandas for display purposes
    pandas_df = df.limit(num_rows).toPandas()

    # Generate HTML table
    html = pandas_df.to_html(classes="table table-striped table-bordered", index=False)

    # Display the HTML in the notebook
    display(HTML(html))

In [4]:
# Load dataset
file_path = "../data/processed/users_interactions.parquet"
raw_df = spark.read.parquet(file_path)


# Inspect dataset
raw_df.printSchema()
display_spark_dataframe(raw_df, 3)

root
 |-- user_id: long (nullable = true)
 |-- recipe_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- ingredients: string (nullable = true)
 |-- link: string (nullable = true)
 |-- import_date: double (nullable = true)



user_id,recipe_id,title,ingredients,link,import_date
1,-5836364512049762718,Log Cabin Toast,"[""sugar"", ""cinnamon"", ""white"", ""margarine""]",http://www.cookbooks.com/Recipe-Details.aspx?id=595265,1720726000.0
1,5825856628004213219,Vegetable Burger Soup,"[""ground beef"", ""tomatoes"", ""tomato sauce"", ""frozen mixed vegetables"", ""onion soup"", ""sugar""]",http://www.cookbooks.com/Recipe-Details.aspx?id=302429,1724700000.0
1,6164519699133745296,Magic Cookie Bars,"[""butter"", ""graham cracker crumbs"", ""milk"", ""semi-sweet chocolate chips"", ""flaked coconut"", ""nuts""]",http://www.cookbooks.com/Recipe-Details.aspx?id=343171,1726601000.0


Here the issue is that the ingredients column is a string and not a list of strings. We need to convert it to a list of strings.

In [5]:
# clean the ingredients column as a list of strings
def clean_ingredients(ingredients):
    # Convert JSON string to list
    ingredients = json.loads(ingredients)
    # Remove digits and extra spaces
    return [re.sub(r"\d+", "", ingredient).strip() for ingredient in ingredients]


clean_ingredients_udf = udf(clean_ingredients, ArrayType(StringType()))
df = raw_df.withColumn("ingredients", clean_ingredients_udf("ingredients"))

In [6]:
# Inspect dataset
df.printSchema()
display_spark_dataframe(df, 5)

root
 |-- user_id: long (nullable = true)
 |-- recipe_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- link: string (nullable = true)
 |-- import_date: double (nullable = true)



                                                                                

user_id,recipe_id,title,ingredients,link,import_date
1,-5836364512049762718,Log Cabin Toast,"[sugar, cinnamon, white, margarine]",http://www.cookbooks.com/Recipe-Details.aspx?id=595265,1720726000.0
1,5825856628004213219,Vegetable Burger Soup,"[ground beef, tomatoes, tomato sauce, frozen mixed vegetables, onion soup, sugar]",http://www.cookbooks.com/Recipe-Details.aspx?id=302429,1724700000.0
1,6164519699133745296,Magic Cookie Bars,"[butter, graham cracker crumbs, milk, semi-sweet chocolate chips, flaked coconut, nuts]",http://www.cookbooks.com/Recipe-Details.aspx?id=343171,1726601000.0
1,-8815957590080865143,Chicken Divan,"[margarine, onion, celery, flour, curry powder, mushrooms, cream of celery soup, broccoli, chicken, cheese]",http://www.cookbooks.com/Recipe-Details.aspx?id=425137,1725824000.0
1,3296045182844799490,Baked Cabbage,"[ground chuck, onion, rice, salt, cabbage, pepper, tomato soup, water, Mozzarella cheese]",http://www.cookbooks.com/Recipe-Details.aspx?id=761049,1722800000.0


## Embedding the ingredients

Here, we will use the SentenceTransformer library to convert the ingredients into sentence embeddings.
The goal would be to use the natural language processing (NLP) model to convert the ingredients into fixed-length dense vectors.

Then, we will use the cosine similarity to find the similarity between the user's input and the recipe ingredients.

In [7]:
df.show(5)

+-------+--------------------+--------------------+--------------------+--------------------+-------------------+
|user_id|           recipe_id|               title|         ingredients|                link|        import_date|
+-------+--------------------+--------------------+--------------------+--------------------+-------------------+
|      1|-5836364512049762718|     Log Cabin Toast|[sugar, cinnamon,...|http://www.cookbo...|1.720726089854533E9|
|      1| 5825856628004213219|Vegetable Burger ...|[ground beef, tom...|http://www.cookbo...| 1.72470048985463E9|
|      1| 6164519699133745296|   Magic Cookie Bars|[butter, graham c...|http://www.cookbo...|1.726601289854659E9|
|      1|-8815957590080865143|       Chicken Divan|[margarine, onion...|http://www.cookbo...|1.725823689854685E9|
|      1| 3296045182844799490|       Baked Cabbage|[ground chuck, on...|http://www.cookbo...|1.722799689854709E9|
+-------+--------------------+--------------------+--------------------+----------------

In [8]:
# Create a combined text column for titles and ingredients
df = df.withColumn(
    "ingredients", F.sort_array(col("ingredients"))
)  # Sort the ingredients
df = df.withColumn(
    "combined_text",
    F.concat_ws(" ", col("title"), F.concat_ws(" ", col("ingredients"))),
)


# Function to embed text using SentenceTransformer from the broadcast variable
def embed_text(text_list):
    embedder = embedder_bc.value
    embeddings = embedder.encode([text_list], batch_size=8)[
        0
    ]  # Use batch encoding with smaller batch size
    return embeddings.tolist()


embed_text_udf = udf(embed_text, ArrayType(DoubleType()))


# DISCLAIMER: This udf does not work with the current version of Spark. It is a known issue and it is being worked on.

# Embed the combined text column manually converting the DataFrame to Pandas

# Convert the DataFrame to Pandas
df_pandas = df.toPandas()

# Embed the text using the SentenceTransformer
df_pandas["text_embeddings"] = df_pandas["combined_text"].progress_apply(
    lambda x: [float(val) for val in embedder.encode(x)]
)

schema_fields = df.schema.fields

# Add the 'embeddings' field to the schema
schema_fields.append(StructField("text_embeddings", ArrayType(DoubleType()), True))

# Convert the Pandas DataFrame back to a Spark DataFrame
df = spark.createDataFrame(df_pandas, schema=StructType(schema_fields))

100%|██████████| 224/224 [00:05<00:00, 37.62it/s]


In [9]:
display_spark_dataframe(df, 1)

user_id,recipe_id,title,ingredients,link,import_date,combined_text,text_embeddings
1,-5836364512049762718,Log Cabin Toast,"[cinnamon, margarine, sugar, white]",http://www.cookbooks.com/Recipe-Details.aspx?id=595265,1720726000.0,Log Cabin Toast cinnamon margarine sugar white,"[0.019722038879990578, 0.010755373165011406, -0.030095934867858887, 0.022724272683262825, 0.08866915851831436, -0.0012085040798410773, 0.043510258197784424, 0.012287961319088936, 0.021360674872994423, 0.019342171028256416, -0.011456969194114208, 0.0035146630834788084, -0.0523746982216835, 0.010659187100827694, 0.0073283943347632885, -0.007702303119003773, 0.029257938265800476, -0.031293272972106934, -0.020033977925777435, 0.008631894364953041, 0.14283229410648346, 0.029597090557217598, 0.02689484879374504, 0.0726298913359642, 0.01705986075103283, 0.023738622665405273, 0.07095839828252792, -0.04517602548003197, -0.09096232801675797, 0.05846288055181503, 0.06310981512069702, -0.03432522714138031, 0.044696956872940063, 0.005657962057739496, -0.008814116008579731, -0.10759952664375305, 0.038561850786209106, -0.07481438666582108, 0.12967358529567719, -0.0388704389333725, 0.00857237633317709, -0.0019532896112650633, 0.043242767453193665, -0.042188771069049835, -0.036886826157569885, -0.03151802718639374, -0.011455806903541088, 0.0036324509419500828, 0.008756095543503761, -0.02658032812178135, 0.013522527180612087, -0.02837211824953556, -0.022579673677682877, -0.039147213101387024, -0.01981183886528015, 0.05085907503962517, -0.08923672139644623, -0.07283534854650497, 0.05667347460985184, 0.004634208977222443, -0.03418145701289177, 0.06024903804063797, -0.027315251529216766, -0.0013431244296953082, 0.01513887569308281, -0.0480848029255867, -0.06863751262426376, -0.006275206338614225, 0.08818942308425903, -0.013919901102781296, -0.058685190975666046, -0.0327465794980526, 0.08708815276622772, -0.01521309930831194, 0.06923483312129974, -0.016049297526478767, 0.04341140761971474, -0.03483021631836891, -0.012210169807076454, -0.08644692599773407, -0.03207109868526459, 0.0011837307829409838, -0.021874887868762016, 0.09983982145786285, -0.0059978533536195755, 0.018630025908350945, 0.030919549986720085, 0.08230749517679214, 0.008790355175733566, -0.042853161692619324, 0.02225908264517784, -0.043751101940870285, -0.053173478692770004, 0.08226197957992554, -0.126056969165802, -0.04551691934466362, 0.00476819509640336, -0.01941521465778351, -0.009869878180325031, 0.06680719554424286, ...]"


Now that we have our embeddings, we can proceed with the recommendation system. 

Let's use the same approach as before, but this time we will use the embeddings instead of the TF-IDF vectors.

## Recommendation System

In [10]:
SIMILARITY_THRESHOLD = 0.5
# Recommendations for Users
RECOMMENDATION_LIMIT = 5

In [11]:
# Define a function to compute cosine similarity between two vectors
def cosine_similarity(v1, v2):
    v1 = np.array(v1)
    v2 = np.array(v2)
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    if norm_v1 == 0 or norm_v2 == 0:
        return 0.0
    return float(dot_product / (norm_v1 * norm_v2))


cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Self-join the DataFrame to calculate similarity between all recipes
cross_joined_df = df.alias("df1").crossJoin(
    df.select(["recipe_id", "text_embeddings", "title", "user_id"]).alias("df2")
)


# Calculate cosine similarity for each pair of recipes
cross_joined_df = cross_joined_df.withColumn(
    "similarity",
    cosine_similarity_udf(col("df1.text_embeddings"), col("df2.text_embeddings")),
)
cross_joined_df.printSchema()


# Select the necessary columns and alias them to avoid ambiguity
recommendations_df = cross_joined_df.filter(
    (col("df1.recipe_id") != col("df2.recipe_id"))
    & (  # Never recommend the same recipe
        col("df1.user_id") != col("df2.user_id")
    )  # Never recommend the a recipe from the same user
)

# Explicitly select and alias the columns
recommendations_df = (
    recommendations_df.select(
        col("df1.recipe_id").alias("recipe_id"),
        col("df1.user_id").alias("user_id"),
        col("df2.user_id").alias("recommended_user_id"),
        col("df1.title").alias("recipe_title"),
        col("df2.recipe_id").alias("recommended_recipe_id"),
        col("df2.title").alias("recommended_recipe_title"),
        col("similarity"),
        # Add any other columns you may need
    )
    .filter(col("similarity") > SIMILARITY_THRESHOLD)
    .orderBy(col("recipe_id"), col("similarity").desc())
)

display_spark_dataframe(recommendations_df, 5)

root
 |-- user_id: long (nullable = true)
 |-- recipe_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- link: string (nullable = true)
 |-- import_date: double (nullable = true)
 |-- combined_text: string (nullable = false)
 |-- text_embeddings: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- recipe_id: long (nullable = true)
 |-- text_embeddings: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- title: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- similarity: double (nullable = true)



                                                                                

recipe_id,user_id,recommended_user_id,recipe_title,recommended_recipe_id,recommended_recipe_title,similarity
-9058629457183933290,24,1,Vegetable-Burger Soup,5825856628004213219,Vegetable Burger Soup,0.965317
-9058629457183933290,24,17,Vegetable-Burger Soup,8480238711966712403,Taco Soup,0.664391
-9058629457183933290,24,42,Vegetable-Burger Soup,7065949700082226446,Vegetable Soup,0.639391
-9058629457183933290,24,1,Vegetable-Burger Soup,3296045182844799490,Baked Cabbage,0.631768
-9058629457183933290,24,5,Vegetable-Burger Soup,-3996395749314391944,Chunky Cheddar Chili Beans,0.621017


This is looking quite good. We have a DataFrame with recommendations for each user. We can now filter the recommendations to get the top N recommendations for each user.

In [12]:
# Get top recommendations by collecting the recommended_recipe_id into a list for each recipe_id
top_recommendations = recommendations_df.groupby(["recipe_id", "recipe_title"]).agg(
    F.collect_list("recommended_recipe_id").alias("recommended_recipes"),
    F.collect_list("recommended_recipe_title").alias("recommended_recipe_titles"),
    F.collect_list("similarity").alias("similarity_scores"),
)
# Show Top Recommendations
display_spark_dataframe(top_recommendations, 1)

                                                                                

recipe_id,recipe_title,recommended_recipes,recommended_recipe_titles,similarity_scores
-9058629457183933290,Vegetable-Burger Soup,"[5825856628004213219, 8480238711966712403, 7065949700082226446, 3296045182844799490, -3996395749314391944, 7334669830415655927, -6669779999682413337, -6669779999682413337, -3420589259395077505, 1047453043641864603, 2310158205204139617, -5614553296578195021, 2694324010299801955, -7959335504771479864, -6113747026078318655, -7311370875205457768, 7852753338977449607, 580990177277857836, -5023575177784438039, -5823049819411440635, -5669451870154398266, 8706017513915014750, 6278886121403210282, 6278886121403210282, 1250803172746907701, -5118623329530415137, 7182608949288552664, 6562653343465218429, -727207831510502445, 566745543695458467, -8663176839699988983, -4027038759916901243, 5338836256362533528, -236700309736130815]","[Vegetable Burger Soup, Taco Soup, Vegetable Soup, Baked Cabbage, Chunky Cheddar Chili Beans, Ranch Beef And Beans, Scotch Broth, Scotch Broth, Glop, Lazy Day Chicken, Chili Con Carne, Rich Beef And Gravy(Crock-Pot Recipe) , Egg Drop Soup, Salsa, Cabbage Rolls, Seven Layer Salad, Hot Tomatoes And Cucumbers, Chippewa Soup, Cornbread Salad, Oven Swiss Steak, Ground Beef Casserole, Indian Meat Loaf, Ranch Style Baked Beans Casserole, Ranch Style Baked Beans Casserole, Crunchy Onion Burger, ""Glop"", Easy Hot Dish, Minestrone Soup, Lemon Chicken And Zucchini, Layered Salad, Picnic Potato Salad, Sausages And Seashells, Five Way Chili, Easy Spaghetti]","[0.965316955480652, 0.6643905905513515, 0.639390824843076, 0.6317678972870261, 0.6210171020575, 0.6097954595032831, 0.5925520982547088, 0.5925520982547088, 0.5908964329011596, 0.5843453964776765, 0.5705714073825408, 0.564587024938875, 0.550117776038424, 0.5490008239228716, 0.548899213424713, 0.5459131070910704, 0.5410702427718477, 0.5400240463756666, 0.5394385237845769, 0.539001205743465, 0.5383098255470018, 0.5299557551909952, 0.5288790746012078, 0.5288790746012078, 0.5276637612722723, 0.5232176671287985, 0.51986149801284, 0.5162109534900085, 0.5123011523997628, 0.5059346572544402, 0.5041601749197784, 0.5028143570817479, 0.5027267177884959, 0.5007013756545885]"


## Content-based Filtering

The approach here is to recommend a recipe to a user based on the similarity of the ingredients present in other recipes.

Let's have a look

In [13]:
subset = (
    recommendations_df.alias("df1")
    .join(
        df.alias("df2"),
        recommendations_df.recommended_recipe_id == df.recipe_id,
    )
    .select(
        col("df1.user_id"),
        col("df2.recipe_id"),
        col("df2.title"),
        col("df2.link"),
        col("df2.ingredients"),
        col("df1.similarity"),
    )
)


recipe_content_based_recommendations = (
    subset.orderBy(["user_id", "similarity"])
    .groupby(["user_id"])
    .agg(
        F.slice(
            F.collect_list(
                F.struct(
                    F.col("recipe_id"),
                    F.col("title"),
                    F.col("link"),
                    F.col("ingredients"),
                )
            ),
            1,
            RECOMMENDATION_LIMIT,
        ).alias("recommended_recipes"),
        F.slice(F.collect_list("similarity"), 1, RECOMMENDATION_LIMIT).alias(
            "similarity_scores"
        ),
    )
)
display_spark_dataframe(recipe_content_based_recommendations, 5)

                                                                                

user_id,recommended_recipes,similarity_scores
26,"[(-3353573329779529551, Cranberry Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=427420, [cranberry sauce, cream cheese, margarine, pineapple, sugar, walnuts, whipping cream]), (-7067716566959450710, Divinity Fudge, http://www.cookbooks.com/Recipe-Details.aspx?id=875051, [cold water, egg whites, nuts, sugar, vanilla, white corn syrup]), (1475742938514587362, Old-Time Bread Pudding, http://www.cookbooks.com/Recipe-Details.aspx?id=400071, [Pet milk, boiling water, buttered toast, cinnamon, eggs, raisins, salt, sugar, sugar, vanilla]), (-7588581206644941043, Strawberry Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=575611, [cream cheese, frozen strawberries, sugar]), (8847943492650148032, Low Cholesterol Ice Cream, http://www.cookbooks.com/Recipe-Details.aspx?id=289172, [Egg Scramblers, milk, milk, sugar, vanilla])]","[0.5024621613296735, 0.5035705903297357, 0.5049867260916007, 0.5101536873659674, 0.5102255391113762]"
29,"[(3088604997597712054, Phylis' Pineapple-Banana Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=682439, [bananas, pineapple]), (-3251620781098817209, Fruit Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=267592, [baking powder, candied cherries, chocolate chips, eggs, flour, nuts, salt, sugar]), (-4573706575222033830, Party Pink Punch, http://www.cookbooks.com/Recipe-Details.aspx?id=976793, [cranberry juice, frozen lemonade, liters ginger ale, pineapple juice, sugar]), (4248772492409983307, Rolls, http://www.cookbooks.com/Recipe-Details.aspx?id=824206, [butter, eggs, flour, milk, salt, sugar, yeast]), (-4525960110849332048, Fruity Ice, http://www.cookbooks.com/Recipe-Details.aspx?id=710858, [applesauce, bananas, cinnamon, crushed ice, oranges])]","[0.5048953457457137, 0.5053284887621573, 0.5100188652663266, 0.5138964350751841, 0.5151694513576688]"
65,"[(-2549225793712397108, Mixed Vegetable Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=472234, [Veg-All, grated cheese, margarine, mayonnaise, onion, roll, water chestnuts]), (-2682508455536574038, Ice Cream, http://www.cookbooks.com/Recipe-Details.aspx?id=13922, [cream, eggs, milk, milk, salt, sugar, vanilla]), (-606132927153244690, Millionaire Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=794547, [condensed milk, graham cracker crusts, lemons, pecans, pineapple]), (-973267922133716957, Sour Cream Lasagna, http://www.cookbooks.com/Recipe-Details.aspx?id=400312, [Cheddar cheese, brown sugar, cottage cheese, egg dumpling noodles, garlic salt, green onions, ground beef, liquid smoke, pepper, salt, sour cream, tomato sauce]), (6833491545595824920, Quick Fruit Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=357496, [bananas, cherry pie filling, marshmallows, pecans, pineapple])]","[0.5023056472128538, 0.50375620157832, 0.5052550939616084, 0.5064559973128402, 0.5065376211296618]"
19,"[(-6269967695551353259, Fast Real Good Fudge, http://www.cookbooks.com/Recipe-Details.aspx?id=505741, [Velveeta, cocoa, confectioners sugar, margarine, nuts, vanilla]), (-7358079084168233672, Rum Balls, http://www.cookbooks.com/Recipe-Details.aspx?id=814952, [light corn syrup, nuts, powdered sugar, rum, semi-sweet chocolate, vanilla wafers]), (-7358079084168233672, Rum Balls, http://www.cookbooks.com/Recipe-Details.aspx?id=814952, [light corn syrup, nuts, powdered sugar, rum, semi-sweet chocolate, vanilla wafers]), (-7358079084168233672, Rum Balls, http://www.cookbooks.com/Recipe-Details.aspx?id=814952, [light corn syrup, nuts, powdered sugar, rum, semi-sweet chocolate, vanilla wafers]), (-7358079084168233672, Rum Balls, http://www.cookbooks.com/Recipe-Details.aspx?id=814952, [light corn syrup, nuts, powdered sugar, rum, semi-sweet chocolate, vanilla wafers])]","[0.5037205756127809, 0.5043095164383786, 0.5043095164383786, 0.5043095164383786, 0.5043095164383786]"
54,"[(-1084923410672998863, Strawberry Bread, http://www.cookbooks.com/Recipe-Details.aspx?id=57263, [baking soda, cinnamon, eggs, flour, frozen strawberries, sugar, vegetable oil]), (-1084923410672998863, Strawberry Bread, http://www.cookbooks.com/Recipe-Details.aspx?id=57263, [baking soda, cinnamon, eggs, flour, frozen strawberries, sugar, vegetable oil]), (-1084923410672998863, Strawberry Bread, http://www.cookbooks.com/Recipe-Details.aspx?id=57263, [baking soda, cinnamon, eggs, flour, frozen strawberries, sugar, vegetable oil]), (-1084923410672998863, Strawberry Bread, http://www.cookbooks.com/Recipe-Details.aspx?id=57263, [baking soda, cinnamon, eggs, flour, frozen strawberries, sugar, vegetable oil]), (4140273761656505959, Oatmeal Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=728549, [baking powder, boiling water, brown sugar, butter, cinnamon, eggs, flour, oatmeal, salt, soda, vanilla, white sugar])]","[0.5038128348136361, 0.5038128348136361, 0.5038128348136361, 0.5038128348136361, 0.5122573084708405]"


With this first approach, each user gets some recommendations based on the recipe similar to theirs.

However, this approach breeds little diversity in the recommendations. Let's thus observe another approach to recommend recipes to users.

## Collaborative Filtering

In this approach, we will get for each users the users that have similar taste to them, meaning users that have used similar ingredients in their recipe.

Then, we will sample randomly a recipe from the users that have similar taste to the user and that the user has not already tried.

In [14]:
# Recommendations for Users
RECOMMENDATION_LIMIT = 5
SIMILARITY_THRESHOLD_USERS = 0.7  # We want really similar users


# Get top recommendations by collecting the recommended_recipe_id into a list for each recipe_id
top_user_similarity_recommendations = (
    recommendations_df.orderBy(["user_id"])
    .groupby(["user_id", "recommended_user_id"])
    .agg(
        F.avg("similarity").alias("average_similarity"),
    )
    .orderBy(["user_id", "average_similarity"], ascending=[True, False])
    .withColumnRenamed("user_id", "user_id")
)

top_user_similarity_recommendations = (
    top_user_similarity_recommendations.filter(
        col("average_similarity") > SIMILARITY_THRESHOLD_USERS
    )
    .groupBy("user_id")
    .agg(
        F.slice(F.collect_list("recommended_user_id"), 1, RECOMMENDATION_LIMIT).alias(
            "recommended_users"
        ),
        F.slice(F.collect_list("average_similarity"), 1, RECOMMENDATION_LIMIT).alias(
            "average_similarity_scores"
        ),
    )
)

display_spark_dataframe(top_user_similarity_recommendations, 5)

                                                                                

user_id,recommended_users,average_similarity_scores
2,[76],[0.8260017557284715]
7,[73],[0.7210138686744786]
9,"[88, 82, 18]","[0.714900757888079, 0.7095564383668574, 0.7027203357071433]"
10,"[62, 29]","[0.7503616640000887, 0.7446023453078634]"
15,[88],[0.745308784440513]


In [15]:
subset.show()

                                                                                

+-------+-------------------+--------------------+--------------------+--------------------+------------------+
|user_id|          recipe_id|               title|                link|         ingredients|        similarity|
+-------+-------------------+--------------------+--------------------+--------------------+------------------+
|     98|5996411961097143608|Meg'S Power Peanu...|http://www.cookbo...|[butter, chocolat...|0.5548206051376792|
|     86|5996411961097143608|Meg'S Power Peanu...|http://www.cookbo...|[butter, chocolat...|0.6830661954338574|
|     83|5996411961097143608|Meg'S Power Peanu...|http://www.cookbo...|[butter, chocolat...|0.6329597591775982|
|     76|5996411961097143608|Meg'S Power Peanu...|http://www.cookbo...|[butter, chocolat...|0.5325578916363407|
|     72|5996411961097143608|Meg'S Power Peanu...|http://www.cookbo...|[butter, chocolat...|0.5434275715113834|
|     69|5996411961097143608|Meg'S Power Peanu...|http://www.cookbo...|[butter, chocolat...|  0.51783053

In [16]:
# Get top recommendations by collecting the recommended_recipe_id into a list for each recipe_id

subset = (
    top_user_similarity_recommendations.alias("df1")
    .join(
        df.alias("df2"),
        F.array_contains(
            top_user_similarity_recommendations.recommended_users, df.user_id
        ),
    )
    .select(
        "df1.user_id",
        "df1.recommended_users",
        "df2.recipe_id",
        "df2.title",
        "df2.link",
        "df2.ingredients",
    )
)


# Random recipe sampling for each reommended user
recipe_collaborative_recommendations = subset.groupBy("user_id").agg(
    F.slice(
        F.shuffle(
            F.collect_list(
                F.struct(
                    F.col("recipe_id"),
                    F.col("title"),
                    F.col("link"),
                    F.col("ingredients"),
                )
            )
        ),
        1,
        RECOMMENDATION_LIMIT,
    ).alias("recommended_recipes"),
)

display_spark_dataframe(recipe_collaborative_recommendations, 5)

                                                                                

user_id,recommended_recipes
76,"[(7226555342600168366, Crustless Pumpkin Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=309992, [cinnamon, eggs, ginger, honey, milk, nutmeg, pumpkin, salt]), (6315088686442183061, Pineapple Souffle, http://www.cookbooks.com/Recipe-Details.aspx?id=870995, [bread, butter, cinnamon, eggs, milk, pecans, pineapple, sugar, sugar]), (-5665958477676335033, Foolproof Chocolate Fudge, http://www.cookbooks.com/Recipe-Details.aspx?id=48056, [condensed milk, nuts, salt, semi-sweet chocolate morsels, vanilla extract]), (-3491060866410877481, Never Fail Pie Crust(Double Pie Crust) , http://www.cookbooks.com/Recipe-Details.aspx?id=994414, [egg, flour, salt, shortening, sugar, vinegar, water]), (-6669779999682413337, Scotch Broth, http://www.cookbooks.com/Recipe-Details.aspx?id=166538, [carrots, celery, meaty soup bones, onion, pearl barley, peppercorns, salt, turnips, water])]"
29,"[(-2682508455536574038, Ice Cream, http://www.cookbooks.com/Recipe-Details.aspx?id=13922, [cream, eggs, milk, milk, salt, sugar, vanilla]), (-2548822472067743108, Chocolate Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=490390, [boiling water, clump, cocoa, egg yolks, flour, milk, sugar, vanilla]), (3729704041729540460, Heath Bar Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=976718, [chocolate curls, graham cracker pie crust]), (8847943492650148032, Low Cholesterol Ice Cream, http://www.cookbooks.com/Recipe-Details.aspx?id=289172, [Egg Scramblers, milk, milk, sugar, vanilla]), (-3427706341014057740, No Bake Persimmon Pudding, http://www.cookbooks.com/Recipe-Details.aspx?id=387528, [graham cracker crumbs, marshmallows, peanuts, persimmon pulp, powdered sugar, whipped cream])]"
88,"[(-3481376460629089103, German Potato Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=949761, [bacon, bacon fat, celery, eggs, flour, onion, pepper, potatoes, salt, sugar, vinegar, water]), (1431377674314182701, Favorite Chocolate Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=684926, [baking powder, buttermilk, cocoa, eggs, flour, red food coloring, salt, shortening, soda, sugar]), (-2243084374492544274, Betsy'S Chocolate Fudge Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=858480, [baking powder, boiling water, brown sugar, butter, eggs, flour, salt, sour cream, unsweetened chocolate, vanilla]), (1429935095559871731, Ham And Country Dumplings, http://www.cookbooks.com/Recipe-Details.aspx?id=681333, [cold water, eggs, flour, ham bone, ham bouillon]), (2394488575882758118, Blueberry Surprise, http://www.cookbooks.com/Recipe-Details.aspx?id=14352, [graham cracker crumbs, granulated sugar, margarine])]"
62,"[(673815503209316634, Lemon Fluff Jello, http://www.cookbooks.com/Recipe-Details.aspx?id=947069, [boiling water, cream cheese, lemon Jell-O, pecans, pineapple, sugar]), (6930348736077852695, Vanilla Ice Cream, http://www.cookbooks.com/Recipe-Details.aspx?id=91883, [eggs, milk, milk, milk, sugar]), (-3427706341014057740, No Bake Persimmon Pudding, http://www.cookbooks.com/Recipe-Details.aspx?id=387528, [graham cracker crumbs, marshmallows, peanuts, persimmon pulp, powdered sugar, whipped cream]), (3729704041729540460, Heath Bar Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=976718, [chocolate curls, graham cracker pie crust]), (8706017513915014750, Indian Meat Loaf, http://www.cookbooks.com/Recipe-Details.aspx?id=778364, [corn meal, cream-style corn, egg, green pepper, ground beef, ground pork, onion, pepper, sage, salt, tomatoes])]"
18,"[(2667220836415796897, Beef Brisket, http://www.cookbooks.com/Recipe-Details.aspx?id=1007920, [beef brisket, catsup, cider vinegar, horseradish, mustard, onion, pepper, salt, water]), (-8810089315511597378, Brown Rice Pizza(Light) , http://www.cookbooks.com/Recipe-Details.aspx?id=168268, [Mozzarella cheese, broccoli flowerets, brown rice, egg, mushrooms, onion, oregano, tomato sauce, zucchini]), (-1413936131405738684, Dry Meat Seasoning, http://www.cookbooks.com/Recipe-Details.aspx?id=138336, [Accent, black pepper, paprika, peel powder, salt, sugar]), (-3740394205139448372, Cheese Ball, http://www.cookbooks.com/Recipe-Details.aspx?id=515966, [Worcestershire sauce, beef, celery salt, cream cheese, garlic salt, mayonnaise, olives, onion salt, parsley flakes, pimento, sharp cheese]), (1796827391485446016, Sour Cream Pound Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=436042, [Crisco, cake flour, eggs, margarine, sour cream, sugar, vanilla flavoring])]"


## Merging the two recommendations

In [17]:
final_recommendations = (
    recipe_content_based_recommendations.alias("df1")
    .join(
        recipe_collaborative_recommendations.alias("df2"),
        recipe_content_based_recommendations.user_id
        == recipe_collaborative_recommendations.user_id,
    )
    .select(
        "df1.user_id",
        col("df1.recommended_recipes").alias("content_based_recipes"),
        col("df1.similarity_scores").alias("content_based_similarity_scores"),
        col("df2.recommended_recipes").alias("collaborative_recipes"),
    )
)

In [19]:
# Example usage
display_spark_dataframe(final_recommendations, 5)

                                                                                

user_id,content_based_recipes,content_based_similarity_scores,collaborative_recipes
76,"[(-8983223843788436463, Meltaways, http://www.cookbooks.com/Recipe-Details.aspx?id=719571, [cream cheese, flour, margarine]), (-6969692039410593042, Crazy Peanut Butter Cookies, http://www.cookbooks.com/Recipe-Details.aspx?id=892363, [egg, peanut butter, sugar]), (3682129508320495524, Buckeyes(Cookies) , http://www.cookbooks.com/Recipe-Details.aspx?id=1022822, [Rice Krispies, chocolate chips, crunchy peanut butter, oleo, paraffin, powdered sugar]), (-3633718265606525894, Lemon Chip Cookies, http://www.cookbooks.com/Recipe-Details.aspx?id=639713, [baking powder, cream cheese, eggs, flour, flour, lemon chips, lemon juice, margarine, sugar, sugar]), (-3928874728248188262, Blueberry Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=303061, [baking powder, eggs, flour, floured blueberries, milk, salt, shortening, sugar, vanilla])]","[0.5010046713507873, 0.5037843864073783, 0.5106719787860023, 0.5119221734045735, 0.5258377462498738]","[(7226555342600168366, Crustless Pumpkin Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=309992, [cinnamon, eggs, ginger, honey, milk, nutmeg, pumpkin, salt]), (6315088686442183061, Pineapple Souffle, http://www.cookbooks.com/Recipe-Details.aspx?id=870995, [bread, butter, cinnamon, eggs, milk, pecans, pineapple, sugar, sugar]), (-5665958477676335033, Foolproof Chocolate Fudge, http://www.cookbooks.com/Recipe-Details.aspx?id=48056, [condensed milk, nuts, salt, semi-sweet chocolate morsels, vanilla extract]), (-3491060866410877481, Never Fail Pie Crust(Double Pie Crust) , http://www.cookbooks.com/Recipe-Details.aspx?id=994414, [egg, flour, salt, shortening, sugar, vinegar, water]), (-6669779999682413337, Scotch Broth, http://www.cookbooks.com/Recipe-Details.aspx?id=166538, [carrots, celery, meaty soup bones, onion, pearl barley, peppercorns, salt, turnips, water])]"
29,"[(3088604997597712054, Phylis' Pineapple-Banana Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=682439, [bananas, pineapple]), (-3251620781098817209, Fruit Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=267592, [baking powder, candied cherries, chocolate chips, eggs, flour, nuts, salt, sugar]), (-4573706575222033830, Party Pink Punch, http://www.cookbooks.com/Recipe-Details.aspx?id=976793, [cranberry juice, frozen lemonade, liters ginger ale, pineapple juice, sugar]), (4248772492409983307, Rolls, http://www.cookbooks.com/Recipe-Details.aspx?id=824206, [butter, eggs, flour, milk, salt, sugar, yeast]), (-4525960110849332048, Fruity Ice, http://www.cookbooks.com/Recipe-Details.aspx?id=710858, [applesauce, bananas, cinnamon, crushed ice, oranges])]","[0.5048953457457137, 0.5053284887621573, 0.5100188652663266, 0.5138964350751841, 0.5151694513576688]","[(-2682508455536574038, Ice Cream, http://www.cookbooks.com/Recipe-Details.aspx?id=13922, [cream, eggs, milk, milk, salt, sugar, vanilla]), (-2548822472067743108, Chocolate Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=490390, [boiling water, clump, cocoa, egg yolks, flour, milk, sugar, vanilla]), (3729704041729540460, Heath Bar Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=976718, [chocolate curls, graham cracker pie crust]), (8847943492650148032, Low Cholesterol Ice Cream, http://www.cookbooks.com/Recipe-Details.aspx?id=289172, [Egg Scramblers, milk, milk, sugar, vanilla]), (-3427706341014057740, No Bake Persimmon Pudding, http://www.cookbooks.com/Recipe-Details.aspx?id=387528, [graham cracker crumbs, marshmallows, peanuts, persimmon pulp, powdered sugar, whipped cream])]"
88,"[(7065949700082226446, Vegetable Soup, http://www.cookbooks.com/Recipe-Details.aspx?id=386645, [Italian seasoning, bay leaf, beans, carrots, celery, green sweet peppers, ground turkey, macaroni, onion, potato, sweet peas, tomato juice]), (-1227772323698415345, Mango With Tomatoes And Scallions Or Leeks, http://www.cookbooks.com/Recipe-Details.aspx?id=681988, [lemon juice, mango, mustard, safflower oil, scallions, tomatoes]), (251365204153554229, Garden Cabbage Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=1004617, [Dressing, cabbage, carrot, cucumber, green onions, green pepper, radishes]), (-2754011034698263661, Sour Cream Meat Loaf, http://www.cookbooks.com/Recipe-Details.aspx?id=459058, [apple, bacon, carrot, dry mustard, eggs, ground lean beef, nutmeg, onion, pepper, potato, salt, sour cream]), (-4584826163288133981, Hattie Coffer'S Kool-Aid Punch, http://www.cookbooks.com/Recipe-Details.aspx?id=776286, [ginger ale, lemon juice, pineapple juice, sugar, water, water])]","[0.5004700083795904, 0.5017016327021121, 0.5027673105654547, 0.5031007067113923, 0.5078038941276417]","[(-3481376460629089103, German Potato Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=949761, [bacon, bacon fat, celery, eggs, flour, onion, pepper, potatoes, salt, sugar, vinegar, water]), (1431377674314182701, Favorite Chocolate Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=684926, [baking powder, buttermilk, cocoa, eggs, flour, red food coloring, salt, shortening, soda, sugar]), (-2243084374492544274, Betsy'S Chocolate Fudge Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=858480, [baking powder, boiling water, brown sugar, butter, eggs, flour, salt, sour cream, unsweetened chocolate, vanilla]), (1429935095559871731, Ham And Country Dumplings, http://www.cookbooks.com/Recipe-Details.aspx?id=681333, [cold water, eggs, flour, ham bone, ham bouillon]), (2394488575882758118, Blueberry Surprise, http://www.cookbooks.com/Recipe-Details.aspx?id=14352, [graham cracker crumbs, granulated sugar, margarine])]"
62,"[(-3353573329779529551, Cranberry Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=427420, [cranberry sauce, cream cheese, margarine, pineapple, sugar, walnuts, whipping cream]), (3682129508320495524, Buckeyes(Cookies) , http://www.cookbooks.com/Recipe-Details.aspx?id=1022822, [Rice Krispies, chocolate chips, crunchy peanut butter, oleo, paraffin, powdered sugar]), (-7780409990758030266, Chocolate Mint Bars, http://www.cookbooks.com/Recipe-Details.aspx?id=590883, [chocolate syrup, eggs, flour, oleo, sugar]), (-6969692039410593042, Crazy Peanut Butter Cookies, http://www.cookbooks.com/Recipe-Details.aspx?id=892363, [egg, peanut butter, sugar]), (6164519699133745296, Magic Cookie Bars, http://www.cookbooks.com/Recipe-Details.aspx?id=343171, [butter, flaked coconut, graham cracker crumbs, milk, nuts, semi-sweet chocolate chips])]","[0.5011695093338349, 0.50375620157832, 0.5082117264029877, 0.5099061777432216, 0.5111555461598223]","[(673815503209316634, Lemon Fluff Jello, http://www.cookbooks.com/Recipe-Details.aspx?id=947069, [boiling water, cream cheese, lemon Jell-O, pecans, pineapple, sugar]), (6930348736077852695, Vanilla Ice Cream, http://www.cookbooks.com/Recipe-Details.aspx?id=91883, [eggs, milk, milk, milk, sugar]), (-3427706341014057740, No Bake Persimmon Pudding, http://www.cookbooks.com/Recipe-Details.aspx?id=387528, [graham cracker crumbs, marshmallows, peanuts, persimmon pulp, powdered sugar, whipped cream]), (3729704041729540460, Heath Bar Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=976718, [chocolate curls, graham cracker pie crust]), (8706017513915014750, Indian Meat Loaf, http://www.cookbooks.com/Recipe-Details.aspx?id=778364, [corn meal, cream-style corn, egg, green pepper, ground beef, ground pork, onion, pepper, sage, salt, tomatoes])]"
18,"[(-419092492216335800, Quick Macaroons, http://www.cookbooks.com/Recipe-Details.aspx?id=306623, [all-purpose flour, coconut, cream of tartar, egg whites, lemon juice, sugar]), (6717102631624880744, Original Mayfair Dressing, http://www.cookbooks.com/Recipe-Details.aspx?id=857466, [Accent, anchovies, black pepper, eggs, garlic, lemon juice, mustard, oil, onion, stalks celery]), (-9058629457183933290, Vegetable-Burger Soup, http://www.cookbooks.com/Recipe-Details.aspx?id=822015, [frozen mixed vegetables, ground beef, onion soup, sugar, tomato sauce, tomatoes, water]), (-3921871239571114018, Dick'S Grilled Boneless Leg Of Lamb, http://www.cookbooks.com/Recipe-Details.aspx?id=334439, [fresh ginger, garlic, honey, lamb, onion, peanut, pepper, shoyu]), (566745543695458467, Layered Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=314817, [Cheddar cheese, bacon, dressing, eggs, frozen peas, mayonnaise, shredded lettuce, shredded lettuce, sour cream, water chestnuts])]","[0.5004718077779934, 0.5006730312482796, 0.5007013756545885, 0.5010236166094441, 0.5015494158299932]","[(2667220836415796897, Beef Brisket, http://www.cookbooks.com/Recipe-Details.aspx?id=1007920, [beef brisket, catsup, cider vinegar, horseradish, mustard, onion, pepper, salt, water]), (-8810089315511597378, Brown Rice Pizza(Light) , http://www.cookbooks.com/Recipe-Details.aspx?id=168268, [Mozzarella cheese, broccoli flowerets, brown rice, egg, mushrooms, onion, oregano, tomato sauce, zucchini]), (-1413936131405738684, Dry Meat Seasoning, http://www.cookbooks.com/Recipe-Details.aspx?id=138336, [Accent, black pepper, paprika, peel powder, salt, sugar]), (-3740394205139448372, Cheese Ball, http://www.cookbooks.com/Recipe-Details.aspx?id=515966, [Worcestershire sauce, beef, celery salt, cream cheese, garlic salt, mayonnaise, olives, onion salt, parsley flakes, pimento, sharp cheese]), (1796827391485446016, Sour Cream Pound Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=436042, [Crisco, cake flour, eggs, margarine, sour cream, sugar, vanilla flavoring])]"
