In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StringType,
    ArrayType,
    DoubleType,
    StructType,
    StructField,
)

# We could use the embedders provided by spark nlp, but we will use the sentence_transformers library (I just have more experience with it)
# spark nlp embedders: https://www.johnsnowlabs.com/understanding-the-power-of-transformers-a-guide-to-sentence-embeddings-in-spark-nlp/
from sentence_transformers import SentenceTransformer


from tqdm import tqdm

tqdm.pandas()

import numpy as np
import re
import json

SEED = 42  # Set your desired seed for reproducibility

# increase memory and reduce cores to avoid memory errors
# Initialize Spark session
spark = (
    SparkSession.builder.appName("Recipe Recommender System with Embeddings")
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .getOrCreate()
)

  from tqdm.autonotebook import tqdm, trange
24/10/14 21:49:13 WARN Utils: Your hostname, Maximes-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.97 instead (on interface en0)
24/10/14 21:49:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/14 21:49:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/14 21:49:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 52098)
Traceback (most recent call last):
  File "/Users/maximebonnesoeur/.pyenv/versions/3.11.9/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/maximebonnesoeur/.pyenv/versions/3.11.9/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/Users/maximebonnesoeur/.pyenv/versions/3.11.9/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/maximebonnesoeur/.pyenv/versions/3.11.9/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/Users/maximebonnesoeur/.pyenv/versions/3.11.9/envs/bring/lib/python3.11/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/Users/maximebonnesoeur/.pyenv/v

In [None]:
# Load pre-trained sentence embedder
embedder = SentenceTransformer("all-MiniLM-L6-v2")

embedder_bc = spark.sparkContext.broadcast(embedder)

In [None]:
from IPython.display import HTML


def display_spark_dataframe(df, num_rows=100):
    # Convert the Spark DataFrame to Pandas for display purposes
    pandas_df = df.limit(num_rows).toPandas()

    # Generate HTML table
    html = pandas_df.to_html(classes="table table-striped table-bordered", index=False)

    # Display the HTML in the notebook
    display(HTML(html))

In [None]:
# Load dataset
file_path = "../data/processed/users_interactions.parquet"
raw_df = spark.read.parquet(file_path)


# Inspect dataset
raw_df.printSchema()
display_spark_dataframe(raw_df, 3)

root
 |-- user_id: long (nullable = true)
 |-- recipe_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- ingredients: string (nullable = true)
 |-- link: string (nullable = true)
 |-- ratings: long (nullable = true)
 |-- import_date: double (nullable = true)



user_id,recipe_id,title,ingredients,link,ratings,import_date
1,3236072851089661531,Sauteed Bananas,"[""bananas"", ""confectioners sugar"", ""flour"", ""butter""]",http://www.cookbooks.com/Recipe-Details.aspx?id=913543,3,1727300000.0
1,9145863324980688501,Fruit Pizza,"[""cream cheese"", ""Marshmallow Fluff"", ""blueberries""]",http://www.cookbooks.com/Recipe-Details.aspx?id=1056162,2,1726177000.0
1,6646370876398660190,Raw Gluten,"[""whole wheat flour"", ""water""]",http://www.cookbooks.com/Recipe-Details.aspx?id=1075892,4,1726609000.0


Here the issue is that the ingredients column is a string and not a list of strings. We need to convert it to a list of strings.

In [None]:
# clean the ingredients column as a list of strings
def clean_ingredients(ingredients):
    # Convert JSON string to list
    ingredients = json.loads(ingredients)
    # Remove digits and extra spaces
    return [re.sub(r"\d+", "", ingredient).strip() for ingredient in ingredients]


clean_ingredients_udf = udf(clean_ingredients, ArrayType(StringType()))
df = raw_df.withColumn("ingredients", clean_ingredients_udf("ingredients"))

In [None]:
# Inspect dataset
df.printSchema()
display_spark_dataframe(df, 5)

root
 |-- user_id: long (nullable = true)
 |-- recipe_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- link: string (nullable = true)
 |-- ratings: long (nullable = true)
 |-- import_date: double (nullable = true)



user_id,recipe_id,title,ingredients,link,ratings,import_date
1,3236072851089661531,Sauteed Bananas,"[bananas, confectioners sugar, flour, butter]",http://www.cookbooks.com/Recipe-Details.aspx?id=913543,3,1727300000.0
1,9145863324980688501,Fruit Pizza,"[cream cheese, Marshmallow Fluff, blueberries]",http://www.cookbooks.com/Recipe-Details.aspx?id=1056162,2,1726177000.0
1,6646370876398660190,Raw Gluten,"[whole wheat flour, water]",http://www.cookbooks.com/Recipe-Details.aspx?id=1075892,4,1726609000.0
1,-3751444487082958385,Buckeye Candy,"[powdered sugar, butter, peanut butter, paraffin, chocolate chips]",http://www.cookbooks.com/Recipe-Details.aspx?id=886785,5,1728510000.0
1,4663243117944940535,Beer Bread,"[flour, beer, sugar]",http://www.cookbooks.com/Recipe-Details.aspx?id=26648,1,1726782000.0


## Embedding the ingredients

Here, we will use the SentenceTransformer library to convert the ingredients into sentence embeddings.
The goal would be to use the natural language processing (NLP) model to convert the ingredients into fixed-length dense vectors.

Then, we will use the cosine similarity to find the similarity between the user's input and the recipe ingredients.

In [None]:
df.show(5)

+-------+--------------------+---------------+--------------------+--------------------+-------+-------------------+
|user_id|           recipe_id|          title|         ingredients|                link|ratings|        import_date|
+-------+--------------------+---------------+--------------------+--------------------+-------+-------------------+
|      1| 3236072851089661531|Sauteed Bananas|[bananas, confect...|http://www.cookbo...|      3|1.727300181135684E9|
|      1| 9145863324980688501|    Fruit Pizza|[cream cheese, Ma...|http://www.cookbo...|      2|1.726176981135969E9|
|      1| 6646370876398660190|     Raw Gluten|[whole wheat flou...|http://www.cookbo...|      4|1.726608981136002E9|
|      1|-3751444487082958385|  Buckeye Candy|[powdered sugar, ...|http://www.cookbo...|      5|1.728509781136032E9|
|      1| 4663243117944940535|     Beer Bread|[flour, beer, sugar]|http://www.cookbo...|      1|1.726781781136058E9|
+-------+--------------------+---------------+------------------

In [None]:
# Create a combined text column for titles and ingredients
df = df.withColumn(
    "ingredients", F.sort_array(col("ingredients"))
)  # Sort the ingredients
df = df.withColumn(
    "combined_text",
    F.concat_ws(" ", col("title"), F.concat_ws(" ", col("ingredients"))),
)


# Function to embed text using SentenceTransformer from the broadcast variable
def embed_text(text_list):
    embedder = embedder_bc.value
    embeddings = embedder.encode([text_list], batch_size=8)[
        0
    ]  # Use batch encoding with smaller batch size
    return embeddings.tolist()


embed_text_udf = udf(embed_text, ArrayType(DoubleType()))

# df = df.withColumn("text_embedding", embed_text_udf("ingredients"))

# DISCLAIMER: This udf does not work with the current version of Spark. It is a known issue and it is being worked on.

# Embed the combined text column manually converting the DataFrame to Pandas

# Convert the DataFrame to Pandas
df_pandas = df.toPandas()

# Embed the text using the SentenceTransformer
df_pandas["text_embeddings"] = df_pandas["combined_text"].progress_apply(
    lambda x: [float(val) for val in embedder.encode(x)]
)

schema_fields = df.schema.fields

# Add the 'embeddings' field to the schema
schema_fields.append(StructField("text_embeddings", ArrayType(DoubleType()), True))

# Convert the Pandas DataFrame back to a Spark DataFrame
df = spark.createDataFrame(df_pandas, schema=StructType(schema_fields))

100%|██████████| 254/254 [00:05<00:00, 44.08it/s]


In [None]:
display_spark_dataframe(df, 1)

user_id,recipe_id,title,ingredients,link,ratings,import_date,combined_text,text_embeddings
1,3236072851089661531,Sauteed Bananas,"[bananas, butter, confectioners sugar, flour]",http://www.cookbooks.com/Recipe-Details.aspx?id=913543,3,1727300000.0,Sauteed Bananas bananas butter confectioners sugar flour,"[-0.04720819368958473, -0.03552774712443352, 0.018779192119836807, 0.03397465497255325, 0.031133214011788368, 0.06753106415271759, 0.051513541489839554, 0.057281628251075745, -0.05492916330695152, 0.0643991231918335, 0.0055371420457959175, -0.053802378475666046, -0.001987747149541974, -0.12907610833644867, 0.051568109542131424, 0.05899570882320404, 0.011009857058525085, -0.004234679974615574, -0.05075608938932419, 0.008176838047802448, 0.09461259096860886, -0.022860664874315262, -0.07120402157306671, 0.01731092482805252, 0.020062299445271492, 0.11409356445074081, 0.023074593394994736, -0.044035810977220535, 0.001971392659470439, -0.02889914996922016, -0.07251840084791183, 0.0077716028317809105, 0.018685735762119293, 0.038940321654081345, -0.08879372477531433, -0.030314283445477486, 0.02326280064880848, -0.1277460902929306, -0.01483216229826212, -0.025922546163201332, 0.007503298576921225, 0.07758360356092453, 0.03214264661073685, -0.044510602951049805, -0.015773996710777283, 0.005392811726778746, 0.09347721934318542, -0.055638641119003296, 0.04330061748623848, -0.05434602126479149, 0.01774831861257553, 0.0028786538168787956, -0.02677912637591362, -0.03574324771761894, -0.011224225163459778, -0.021000251173973083, -0.11256932467222214, -0.038423504680395126, 0.0636599063873291, 0.026441724970936775, -0.00026934160268865526, 0.042558345943689346, 0.02123277261853218, -0.005700065288692713, 0.030773429200053215, -0.06500266492366791, -0.1288241744041443, 0.009803316555917263, 0.07667635381221771, 0.059749189764261246, -0.039457663893699646, 0.0008506444864906371, 0.022045910358428955, 0.026387343183159828, -0.047805216163396835, 0.03676089271903038, 0.10848471522331238, -0.03488781675696373, -0.04342992603778839, -0.013661269098520279, -0.008418469689786434, -0.05990208685398102, 0.011760327033698559, 0.01800733059644699, 0.03850390389561653, -0.007392127998173237, 0.04559975489974022, -0.032617099583148956, 0.049865733832120895, 0.00970426294952631, -0.015499268658459187, 0.016990112140774727, 0.09456205368041992, 0.04894685372710228, -0.08680940419435501, -0.05927664786577225, -0.048624590039253235, -0.09644345939159393, 0.011328055523335934, 0.10225837677717209, ...]"


Now that we have our embeddings, we can proceed with the recommendation system. 

Let's use the same approach as before, but this time we will use the embeddings instead of the TF-IDF vectors.

## Recommendation System

In [None]:
SIMILARITY_THRESHOLD = 0.5
# Recommendations for Users
RECOMMENDATION_LIMIT = 5

In [None]:
# Define a function to compute cosine similarity between two vectors
def cosine_similarity(v1, v2):
    v1 = np.array(v1)
    v2 = np.array(v2)
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    if norm_v1 == 0 or norm_v2 == 0:
        return 0.0
    return float(dot_product / (norm_v1 * norm_v2))


cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Self-join the DataFrame to calculate similarity between all recipes
cross_joined_df = df.alias("df1").crossJoin(
    df.select(["recipe_id", "text_embeddings", "title", "user_id"]).alias("df2")
)


# Calculate cosine similarity for each pair of recipes
cross_joined_df = cross_joined_df.withColumn(
    "similarity",
    cosine_similarity_udf(col("df1.text_embeddings"), col("df2.text_embeddings")),
)
cross_joined_df.printSchema()


# Select the necessary columns and alias them to avoid ambiguity
recommendations_df = cross_joined_df.filter(
    (col("df1.recipe_id") != col("df2.recipe_id"))
    & (  # Never recommend the same recipe
        col("df1.user_id") != col("df2.user_id")
    )  # Never recommend the a recipe from the same user
)

# Explicitly select and alias the columns
recommendations_df = (
    recommendations_df.select(
        col("df1.recipe_id").alias("recipe_id"),
        col("df1.user_id").alias("user_id"),
        col("df2.user_id").alias("recommended_user_id"),
        col("df1.title").alias("recipe_title"),
        col("df2.recipe_id").alias("recommended_recipe_id"),
        col("df2.title").alias("recommended_recipe_title"),
        col("similarity"),
        # Add any other columns you may need
    )
    .filter(col("similarity") > SIMILARITY_THRESHOLD)
    .orderBy(col("recipe_id"), col("similarity").desc())
)

display_spark_dataframe(recommendations_df, 5)

root
 |-- user_id: long (nullable = true)
 |-- recipe_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- link: string (nullable = true)
 |-- ratings: long (nullable = true)
 |-- import_date: double (nullable = true)
 |-- combined_text: string (nullable = false)
 |-- text_embeddings: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- recipe_id: long (nullable = true)
 |-- text_embeddings: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- title: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- similarity: double (nullable = true)



                                                                                

recipe_id,user_id,recommended_user_id,recipe_title,recommended_recipe_id,recommended_recipe_title,similarity
-9041716274990925793,1,97,"Chicken, Broccoli And Cheese Casserole",-8169634765373897883,Broccoli Cheese Soup,0.780361
-9041716274990925793,1,82,"Chicken, Broccoli And Cheese Casserole",6221270887431356133,Chicken Divan,0.730094
-9041716274990925793,1,78,"Chicken, Broccoli And Cheese Casserole",6221270887431356133,Chicken Divan,0.730094
-9041716274990925793,1,17,"Chicken, Broccoli And Cheese Casserole",-629699203677729697,Rice-Broccoli Casserole,0.722989
-9041716274990925793,1,86,"Chicken, Broccoli And Cheese Casserole",-629699203677729697,Rice-Broccoli Casserole,0.722989


This is looking quite good. We have a DataFrame with recommendations for each user. We can now filter the recommendations to get the top N recommendations for each user.

In [None]:
# Get top recommendations by collecting the recommended_recipe_id into a list for each recipe_id
top_recommendations = recommendations_df.groupby(["recipe_id", "recipe_title"]).agg(
    F.collect_list("recommended_recipe_id").alias("recommended_recipes"),
    F.collect_list("recommended_recipe_title").alias("recommended_recipe_titles"),
    F.collect_list("similarity").alias("similarity_scores"),
)
# Show Top Recommendations
display_spark_dataframe(top_recommendations, 1)

                                                                                

recipe_id,recipe_title,recommended_recipes,recommended_recipe_titles,similarity_scores
-9041716274990925793,"Chicken, Broccoli And Cheese Casserole","[-8169634765373897883, 6221270887431356133, 6221270887431356133, -629699203677729697, -629699203677729697, -629699203677729697, -7387520298671766024, -5453638496203510467, -4183142323524057486, -4183142323524057486, -422517828634671599, -422517828634671599, 6082401801560777728, 7581797391280084786, -9000007408826768581, 4100010333716968982, 7402159647861717060, -3794147428210825578, 5196214181404699774, 3278061019699120492, -1145622998892862328, 5924916736367898477, -3330603631272654854, -3330603631272654854, -7399032887253686530, 7428472426516175613, 7847888142213182408, 7847888142213182408, -7052612929375997794, 7738058011543391218, -5641060332518678774, -1031130900881771012, -1031130900881771012, -1031130900881771012, 5706119069825396294, -7641831091295751698, -3730646324417593656, 6606263598321894989, 6606263598321894989, -4152717132441885915, -6598821747478731019, 5213548222747985524, -282742890867789695, -282742890867789695, 6233456789766403456, 1167272635377215359, 6332127167687164440, -3398542533204193611, -3398542533204193611, 1560159323863281568, -3899424102060393968, -3899424102060393968, -3496996154711083177, -7675083767138170827, -7675083767138170827]","[Broccoli Cheese Soup, Chicken Divan, Chicken Divan, Rice-Broccoli Casserole, Rice-Broccoli Casserole, Rice-Broccoli Casserole, Broccoli Casserole, Chicken And Tortilla Casserole, Broccoli Casserole, Broccoli Casserole, Chicken Ole, Chicken Ole, Broccoli Rice Casserole, Chicken And Pasta Salad, Beef And Spanish Rice Casserole, Chicken Casserole, Fresh Broccoli Salad, Chicken Stroganoff, Chicken Spaghetti, Souper Tuna Crunch, Corn Casserole, Taco Dip, Turkey Vegetable Stir-Fry, Turkey Vegetable Stir-Fry, Cashew Chicken, Chicken Casserole, Cabbage Soup, Cabbage Soup, Hash Brown Casserole, Egg Casserole, Crock Pot Pizza, Macaroni And Cheese Casserole, Macaroni And Cheese Casserole, Macaroni And Cheese Casserole, Diane'S Quick Quiche, Chinese Chicken Salad, Chinese Hamburger, Fettucini With Lemon Vegetables, Fettucini With Lemon Vegetables, Chicken-Cheese Ball, Mixed Vegetable Casserole, Chicken Over Rice, Onion Casserole, Onion Casserole, Cheese Ball, Minestrone Soup, Cornbread Dressing, Summer Squash Casserole, Summer Squash Casserole, Ranch Beef And Beans, Salsa, Salsa, Potato And Cheese Pie, Jello Salad(This Is Almost Like A Dessert, Yummy.) , Jello Salad(This Is Almost Like A Dessert, Yummy.) ]","[0.7803608838171421, 0.7300943520686649, 0.7300943520686649, 0.7229892062975044, 0.7229892062975044, 0.7229892062975044, 0.6803865290730899, 0.6781993161607334, 0.6705102324953788, 0.6705102324953788, 0.6559830903297467, 0.6559830903297467, 0.655447322944649, 0.6419158617641101, 0.6347777101121902, 0.6310285464510081, 0.6276166949146768, 0.6197134102339905, 0.5964523322802119, 0.595501115050785, 0.590383262549974, 0.5886176806721224, 0.5848440194979477, 0.5848440194979477, 0.5828935088100246, 0.5789809733460842, 0.572230637620981, 0.572230637620981, 0.5706334294926784, 0.5686599057723092, 0.5658600917248362, 0.5654826006877223, 0.5654826006877223, 0.5654826006877223, 0.5638164598249653, 0.5611374150186267, 0.5525246166856328, 0.5487508105994474, 0.5487508105994474, 0.5487101347909942, 0.5426692830067809, 0.5418634804780189, 0.5348592219651425, 0.5348592219651425, 0.5333743864847311, 0.5300970693046488, 0.5257049525490975, 0.5153015570511095, 0.5153015570511095, 0.5139020305025711, 0.5135352945135078, 0.5135352945135078, 0.512340401514671, 0.511207207280639, 0.511207207280639]"


## Content-based Filtering

The approach here is to recommend a recipe to a user based on the similarity of the ingredients present in other recipes.

Let's have a look

In [None]:
subset = (
    recommendations_df.alias("df1")
    .join(
        df.alias("df2"),
        recommendations_df.recommended_recipe_id == df.recipe_id,
    )
    .select(
        col("df1.user_id"),
        col("df2.recipe_id"),
        col("df2.title"),
        col("df2.link"),
        col("df2.ingredients"),
        col("df1.similarity"),
    )
)


recipe_content_based_recommendations = (
    subset.orderBy(["user_id", "similarity"])
    .groupby(["user_id"])
    .agg(
        F.slice(
            F.collect_list(
                F.struct(
                    F.col("recipe_id"),
                    F.col("title"),
                    F.col("link"),
                    F.col("ingredients"),
                )
            ),
            1,
            RECOMMENDATION_LIMIT,
        ).alias("recommended_recipes"),
        F.slice(F.collect_list("similarity"), 1, RECOMMENDATION_LIMIT).alias(
            "similarity_scores"
        ),
    )
)
display_spark_dataframe(recipe_content_based_recommendations, 5)

                                                                                

user_id,recommended_recipes,similarity_scores
26,"[(-8005079721966657344, Lemon Pudding Pound Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=1068808, [eggs, oil, water, yellow cake mix]), (-7976200725112237420, Cold Bread Pudding, http://www.cookbooks.com/Recipe-Details.aspx?id=91723, [butter, cold, eggs, sugar, sweet milk]), (-2316211623411254567, Watergate Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=15086, [+, cool whip, eggs, ginger ale, milk, pecans, pistachio instant pudding, pistachio instant pudding, white cake mix]), (-5294039721809172201, Poppy Seed Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=264391, [almond extract, cinnamon-sugar, eggs, instant vanilla pudding, pineapple juice, poppy seed, powdered sugar, rum, shortening, white cake mix])]","[0.5110366855297126, 0.5297612655261549, 0.5495067948752781, 0.7222705581862582]"
29,"[(6393498659283924984, One Bowl Chocolate Fudge(Microwave) , http://www.cookbooks.com/Recipe-Details.aspx?id=502592, [condensed milk, nuts, salt, semi-sweet chocolate, vanilla]), (-4473752179619106509, 10 Minute Peanut Brittle, http://www.cookbooks.com/Recipe-Details.aspx?id=353985, [butter, light corn syrup, peanuts, salt, soda, sugar, vanilla]), (2125963217123002841, Nana'S Cornbread(For 9-Inch Iron Skillet Or 8 X 8-Inch Pan) , http://www.cookbooks.com/Recipe-Details.aspx?id=917184, [bacon fat, baking soda, buttermilk, corn meal, egg, flour, salt, sugar]), (2125963217123002841, Nana'S Cornbread(For 9-Inch Iron Skillet Or 8 X 8-Inch Pan) , http://www.cookbooks.com/Recipe-Details.aspx?id=917184, [bacon fat, baking soda, buttermilk, corn meal, egg, flour, salt, sugar]), (2125963217123002841, Nana'S Cornbread(For 9-Inch Iron Skillet Or 8 X 8-Inch Pan) , http://www.cookbooks.com/Recipe-Details.aspx?id=917184, [bacon fat, baking soda, buttermilk, corn meal, egg, flour, salt, sugar])]","[0.5007367808482274, 0.5016363648040536, 0.5016522638987081, 0.5016522638987081, 0.5016522638987081]"
65,"[(-2613806832173886595, Gold Rush Brunch, http://www.cookbooks.com/Recipe-Details.aspx?id=769837, [bacon, butter, eggs, flour, milk, parsley, potatoes O'Brien, salt, sour cream]), (9145863324980688501, Fruit Pizza, http://www.cookbooks.com/Recipe-Details.aspx?id=1056162, [Marshmallow Fluff, blueberries, cream cheese]), (5921757842749730267, Annie'S Diabetic Candy, http://www.cookbooks.com/Recipe-Details.aspx?id=942266, [butter, cream cheese, peanut butter, vanilla]), (-3544503804847503851, Ambrosia Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=472909, [flaked coconut, mandarin oranges, maraschino cherries, marshmallows, pineapple, sour cream]), (-3544503804847503851, Ambrosia Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=472909, [flaked coconut, mandarin oranges, maraschino cherries, marshmallows, pineapple, sour cream])]","[0.5001183510105831, 0.5003666473920033, 0.5014627496293429, 0.5015633587763452, 0.5015633587763452]"
19,"[(-2316211623411254567, Watergate Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=15086, [+, cool whip, eggs, ginger ale, milk, pecans, pistachio instant pudding, pistachio instant pudding, white cake mix]), (-703446942390088871, Rhubarb Crumble, http://www.cookbooks.com/Recipe-Details.aspx?id=458858, [brown sugar, butter, cinnamon, cornstarch, flour, rhubarb, rolled oats, sugar, vanilla, water]), (4613396634172017573, Gooey Butter Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=118624, [butter, chocolate cake, cream cheese, eggs, powdered sugar, vanilla]), (-3133213768206477946, Preacher Cookies, http://www.cookbooks.com/Recipe-Details.aspx?id=774571, [butter, milk, oats, powdered cocoa, sugar, vanilla]), (-5294039721809172201, Poppy Seed Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=264391, [almond extract, cinnamon-sugar, eggs, instant vanilla pudding, pineapple juice, poppy seed, powdered sugar, rum, shortening, white cake mix])]","[0.5003431777574933, 0.5038377159519393, 0.5191419476991813, 0.5285763078845723, 0.5287125290257425]"
54,"[(-8783939955479050182, Chocolate Macaroon Bars, http://www.cookbooks.com/Recipe-Details.aspx?id=693256, [bread crumbs, chocolate chips, cocoa, coconut flakes, condensed milk, eggs, graham cracker crumbs, margarine, sugar, vanilla extract]), (3993719957147299301, Meat Loaf, http://www.cookbooks.com/Recipe-Details.aspx?id=517611, [Worcestershire sauce, catsup, egg, ground round, milk, onion, pepper, salt, whole wheat bread]), (-7387520298671766024, Broccoli Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=693123, [broccoli, cream of celery soup, margarine, onion, rice]), (-7313861574581366223, Spanish Hamburger #1, http://www.cookbooks.com/Recipe-Details.aspx?id=801085, [celery, green pepper, hamburger, salt, tomato soup, white onion]), (5042635200645017822, Crispy Herb Bread, http://www.cookbooks.com/Recipe-Details.aspx?id=26201, [Parmesan cheese, bread, garlic powder, marjoram, olive oil, paprika, thyme])]","[0.5001765427743281, 0.5012558998606927, 0.502377928759969, 0.5033209105362157, 0.5033834227312081]"


With this first approach, each user gets some recommendations based on the recipe similar to theirs.

However, this approach breeds little diversity in the recommendations. Let's thus observe another approach to recommend recipes to users.

## Collaborative Filtering

In this approach, we will get for each users the users that have similar taste to them, meaning users that have used similar ingredients in their recipe.

Then, we will sample randomly a recipe from the users that have similar taste to the user and that the user has not already tried.

In [None]:
# Recommendations for Users
RECOMMENDATION_LIMIT = 5
SIMILARITY_THRESHOLD_USERS = 0.7  # We want really similar users


# Get top recommendations by collecting the recommended_recipe_id into a list for each recipe_id
top_user_similarity_recommendations = (
    recommendations_df.orderBy(["user_id"])
    .groupby(["user_id", "recommended_user_id"])
    .agg(
        F.avg("similarity").alias("average_similarity"),
    )
    .orderBy(["user_id", "average_similarity"], ascending=[True, False])
    .withColumnRenamed("user_id", "user_id")
)

top_user_similarity_recommendations = (
    top_user_similarity_recommendations.filter(
        col("average_similarity") > SIMILARITY_THRESHOLD_USERS
    )
    .groupBy("user_id")
    .agg(
        F.slice(F.collect_list("recommended_user_id"), 1, RECOMMENDATION_LIMIT).alias(
            "recommended_users"
        ),
        F.slice(F.collect_list("average_similarity"), 1, RECOMMENDATION_LIMIT).alias(
            "average_similarity_scores"
        ),
    )
)

display_spark_dataframe(top_user_similarity_recommendations, 5)

                                                                                

user_id,recommended_users,average_similarity_scores
2,[46],[0.7177783597897859]
4,[7],[0.7062547522466481]
7,[4],[0.7062547522466481]
9,"[67, 15, 32, 65]","[0.7727806160936544, 0.7350564238146571, 0.7251965111118398, 0.719073136654869]"
10,[40],[0.7148448527438211]


In [None]:
subset.show()



+-------+--------------------+--------------------+--------------------+--------------------+------------------+
|user_id|           recipe_id|               title|                link|         ingredients|        similarity|
+-------+--------------------+--------------------+--------------------+--------------------+------------------+
|      1|-4765217018562870239|Do-Unkles Pumpkin...|http://www.cookbo...|[baking powder, c...|0.5465544975887536|
|      1|-4765217018562870239|Do-Unkles Pumpkin...|http://www.cookbo...|[baking powder, c...|0.6197122986947743|
|      1|-4765217018562870239|Do-Unkles Pumpkin...|http://www.cookbo...|[baking powder, c...|0.5095642078274377|
|      1|-4765217018562870239|Do-Unkles Pumpkin...|http://www.cookbo...|[baking powder, c...|0.6765102593266795|
|      2| 9145863324980688501|         Fruit Pizza|http://www.cookbo...|[Marshmallow Fluf...|0.5011503480185645|
|      2| 9145863324980688501|         Fruit Pizza|http://www.cookbo...|[Marshmallow Fluf...|0.5

                                                                                

In [None]:
# Get top recommendations by collecting the recommended_recipe_id into a list for each recipe_id

subset = (
    top_user_similarity_recommendations.alias("df1")
    .join(
        df.alias("df2"),
        F.array_contains(
            top_user_similarity_recommendations.recommended_users, df.user_id
        ),
    )
    .select(
        "df1.user_id",
        "df1.recommended_users",
        "df2.recipe_id",
        "df2.title",
        "df2.link",
        "df2.ingredients",
    )
)


# Random recipe sampling for each reommended user
recipe_collaborative_recommendations = subset.groupBy("user_id").agg(
    F.slice(
        F.shuffle(
            F.collect_list(
                F.struct(
                    F.col("recipe_id"),
                    F.col("title"),
                    F.col("link"),
                    F.col("ingredients"),
                )
            )
        ),
        1,
        RECOMMENDATION_LIMIT,
    ).alias("recommended_recipes"),
)

display_spark_dataframe(recipe_collaborative_recommendations, 5)

                                                                                

user_id,recommended_recipes
46,"[(-629699203677729697, Rice-Broccoli Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=184755, [broccoli, celery, cream of chicken soup, cream of mushroom soup, frozen broccoli florets, margarine, onion, rice]), (-629699203677729697, Rice-Broccoli Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=184755, [broccoli, celery, cream of chicken soup, cream of mushroom soup, frozen broccoli florets, margarine, onion, rice]), (8676006650672359005, Creole Flounder, http://www.cookbooks.com/Recipe-Details.aspx?id=580768, [basil, drops red pepper sauce, flounder, green pepper, green pepper, ground black pepper, lemon juice, onion, salad oil, salt, tomatoes]), (-4444908957552929730, Heavenly Hash, http://www.cookbooks.com/Recipe-Details.aspx?id=269997, [marshmallows, pineapple, sugar, vanilla, whipping cream]), (131900849733992621, Date Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=206398, [baking soda, dates, egg, flour, margarine, salt, sugar, walnuts, water])]"
7,"[(-703446942390088871, Rhubarb Crumble, http://www.cookbooks.com/Recipe-Details.aspx?id=458858, [brown sugar, butter, cinnamon, cornstarch, flour, rhubarb, rolled oats, sugar, vanilla, water]), (-7815179659791359312, Prize-Winning Meat Loaf, http://www.cookbooks.com/Recipe-Details.aspx?id=923674, [egg, ground beef, oats, onion, pepper, salt, tomato juice]), (-9025251382900008244, Marinated Cucumber And Sweet Onion Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=294971, [cucumbers, dark vinegar, pepper, salt, sugar, sweet onions, water]), (-1145622998892862328, Corn Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=1023122, [Cheddar cheese, corn, corn, corn muffin, eggs, margarine, onion, red]), (-7976200725112237420, Cold Bread Pudding, http://www.cookbooks.com/Recipe-Details.aspx?id=91723, [butter, cold, eggs, sugar, sweet milk])]"
67,"[(-2057475624038522420, Breakfast Rolls, http://www.cookbooks.com/Recipe-Details.aspx?id=118116, [brown sugar, butter, butterscotch pudding, cinnamon, dinner rolls, nuts]), (-7965593088727689504, Mexican Chicken, http://www.cookbooks.com/Recipe-Details.aspx?id=524201, [Doritos, Ro-Tel, cheese, chicken, cream of chicken soup, cream of mushroom soup, onion]), (-3447600827509936262, Lemon Chess Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=341588, [butter, cornmeal, eggs, flour, lemon juice, milk, sugar]), (-3730646324417593656, Chinese Hamburger, http://www.cookbooks.com/Recipe-Details.aspx?id=130210, [celery, cream of chicken soup, cream of mushroom soup, hamburger, noodles, oil, onion, pepper, rice, soy sauce, water]), (7428472426516175613, Chicken Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=661800, [bite, cream of chicken, mushrooms, onion soup, rice])]"
4,"[(-3995740312499116500, Corn Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=899479, [corn, corn, corn muffin, eggs, oleo, sour cream])]"
15,"[(-3447600827509936262, Lemon Chess Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=341588, [butter, cornmeal, eggs, flour, lemon juice, milk, sugar]), (-4444908957552929730, Heavenly Hash, http://www.cookbooks.com/Recipe-Details.aspx?id=269997, [marshmallows, pineapple, sugar, vanilla, whipping cream]), (7428472426516175613, Chicken Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=661800, [bite, cream of chicken, mushrooms, onion soup, rice]), (2545251486700500673, Sesame Ginger Chicken, http://www.cookbooks.com/Recipe-Details.aspx?id=352931, [chicken, grated ginger, honey, sesame seed, soy sauce, thin green onion strips, vegetable cooking spray]), (-2057475624038522420, Breakfast Rolls, http://www.cookbooks.com/Recipe-Details.aspx?id=118116, [brown sugar, butter, butterscotch pudding, cinnamon, dinner rolls, nuts])]"


## Merging the two recommendations

In [None]:
final_recommendations = (
    recipe_content_based_recommendations.alias("df1")
    .join(
        recipe_collaborative_recommendations.alias("df2"),
        recipe_content_based_recommendations.user_id
        == recipe_collaborative_recommendations.user_id,
    )
    .select(
        "df1.user_id",
        col("df1.recommended_recipes").alias("content_based_recipes"),
        col("df1.similarity_scores").alias("content_based_similarity_scores"),
        col("df2.recommended_recipes").alias("collaborative_recipes"),
    )
)

In [None]:
# Example usage
display_spark_dataframe(final_recommendations, 5)

                                                                                

user_id,content_based_recipes,content_based_similarity_scores,collaborative_recipes
46,"[(5042635200645017822, Crispy Herb Bread, http://www.cookbooks.com/Recipe-Details.aspx?id=26201, [Parmesan cheese, bread, garlic powder, marjoram, olive oil, paprika, thyme]), (7581797391280084786, Chicken And Pasta Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=342068, [almonds, chicken breasts, egg noodles, green grapes]), (-8794556939154468327, Taco Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=113139, [cheese, cottage cheese, garlic powder, ground beef, onion, sour cream, taco, tomato sauce, tortilla chips]), (-8794556939154468327, Taco Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=113139, [cheese, cottage cheese, garlic powder, ground beef, onion, sour cream, taco, tomato sauce, tortilla chips]), (-8794556939154468327, Taco Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=113139, [cheese, cottage cheese, garlic powder, ground beef, onion, sour cream, taco, tomato sauce, tortilla chips])]","[0.5002246868021848, 0.502377928759969, 0.5029004955692796, 0.5029004955692796, 0.5029004955692796]","[(-629699203677729697, Rice-Broccoli Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=184755, [broccoli, celery, cream of chicken soup, cream of mushroom soup, frozen broccoli florets, margarine, onion, rice]), (-629699203677729697, Rice-Broccoli Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=184755, [broccoli, celery, cream of chicken soup, cream of mushroom soup, frozen broccoli florets, margarine, onion, rice]), (8676006650672359005, Creole Flounder, http://www.cookbooks.com/Recipe-Details.aspx?id=580768, [basil, drops red pepper sauce, flounder, green pepper, green pepper, ground black pepper, lemon juice, onion, salad oil, salt, tomatoes]), (-4444908957552929730, Heavenly Hash, http://www.cookbooks.com/Recipe-Details.aspx?id=269997, [marshmallows, pineapple, sugar, vanilla, whipping cream]), (131900849733992621, Date Cake, http://www.cookbooks.com/Recipe-Details.aspx?id=206398, [baking soda, dates, egg, flour, margarine, salt, sugar, walnuts, water])]"
7,"[(-3447600827509936262, Lemon Chess Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=341588, [butter, cornmeal, eggs, flour, lemon juice, milk, sugar]), (-6598821747478731019, Mixed Vegetable Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=544573, [celery, cheese, mayonnaise, mixed vegetables, onions, salt]), (1398684608330795972, Speedy Little Devils, http://www.cookbooks.com/Recipe-Details.aspx?id=232383, [cake mix, marshmallow cream, oleo, peanut butter]), (2125963217123002841, Nana'S Cornbread(For 9-Inch Iron Skillet Or 8 X 8-Inch Pan) , http://www.cookbooks.com/Recipe-Details.aspx?id=917184, [bacon fat, baking soda, buttermilk, corn meal, egg, flour, salt, sugar]), (2125963217123002841, Nana'S Cornbread(For 9-Inch Iron Skillet Or 8 X 8-Inch Pan) , http://www.cookbooks.com/Recipe-Details.aspx?id=917184, [bacon fat, baking soda, buttermilk, corn meal, egg, flour, salt, sugar])]","[0.5019550248671601, 0.5053076600702187, 0.5107391250659817, 0.5126371873728435, 0.5126371873728435]","[(-703446942390088871, Rhubarb Crumble, http://www.cookbooks.com/Recipe-Details.aspx?id=458858, [brown sugar, butter, cinnamon, cornstarch, flour, rhubarb, rolled oats, sugar, vanilla, water]), (-7815179659791359312, Prize-Winning Meat Loaf, http://www.cookbooks.com/Recipe-Details.aspx?id=923674, [egg, ground beef, oats, onion, pepper, salt, tomato juice]), (-9025251382900008244, Marinated Cucumber And Sweet Onion Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=294971, [cucumbers, dark vinegar, pepper, salt, sugar, sweet onions, water]), (-1145622998892862328, Corn Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=1023122, [Cheddar cheese, corn, corn, corn muffin, eggs, margarine, onion, red]), (-7976200725112237420, Cold Bread Pudding, http://www.cookbooks.com/Recipe-Details.aspx?id=91723, [butter, cold, eggs, sugar, sweet milk])]"
67,"[(6070472271589565164, Corn ""Oysters"", http://www.cookbooks.com/Recipe-Details.aspx?id=1059690, [crackers, cream-style corn, eggs, onion, salt]), (-3496996154711083177, Potato And Cheese Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=784386, [Cheddar cheese, eggs, green onions, pepper, potatoes, salt]), (-6598821747478731019, Mixed Vegetable Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=544573, [celery, cheese, mayonnaise, mixed vegetables, onions, salt]), (4380568420300765492, Original Mayfair Dressing, http://www.cookbooks.com/Recipe-Details.aspx?id=857466, [Accent, anchovies, black pepper, eggs, garlic, lemon juice, mustard, oil, onion, stalks celery]), (5912469327965089621, Zucchini Squares, http://www.cookbooks.com/Recipe-Details.aspx?id=239356, [Parmesan, clove garlic, eggs, oil, onion, oregano, parsley, salt, salt, zucchini])]","[0.5000602005850349, 0.5024647226404425, 0.5048231795208801, 0.5056792500650101, 0.5060978447089635]","[(-2057475624038522420, Breakfast Rolls, http://www.cookbooks.com/Recipe-Details.aspx?id=118116, [brown sugar, butter, butterscotch pudding, cinnamon, dinner rolls, nuts]), (-7965593088727689504, Mexican Chicken, http://www.cookbooks.com/Recipe-Details.aspx?id=524201, [Doritos, Ro-Tel, cheese, chicken, cream of chicken soup, cream of mushroom soup, onion]), (-3447600827509936262, Lemon Chess Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=341588, [butter, cornmeal, eggs, flour, lemon juice, milk, sugar]), (-3730646324417593656, Chinese Hamburger, http://www.cookbooks.com/Recipe-Details.aspx?id=130210, [celery, cream of chicken soup, cream of mushroom soup, hamburger, noodles, oil, onion, pepper, rice, soy sauce, water]), (7428472426516175613, Chicken Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=661800, [bite, cream of chicken, mushrooms, onion soup, rice])]"
4,"[(6242106399901263890, Vegetable Beef Soup, http://www.cookbooks.com/Recipe-Details.aspx?id=846408, [celery, ground beef, mixed vegetables, onion, pepper, potatoes, salt, tomato soup, tomatoes, water]), (6242106399901263890, Vegetable Beef Soup, http://www.cookbooks.com/Recipe-Details.aspx?id=846408, [celery, ground beef, mixed vegetables, onion, pepper, potatoes, salt, tomato soup, tomatoes, water]), (6242106399901263890, Vegetable Beef Soup, http://www.cookbooks.com/Recipe-Details.aspx?id=846408, [celery, ground beef, mixed vegetables, onion, pepper, potatoes, salt, tomato soup, tomatoes, water]), (6242106399901263890, Vegetable Beef Soup, http://www.cookbooks.com/Recipe-Details.aspx?id=846408, [celery, ground beef, mixed vegetables, onion, pepper, potatoes, salt, tomato soup, tomatoes, water]), (-3089260678803287909, Sky High Biscuits, http://www.cookbooks.com/Recipe-Details.aspx?id=805008, [baking powder, butter, cream of tartar, egg, flour, milk, salt, sugar])]","[0.500575777260491, 0.500575777260491, 0.500575777260491, 0.500575777260491, 0.5006975152085524]","[(-3995740312499116500, Corn Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=899479, [corn, corn, corn muffin, eggs, oleo, sour cream])]"
15,"[(-5968552099726038111, German Potato Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=567794, [bacon, cider vinegar, hot bacon, new potatoes, onions, parsley, salt, stalks celery]), (-8284361199212835094, Chewy Oatmeal Cookies, http://www.cookbooks.com/Recipe-Details.aspx?id=283076, [baking soda, brown sugar, butter flavor, cinnamon, egg, flour, milk, oats, raisins, salt, vanilla flavor, walnuts]), (4380568420300765492, Original Mayfair Dressing, http://www.cookbooks.com/Recipe-Details.aspx?id=857466, [Accent, anchovies, black pepper, eggs, garlic, lemon juice, mustard, oil, onion, stalks celery]), (-6345232053553097226, Sweet And Sour Red Cabbage, http://www.cookbooks.com/Recipe-Details.aspx?id=806317, [caraway, cider vinegar, onion, pepper, red cabbage, salt, salt, sugar]), (850196355257936730, Frozen Salad, http://www.cookbooks.com/Recipe-Details.aspx?id=733532, [Miracle, marshmallows, pimento cheese, pineapple, walnuts, whipping cream])]","[0.5005355171806503, 0.5034117242731613, 0.5037127584375559, 0.5052120106175259, 0.5060000310585574]","[(-3447600827509936262, Lemon Chess Pie, http://www.cookbooks.com/Recipe-Details.aspx?id=341588, [butter, cornmeal, eggs, flour, lemon juice, milk, sugar]), (-4444908957552929730, Heavenly Hash, http://www.cookbooks.com/Recipe-Details.aspx?id=269997, [marshmallows, pineapple, sugar, vanilla, whipping cream]), (7428472426516175613, Chicken Casserole, http://www.cookbooks.com/Recipe-Details.aspx?id=661800, [bite, cream of chicken, mushrooms, onion soup, rice]), (2545251486700500673, Sesame Ginger Chicken, http://www.cookbooks.com/Recipe-Details.aspx?id=352931, [chicken, grated ginger, honey, sesame seed, soy sauce, thin green onion strips, vegetable cooking spray]), (-2057475624038522420, Breakfast Rolls, http://www.cookbooks.com/Recipe-Details.aspx?id=118116, [brown sugar, butter, butterscotch pudding, cinnamon, dinner rolls, nuts])]"


24/10/17 22:14:59 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 837463 ms exceeds timeout 120000 ms
24/10/17 22:14:59 WARN SparkContext: Killing executors is not supported by current scheduler.
24/10/17 22:15:09 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$