!pip install pandas numpy torch pyspark pyarrow

In [21]:
import os
import json
import random
import pandas as pd
import dotenv

# add .. to the path
import sys

sys.path.append("..")

In [23]:
n_recipes = 1000

# Load the uploaded CSV file
file_path = "../data/external/full_dataset.csv.zip"
recipes_df = pd.read_csv(
    file_path,
    usecols=["title", "link", "source", "NER"],
    compression="zip",
    nrows=n_recipes,
)

# Generate an unique identifier for each recipe as an hash of the title and source
recipes_df["recipe_id"] = recipes_df.apply(
    lambda x: hash(
        x["link"],
    ),
    axis=1,
)

recipes_df = recipes_df[["title", "link", "NER", "recipe_id"]]
recipes_df["link"] = recipes_df["link"].apply(lambda x: "http://" + x)
recipes_df["summary"] = recipes_df["title"]

recipes_df.rename(columns={"NER": "ingredients"}, inplace=True)

# Display the first few rows to understand the data structure
recipes_df.head()

Unnamed: 0,title,link,ingredients,recipe_id,summary
0,No-Bake Nut Cookies,http://www.cookbooks.com/Recipe-Details.aspx?i...,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",3402834030376044496,No-Bake Nut Cookies
1,Jewell Ball'S Chicken,http://www.cookbooks.com/Recipe-Details.aspx?i...,"[""beef"", ""chicken breasts"", ""cream of mushroom...",6861306608361106943,Jewell Ball'S Chicken
2,Creamy Corn,http://www.cookbooks.com/Recipe-Details.aspx?i...,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...",-7063450339715035317,Creamy Corn
3,Chicken Funny,http://www.cookbooks.com/Recipe-Details.aspx?i...,"[""chicken"", ""chicken gravy"", ""cream of mushroo...",-7135967491823227766,Chicken Funny
4,Reeses Cups(Candy),http://www.cookbooks.com/Recipe-Details.aspx?i...,"[""peanut butter"", ""graham cracker crumbs"", ""bu...",-3278581771199924805,Reeses Cups(Candy)


# Saving the processed recipes in a parquet file

In [24]:
# Save to a parquet file
recipes_df.to_parquet("../data/processed/recipes.parquet")

In [33]:
# Adjusting the number of imported recipes to not exceed the number of available recipes in the dataset
user_data = []
num_users = 100  # Simulating users
max_importable_recipes = len(recipes_df)

for user_id in range(1, num_users + 1):
    num_imported_recipes = random.randint(1, 3)

    # Emulate a power user that imports more recipes
    if user_id <= num_users // 20:
        num_imported_recipes = random.randint(4, min(20, max_importable_recipes))
    imported_recipes = random.sample(recipes_df.index.tolist(), num_imported_recipes)

    for recipe_idx in imported_recipes:
        user_data.append(
            {
                "user_id": user_id,
                "recipe_id": recipes_df.loc[recipe_idx, "recipe_id"],
                "title": recipes_df.loc[recipe_idx, "title"],
                "ingredients": recipes_df.loc[recipe_idx, "ingredients"],
                "link": recipes_df.loc[recipe_idx, "link"],
                "ratings": random.randint(1, 5),  # Random rating
                "import_date": (
                    pd.Timestamp.now() - pd.Timedelta(days=random.randint(1, 100))
                ).timestamp(),  # Random import date)
            }
        )

users_interactions_df = pd.DataFrame(user_data)

In [34]:
users_interactions_df.head()

Unnamed: 0,user_id,recipe_id,title,ingredients,link,ratings,import_date
0,1,-3353573329779529551,Cranberry Salad,"[""cream cheese"", ""margarine"", ""sugar"", ""cranbe...",http://www.cookbooks.com/Recipe-Details.aspx?i...,5,1725306000.0
1,1,4428084575182838413,My Caramel Rolls,"[""yeast"", ""sugar"", ""water"", ""eggs"", ""oleo"", ""s...",http://www.cookbooks.com/Recipe-Details.aspx?i...,3,1727639000.0
2,1,-143089806084606751,Cutout Cookie,"[""shortening"", ""butter"", ""sugar"", ""eggs"", ""van...",http://www.cookbooks.com/Recipe-Details.aspx?i...,3,1728676000.0
3,1,-668212766834799798,Rhubarb Coffee Cake,"[""sugar"", ""butter"", ""egg"", ""buttermilk"", ""flou...",http://www.cookbooks.com/Recipe-Details.aspx?i...,2,1722541000.0
4,1,-6883475931568933275,Baked Pork Tenderloin,"[""pork tenderloin"", ""salt"", ""pepper"", ""bacon"",...",http://www.cookbooks.com/Recipe-Details.aspx?i...,1,1723319000.0


## Saving the users

In [32]:
users_interactions_df.to_parquet(
    "../data/processed/users_interactions.parquet",
)