# Synthetic Feedback Data

## 1. Setup and Imports

In [1]:
from collections import defaultdict
import random

import duckdb as db
import numpy as np
import pandas as pd
from faker import Faker


In [2]:
# Seed passed to `np.random.default_rng` when assigning user profiles.
RANDOM_SEED: int = 42


## 2. Synthetic User Data

In [3]:
# Number of synthetic users to create.
N_SYNTHETIC_USERS: int = 5_000

# Persona labels a synthetic user can be assigned.
# The feedback generators look each label up in `user_preferences`,
# so every label here needs a matching key in that dict.
USER_PROFILES: list[str] = [
    "Whole-meal vegan",
    "High protein gym goer",
    "Keto focused professional",
    "Gluten free family shopper",
    "Dairy free dessert lover",
    "Heart healthy senior",
    "Convenience seeker on the go",
    "Mediterranean food enthusiast",
    "Pescatarian omega booster",
    "Sugar conscious diabetic supporter",
]

# Pre-compute synthetic profile assignments for reproducibility.
# One label per user is drawn from USER_PROFILES with a generator seeded
# by RANDOM_SEED, so the assignment is the same on every run.
rng = np.random.default_rng(RANDOM_SEED)
user_profiles: np.ndarray = rng.choice(USER_PROFILES, size=N_SYNTHETIC_USERS)


In [4]:
fake = Faker()
# Seed this Faker instance so the generated users (and therefore the UserIDs
# referenced by the feedback tables) are identical on every Restart & Run All.
# NOTE(review): `date_of_birth` is presumably computed relative to today's
# date, so birth dates may still shift when the notebook is run on another day.
fake.seed_instance(RANDOM_SEED)

# Generate synthetic user data: one list per column, each N_SYNTHETIC_USERS
# long. `UserProfile` reuses the profile assignments pre-computed above.
data = {
    'UserID': [fake.uuid4() for _ in range(N_SYNTHETIC_USERS)],
    'Name': [fake.name() for _ in range(N_SYNTHETIC_USERS)],
    'Email': [fake.email() for _ in range(N_SYNTHETIC_USERS)],
    'Address': [fake.address() for _ in range(N_SYNTHETIC_USERS)],
    'PhoneNumber': [fake.phone_number() for _ in range(N_SYNTHETIC_USERS)],
    'DateOfBirth': [fake.date_of_birth(minimum_age=18, maximum_age=80) for _ in range(N_SYNTHETIC_USERS)],
    'UserProfile': user_profiles.tolist(),
}

user_df = pd.DataFrame(data)

user_df.head(3)


Unnamed: 0,UserID,Name,Email,Address,PhoneNumber,DateOfBirth,UserProfile
0,43bfd397-c07c-4859-8c37-3a4e4aa1b945,Ronald Hampton,dflores@example.net,"11703 Barr Village Suite 607\nEast Tiffany, GU...",+1-465-741-0347x75202,1989-08-24,Whole-meal vegan
1,faaf0310-7d9f-433b-88e3-15e6eaf95676,Kayla Wood,watsonjessica@example.net,"5553 Carson Passage\nSullivanstad, MN 61472",(440)742-8657,1979-12-03,Mediterranean food enthusiast
2,6ad5ca0a-b74c-4650-bf26-1810589ee33c,Melanie Johnson,ydominguez@example.net,"4326 Kirby Greens\nLake Amandastad, NV 38149",(553)714-8844,2002-06-17,Convenience seeker on the go


In [5]:
# Persist the synthetic users as CSV, without the DataFrame index.
# NOTE(review): assumes a `data/` directory already exists relative to the
# notebook's working directory — confirm.
user_df.to_csv("data/users.csv", index=False)


## 3. Synthetic Explicit Feedback (Ratings)

In [6]:
# The notebook only reads from the product database, so open it read-only:
# a wrong path then fails immediately instead of silently creating an empty
# database file, and no write lock is taken on the file.
with db.connect("../db/open_food.duckdb", read_only=True) as conn:
    product_categories_df = conn.sql("select * from product_categories").df()
    product_ingredients_df = conn.sql("select * from product_ingredients").df()
    product_nutrient_profiles_df = conn.sql("select * from product_nutrient_profiles").df()

# Combine the three per-product tables on the product `code`; the inner joins
# keep only products present in all three tables.
product_df = (
    product_categories_df
    .merge(product_ingredients_df, on="code", how="inner")
    .merge(product_nutrient_profiles_df, on="code", how="inner")
)

product_df.columns


Index(['code', 'is_breakfast', 'is_main_meal', 'is_snack', 'is_dessert',
       'is_beverage', 'is_dairy', 'is_meat_fish', 'is_grains_cereals',
       'is_fruits_vegetables', 'is_condiments_spices', 'is_prepared_meals',
       'is_vegan_compatible', 'is_vegetarian_compatible',
       'is_mediterranean_compatible', 'is_keto_compatible',
       'is_gluten_free_compatible', 'is_dairy_free_compatible',
       'ingredients_text', 'has_ingredients_text', 'contains_milk',
       'contains_eggs', 'contains_peanuts', 'contains_tree_nuts',
       'contains_fish', 'contains_shellfish', 'contains_soy', 'contains_wheat',
       'contains_gluten', 'contains_sesame', 'contains_sulfites',
       'contains_celery', 'contains_mustard', 'contains_lupin',
       'contains_molluscs', 'protein_ratio', 'fat_ratio', 'carb_ratio',
       'macro_profile', 'energy_density', 'energy_category',
       'sugar_to_carb_ratio', 'high_sugar', 'fiber_g', 'high_fiber',
       'sodium_g', 'salt_g', 'high_salt', 'vitamin_r

In [7]:
def whole_meal_vegan():
    """Select the product pool for the "Whole-meal vegan" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal, fruits/vegetables or grains/cereals, and that are
    also flagged as vegan compatible.
    """
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_fruits_vegetables"].eq(True)
        | catalog["is_grains_cereals"].eq(True)
    )
    vegan_ok = catalog["is_vegan_compatible"].eq(True)
    return catalog[in_category & vegan_ok]


def high_protein_gym_goer():
    """Select the product pool for the "High protein gym goer" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal, meat/fish or dairy, whose macro profile is
    ``"high_protein"`` with a protein ratio strictly above 0.3, and that are
    flagged high satiety and not high sugar.
    """
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_meat_fish"].eq(True)
        | catalog["is_dairy"].eq(True)
    )
    protein_rich = catalog["macro_profile"].eq("high_protein")
    above_ratio = catalog["protein_ratio"].gt(0.3)
    filling = catalog["high_satiety"].eq(True)
    low_sugar = catalog["high_sugar"].eq(False)
    return catalog[in_category & protein_rich & above_ratio & filling & low_sugar]


def keto_focused_professional():
    """Select the product pool for the "Keto focused professional" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal, meat/fish or dairy, whose macro profile is anything
    other than ``"high_carb"``, and that are not flagged high glycemic impact.
    """
    # NOTE(review): the `is_keto_compatible` column is not consulted here —
    # confirm that is intentional.
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_meat_fish"].eq(True)
        | catalog["is_dairy"].eq(True)
    )
    not_carb_heavy = catalog["macro_profile"].ne("high_carb")
    low_glycemic = catalog["high_glycemic_impact"].eq(False)
    return catalog[in_category & not_carb_heavy & low_glycemic]


def gluten_free_family_shopper():
    """Select the product pool for the "Gluten free family shopper" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal, dessert or snack, whose macro profile is
    ``"balanced"``, that contain neither wheat nor gluten, and that are
    flagged gluten-free compatible.
    """
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_dessert"].eq(True)
        | catalog["is_snack"].eq(True)
    )
    balanced = catalog["macro_profile"].eq("balanced")
    no_wheat = catalog["contains_wheat"].eq(False)
    no_gluten = catalog["contains_gluten"].eq(False)
    labelled_gluten_free = catalog["is_gluten_free_compatible"].eq(True)
    return catalog[in_category & balanced & no_wheat & no_gluten & labelled_gluten_free]


def dairy_free_dessert_lover():
    """Select the product pool for the "Dairy free dessert lover" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    beverage, dessert or snack, that are neither flagged dairy nor contain
    milk, and that are flagged vegan compatible.
    """
    # NOTE(review): this filters on `is_vegan_compatible`, not on the
    # `is_dairy_free_compatible` column — confirm that is intentional.
    catalog = product_df
    in_category = (
        catalog["is_beverage"].eq(True)
        | catalog["is_dessert"].eq(True)
        | catalog["is_snack"].eq(True)
    )
    not_dairy = catalog["is_dairy"].eq(False)
    no_milk = catalog["contains_milk"].eq(False)
    vegan_ok = catalog["is_vegan_compatible"].eq(True)
    return catalog[in_category & not_dairy & no_milk & vegan_ok]


def heart_healthy_senior():
    """Select the product pool for the "Heart healthy senior" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal, fruits/vegetables or Mediterranean compatible, that
    are not high salt and not high sugar, that are high fiber, and whose
    energy category is ``"low_energy"``.
    """
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_fruits_vegetables"].eq(True)
        | catalog["is_mediterranean_compatible"].eq(True)
    )
    low_salt = catalog["high_salt"].eq(False)
    low_sugar = catalog["high_sugar"].eq(False)
    fibre_rich = catalog["high_fiber"].eq(True)
    light = catalog["energy_category"].eq("low_energy")
    return catalog[in_category & low_salt & low_sugar & fibre_rich & light]


def convenience_seeker_on_the_go():
    """Select the product pool for the "Convenience seeker on the go" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal, beverage or snack, and that are flagged high satiety.
    """
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_beverage"].eq(True)
        | catalog["is_snack"].eq(True)
    )
    filling = catalog["high_satiety"].eq(True)
    return catalog[in_category & filling]


def mediterranean_food_enthusiast():
    """Select the product pool for the "Mediterranean food enthusiast" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal, beverage, dessert or snack, whose macro profile is
    ``"balanced"``, and that are flagged Mediterranean compatible.
    """
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_beverage"].eq(True)
        | catalog["is_dessert"].eq(True)
        | catalog["is_snack"].eq(True)
    )
    balanced = catalog["macro_profile"].eq("balanced")
    mediterranean_ok = catalog["is_mediterranean_compatible"].eq(True)
    return catalog[in_category & balanced & mediterranean_ok]


def pescatarian_omega_booster():
    """Select the product pool for the "Pescatarian omega booster" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal or meat/fish, that contain fish, and whose macro
    profile is ``"high_protein"``.
    """
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_meat_fish"].eq(True)
    )
    has_fish = catalog["contains_fish"].eq(True)
    protein_rich = catalog["macro_profile"].eq("high_protein")
    return catalog[in_category & has_fish & protein_rich]


def sugar_conscious_diabetic_supporter():
    """Select the product pool for the "Sugar conscious diabetic supporter" profile.

    Keeps rows of the module-level ``product_df`` that are flagged as
    breakfast, main meal or meat/fish, whose macro profile is ``"low_carb"``
    or ``"high_protein"``, that are neither high sugar nor high glycemic
    impact, and that are flagged keto compatible.
    """
    catalog = product_df
    in_category = (
        catalog["is_breakfast"].eq(True)
        | catalog["is_main_meal"].eq(True)
        | catalog["is_meat_fish"].eq(True)
    )
    suitable_macros = catalog["macro_profile"].isin(["low_carb", "high_protein"])
    low_sugar = catalog["high_sugar"].eq(False)
    low_glycemic = catalog["high_glycemic_impact"].eq(False)
    keto_ok = catalog["is_keto_compatible"].eq(True)
    return catalog[in_category & suitable_macros & low_sugar & low_glycemic & keto_ok]


In [8]:
# Calculate product counts for all user preference functions
whole_meal_vegan_df = whole_meal_vegan()
high_protein_gym_goer_df = high_protein_gym_goer()
keto_focused_professional_df = keto_focused_professional()
gluten_free_family_shopper_df = gluten_free_family_shopper()
dairy_free_dessert_lover_df = dairy_free_dessert_lover()
heart_healthy_senior_df = heart_healthy_senior()
convenience_seeker_on_the_go_df = convenience_seeker_on_the_go()
mediterranean_food_enthusiast_df = mediterranean_food_enthusiast()
pescatarian_omega_booster_df = pescatarian_omega_booster()
sugar_conscious_diabetic_supporter_df = sugar_conscious_diabetic_supporter()

# Print product counts using variables
print(f"Whole-meal vegan: {whole_meal_vegan_df.shape[0]}")
print(f"High protein gym goer: {high_protein_gym_goer_df.shape[0]}")
print(f"Keto focused professional: {keto_focused_professional_df.shape[0]}")
print(f"Gluten free family shopper: {gluten_free_family_shopper_df.shape[0]}")
print(f"Dairy free dessert lover: {dairy_free_dessert_lover_df.shape[0]}")
print(f"Heart healthy senior: {heart_healthy_senior_df.shape[0]}")
print(f"Convenience seeker on the go: {convenience_seeker_on_the_go_df.shape[0]}")
print(f"Mediterranean food enthusiast: {mediterranean_food_enthusiast_df.shape[0]}")
print(f"Pescatarian omega booster: {pescatarian_omega_booster_df.shape[0]}")
print(f"Sugar conscious diabetic supporter: {sugar_conscious_diabetic_supporter_df.shape[0]}")

user_preferences: dict[str, pd.DataFrame] = {
    "Whole-meal vegan": whole_meal_vegan_df,
    "High protein gym goer": high_protein_gym_goer_df,
    "Keto focused professional": keto_focused_professional_df,
    "Gluten free family shopper": gluten_free_family_shopper_df,
    "Dairy free dessert lover": dairy_free_dessert_lover_df,
    "Heart healthy senior": heart_healthy_senior_df,
    "Convenience seeker on the go": convenience_seeker_on_the_go_df,
    "Mediterranean food enthusiast": mediterranean_food_enthusiast_df,
    "Pescatarian omega booster": pescatarian_omega_booster_df,
    "Sugar conscious diabetic supporter": sugar_conscious_diabetic_supporter_df,
}


Whole-meal vegan: 170321
High protein gym goer: 5462
Keto focused professional: 94748
Gluten free family shopper: 7389
Dairy free dessert lover: 58112
Heart healthy senior: 1077
Convenience seeker on the go: 81706
Mediterranean food enthusiast: 47701
Pescatarian omega booster: 30182
Sugar conscious diabetic supporter: 8007


In [9]:
def generate_explicit_feedback(
    users: pd.DataFrame,
    seed: int | None = 42,
) -> pd.DataFrame:
    rng = np.random.default_rng(seed)

    sampled_users: list[str] = []
    sampled_codes_dict: dict[str, list[list[str]]] = defaultdict(list)
    sampled_codes: list[str] = []
    sampled_ratings: list[int] = []

    for user_id, profile_name in users[["UserID", "UserProfile"]].itertuples(index=False):
        n_samples = rng.integers(3, 10)

        preferences_df = user_preferences[profile_name].sample(n=n_samples)
        chosen_preferences = preferences_df["code"].tolist()

        chosen_similar = []
        if (profile_similar_codes := sampled_codes_dict.get(profile_name)):
            n_similar = rng.integers(0, n_samples)
            while len(chosen_similar) < n_similar:
                similar_codes = random.choice(profile_similar_codes)
                n_choose = rng.integers(0, 3)
                chosen_similar += rng.choice(similar_codes, size=n_choose).tolist()

        chosen_codes = chosen_similar + chosen_preferences

        sampled_users.extend([user_id] * n_samples)
        sampled_codes_dict[profile_name].append(chosen_codes[:n_samples])
        sampled_codes.extend(chosen_codes[:n_samples])
        sampled_ratings.extend(rng.choice([3, 4, 5], size=n_samples))

    return pd.DataFrame(
        {
            "UserID": sampled_users,
            "ProductCode": sampled_codes,
            "Rating": sampled_ratings,
        }
    )

# Build the explicit ratings table for all synthetic users, using the
# function's default seed, and display it.
explicit_feedback_df = generate_explicit_feedback(user_df)
explicit_feedback_df


Unnamed: 0,UserID,ProductCode,Rating
0,43bfd397-c07c-4859-8c37-3a4e4aa1b945,3452200004079,5
1,43bfd397-c07c-4859-8c37-3a4e4aa1b945,2609842031098,4
2,43bfd397-c07c-4859-8c37-3a4e4aa1b945,3700036905900,4
3,faaf0310-7d9f-433b-88e3-15e6eaf95676,0041190069613,5
4,faaf0310-7d9f-433b-88e3-15e6eaf95676,8901262173605,3
...,...,...,...
30131,74486206-6700-4f6c-ae94-aa623db78c5b,0081910501986,3
30132,74486206-6700-4f6c-ae94-aa623db78c5b,3439497009550,4
30133,74486206-6700-4f6c-ae94-aa623db78c5b,3760077180421,3
30134,74486206-6700-4f6c-ae94-aa623db78c5b,0804531440058,4


In [10]:
# Persist the explicit ratings as CSV, without the DataFrame index.
# NOTE(review): assumes a `data/` directory already exists relative to the
# notebook's working directory — confirm.
explicit_feedback_df.to_csv("data/feedback_explicit.csv", index=False)


## 4. Synthetic Implicit Feedback

In [11]:
# Event type -> probability of that event being drawn for an interaction.
# The keys are the values sampled and the values are the weights `p` passed to
# `rng.choice` in `generate_implicit_feedback`; the four weights sum to 1.0.
IMPLICIT_EVENT_PROBABILITIES: dict[str, float] = {
    "view": 0.55,
    "query_search": 0.2,
    "click": 0.15,
    "compare": 0.1,
}


In [12]:
def generate_implicit_feedback(
    users: pd.DataFrame,
    seed: int | None = 42,
) -> pd.DataFrame:
    rng = np.random.default_rng(seed)

    sampled_users: list[str] = []
    sampled_codes_dict: dict[str, list[list[str]]] = defaultdict(list)
    sampled_codes: list[str] = []
    sampled_events: list[str] = []

    for user_id, profile_name in users[["UserID", "UserProfile"]].itertuples(index=False):
        n_samples = rng.integers(10,31)

        preferences_df = user_preferences[profile_name].sample(n=n_samples)
        chosen_preferences = preferences_df["code"].tolist()

        chosen_similar = []
        if (profile_similar_codes := sampled_codes_dict.get(profile_name)):
            n_similar = rng.integers(0, n_samples)
            while len(chosen_similar) < n_similar:
                similar_codes = random.choice(profile_similar_codes)
                n_choose = rng.integers(0, 6)
                chosen_similar += rng.choice(similar_codes, size=n_choose).tolist()

        chosen_codes = chosen_similar + chosen_preferences

        sampled_users.extend([user_id] * n_samples)
        sampled_codes_dict[profile_name].append(chosen_codes[:n_samples])
        sampled_codes.extend(chosen_codes[:n_samples])
        sampled_events.extend(
            rng.choice(
                list(IMPLICIT_EVENT_PROBABILITIES.keys()),
                size=n_samples,
                p=list(IMPLICIT_EVENT_PROBABILITIES.values()),
            )
        )

    return pd.DataFrame(
        {
            "UserID": sampled_users,
            "ProductCode": sampled_codes,
            "Event": sampled_events,
        }
    )

# Build the implicit event table for all synthetic users, using the
# function's default seed, and display it.
implicit_feedback_df = generate_implicit_feedback(user_df)
implicit_feedback_df


Unnamed: 0,UserID,ProductCode,Event
0,43bfd397-c07c-4859-8c37-3a4e4aa1b945,2600203037001,view
1,43bfd397-c07c-4859-8c37-3a4e4aa1b945,3422440003491,click
2,43bfd397-c07c-4859-8c37-3a4e4aa1b945,20953553,query_search
3,43bfd397-c07c-4859-8c37-3a4e4aa1b945,5412158011244,view
4,43bfd397-c07c-4859-8c37-3a4e4aa1b945,0898575001115,compare
...,...,...,...
99997,74486206-6700-4f6c-ae94-aa623db78c5b,8017139103658,view
99998,74486206-6700-4f6c-ae94-aa623db78c5b,03266090,query_search
99999,74486206-6700-4f6c-ae94-aa623db78c5b,03266090,compare
100000,74486206-6700-4f6c-ae94-aa623db78c5b,0206916000084,view


In [13]:
# Persist the implicit events as CSV, without the DataFrame index.
# NOTE(review): assumes a `data/` directory already exists relative to the
# notebook's working directory — confirm.
implicit_feedback_df.to_csv("data/feedback_implicit.csv", index=False)
