In [1]:
import pandas as pd

# -----------------------------
# 1. Load datasets
# -----------------------------
# Destination of the engineered feature table written at the end of the notebook.
output_path = "../data/recipe_features.csv"

# Read the three input tables in one pass; the order of the paths matches
# the order of the names on the left-hand side.
ingredients, recipes, recipe_ingredients = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/ingredients.csv",
        "../data/recipes.csv",
        "../data/recipe_ingredients.csv",
    )
)

In [2]:
# Fail fast if any input table was loaded without data.
for _frame, _empty_message in (
    (ingredients, "Ingredients dataset is empty"),
    (recipes, "Recipes dataset is empty"),
    (recipe_ingredients, "Recipe-Ingredients dataset is empty"),
):
    assert not _frame.empty, _empty_message

In [3]:

# -----------------------------
# 2. Merge datasets
# -----------------------------
# Build one row per (recipe, ingredient) pair carrying both the ingredient's
# per-unit values and the recipe's metadata.
#
# Both joins are lookups: many rows on the left map to at most one row on the
# right. validate="many_to_one" makes pandas raise a MergeError if a key is
# duplicated in `ingredients` or `recipes`; without it a duplicate key would
# silently duplicate rows and inflate every summed total downstream.
#
# how="left" keeps every recipe_ingredients row even when no match is found,
# so unmatched rows carry NaN in the columns coming from the right-hand table.
merged = (
    recipe_ingredients
    # Step 1: attach ingredient attributes
    .merge(
        ingredients,
        on="ingredient_id",
        how="left",
        validate="many_to_one",
    )
    # Step 2: attach recipe metadata
    .merge(
        recipes,
        on="recipe_id",
        how="left",
        validate="many_to_one",
    )
)


In [4]:
# Peek at the first five rows of the merged frame as a quick visual check of the merge
merged.head(n=5)

Unnamed: 0,recipe_id,ingredient_id,quantity_required,name_x,category,unit,avg_cost_per_unit,calories_per_unit,protein_g_per_unit,carbs_g_per_unit,fat_g_per_unit,name_y,meal_type,difficulty,prep_time_minutes
0,101,1,300,rice,carb,g,0.5,1.3,0.02,0.28,0.003,Jollof Rice,lunch,medium,45
1,101,3,150,tomato,vegetable,g,0.3,0.18,0.01,0.04,0.002,Jollof Rice,lunch,medium,45
2,101,4,100,onion,vegetable,g,0.2,0.4,0.01,0.09,0.001,Jollof Rice,lunch,medium,45
3,101,5,30,vegetable oil,fat,ml,0.8,8.8,0.0,0.0,1.0,Jollof Rice,lunch,medium,45
4,102,2,250,chicken breast,protein,g,2.5,1.65,0.31,0.0,0.04,Grilled Chicken,dinner,easy,30


In [5]:

# -----------------------------
# 3. Feature engineering (row-level)
# -----------------------------
# Each per-ingredient feature is quantity_required multiplied by the matching
# per-unit column. Keys are the new column names, values the per-unit sources.
per_unit_sources = {
    "ingredient_cost": "avg_cost_per_unit",
    "ingredient_calories": "calories_per_unit",
    "ingredient_protein": "protein_g_per_unit",
    "ingredient_carbs": "carbs_g_per_unit",
    "ingredient_fat": "fat_g_per_unit",
}

# groupby(...).sum() skips NaN, so a row with a missing quantity or per-unit
# value (e.g. an ingredient_id that found no match in the left join) would be
# silently left out of the recipe totals and never show up as a null in the
# aggregated table. Check the inputs explicitly before aggregating.
required_inputs = ["quantity_required", *per_unit_sources.values()]
incomplete_rows = merged[required_inputs].isna().any(axis=1)
assert not incomplete_rows.any(), (
    "Missing quantity or per-unit values for ingredient_id(s): "
    f"{merged.loc[incomplete_rows, 'ingredient_id'].unique().tolist()}"
)

# assign() returns a new frame instead of mutating in place, so re-running
# this cell simply recomputes the same columns.
merged = merged.assign(
    **{
        feature: merged["quantity_required"] * merged[per_unit_col]
        for feature, per_unit_col in per_unit_sources.items()
    }
)

# -----------------------------
# 4. Aggregate to recipe level
# 5. Encode difficulty
# 6. Final cleanup
# -----------------------------
# Ordinal encoding of the difficulty label; labels that are not keys of this
# dict map to NaN.
difficulty_map = {
    "easy": 1,
    "medium": 2,
    "hard": 3
}

# Final column order of the feature table. The raw `difficulty` label is not
# listed, so it is dropped by the selection below.
feature_columns = [
    "recipe_id",
    "total_cost",
    "total_calories",
    "total_protein_g",
    "total_carbs_g",
    "total_fat_g",
    "ingredient_count",
    "difficulty_score",
    "prep_time_minutes",
]

recipe_features = (
    merged
    .groupby("recipe_id")
    .agg(
        total_cost=("ingredient_cost", "sum"),
        total_calories=("ingredient_calories", "sum"),
        total_protein_g=("ingredient_protein", "sum"),
        total_carbs_g=("ingredient_carbs", "sum"),
        total_fat_g=("ingredient_fat", "sum"),
        ingredient_count=("ingredient_id", "nunique"),
        # Recipe metadata is repeated on every ingredient row of a recipe,
        # so the first value per group is taken.
        prep_time_minutes=("prep_time_minutes", "first"),
        difficulty=("difficulty", "first")
    )
    .reset_index()
    .assign(difficulty_score=lambda df: df["difficulty"].map(difficulty_map))
    .loc[:, feature_columns]
)


In [6]:

# Guard: the table about to be exported must not contain a single missing cell
assert not recipe_features.isna().values.any(), "Missing values found in final dataset"

# -----------------------------
# 7. Save output
# -----------------------------
# Write without the positional index so the CSV holds only the feature columns
recipe_features.to_csv(path_or_buf=output_path, index=False)

# Confirmation line followed by the full feature table, one per line
print("✅ recipe_features.csv successfully created", recipe_features, sep="\n")


✅ recipe_features.csv successfully created
   recipe_id  total_cost  total_calories  total_protein_g  total_carbs_g  \
0        101       239.0           721.0              8.5           99.0   
1        102       651.0           608.5             78.0            4.5   
2        103       192.0           535.6             12.0           62.0   
3        104       172.0           812.0             28.0           81.0   
4        105       114.0           240.0             12.3            3.9   

   total_fat_g  ingredient_count  difficulty_score  prep_time_minutes  
0        31.30                 4                 2                 45  
1        30.05                 3                 1                 30  
2        27.32                 4                 1                 25  
3        43.10                 3                 2                 50  
4        20.03                 3                 1                 15  
