# Notebook 22 - Generate Fuzzy Matching Matrix
## Purpose  
This notebook performs fuzzy-only concept matching between recipe ingredients and available products. It simulates the simplified baseline in which only the textual similarity between canonical food concepts is used (no embeddings, no boosting). The output is used in Notebook 23 to simulate waste impact under fuzzy-only logic.

## Inputs  
- recipes_with_variants.csv - Ingredient metadata (with ingredient_concept)  
- products_with_variants.csv - Product metadata (with product_concept)  

## Output  
- matching_matrix_fuzzy.csv - Fuzzy concept-level matches with similarity score (`fuzzy_score` on a 0–100 scale; only pairs scoring above 50 are kept)


In [21]:
import os
import pandas as pd
from difflib import SequenceMatcher

# Directory layout: variant exports are read from `input_folder`,
# results are written to `output_folder`.
input_folder, output_folder = "variant_exports", "matching_scored"

# Input CSV paths, both located inside the input folder.
recipes_file, products_file = (
    os.path.join(input_folder, csv_name)
    for csv_name in ("recipes_with_variants.csv", "products_with_variants.csv")
)

# Create the output folder up front; an already existing folder is not an error.
os.makedirs(output_folder, exist_ok=True)


In [22]:
# Read both variant exports (recipes first, then products) into DataFrames.
df_recipes, df_products = (
    pd.read_csv(csv_path) for csv_path in (recipes_file, products_file)
)

# Report the (rows, columns) shape of each frame as a quick sanity check.
print(
    "Loaded:",
    f"- Recipes: {df_recipes.shape}",
    f"- Products: {df_products.shape}",
    sep="\n",
)


Loaded:
- Recipes: (6, 8)
- Products: (126919, 37)


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [23]:
# Debug aid: list every column name present in the products frame,
# heading on the first line and the list of names on the second.
print("Available columns in df_products:", df_products.columns.tolist(), sep="\n")


Available columns in df_products:
['store', 'date_sales', 'article', 'product_category', 'discount_flag', 'promotion', 'price_theoretical', 'price_sold', 'items_sold', 'volume_sold', 'revenue_sold', 'date_waste', 'product_name', 'brand', 'content', 'unit', 'supplier', 'content_category', 'waste_reason', 'items_wasted', 'value_wasted', 'product_name_clean', 'date', 'delivered_quantity', 'store_name', 'address', 'postal_code', 'city', 'product_normalized', 'product_en', 'product_embedding', 'product_concept', 'waste_flag', 'markdown_flag', 'priority_score', 'product_variants', 'product_concept_variant']


In [24]:
# Resolve the product ID column: prefer "product_article", otherwise fall
# back to "article".
product_id_col = "product_article" if "product_article" in df_products.columns else "article"

# Build the unique store–product–concept combinations used for matching.
# product_concept is trimmed and lower-cased here, mirroring what the recipe
# side does to ingredient_concept: SequenceMatcher compares characters
# case-sensitively, so "Banana" vs "banana" would otherwise score below 100.
# Normalizing *before* drop_duplicates also collapses rows that differ only
# by case or surrounding whitespace.
df_store_products = (
    df_products[["store", product_id_col, "product_name", "product_concept"]]
    .dropna(subset=["product_concept"])
    .assign(
        product_concept=lambda frame: frame["product_concept"]
        .astype(str)
        .str.strip()
        .str.lower()
    )
    .rename(columns={product_id_col: "product_article"})
    .drop_duplicates()
)

# A concept that is blank after trimming carries no text to match on.
df_store_products = df_store_products[df_store_products["product_concept"] != ""]


In [25]:
# Ensure row_id exists for join tracking.
# The current index values become the row_id column. Naming the index first
# (instead of renaming a column called "index" afterwards) still produces
# row_id when the index already has a name or when the frame already
# contains an "index" column.
if "row_id" not in df_recipes.columns:
    df_recipes = df_recipes.rename_axis("row_id").reset_index()


In [26]:
# Normalize recipe concepts: drop rows without a concept, then trim and
# lower-case. `.assign` builds a new frame instead of writing a column into
# the result of `dropna`, which can raise pandas' SettingWithCopyWarning.
df_recipes = (
    df_recipes
    .dropna(subset=["ingredient_concept"])
    .assign(
        ingredient_concept=lambda frame: frame["ingredient_concept"].str.strip().str.lower()
    )
)

# Materialize both sides as plain dicts once. A nested `iterrows()` would
# rebuild a Series for every store-product row on every recipe iteration.
recipe_rows = df_recipes[["row_id", "ingredient_concept"]].to_dict("records")
product_rows = df_store_products[
    ["product_article", "product_name", "product_concept", "store"]
].to_dict("records")

# The ratio depends only on the (ingredient, product_concept) pair, and many
# store-product rows share a concept, so each distinct pair is scored once.
pair_scores = {}

# Prepare output list: one entry per recipe row x store-product row, with no
# score threshold applied.
# NOTE(review): no later cell visible in this notebook reads `fuzzy_matches`
# (the saved matrix is built from `matches`) — confirm whether this
# unfiltered list is still needed.
fuzzy_matches = []

# Iterate over recipes and stores
for recipe_rec in recipe_rows:
    ingredient = recipe_rec["ingredient_concept"]
    row_id = recipe_rec["row_id"]

    for product_rec in product_rows:
        product_concept = product_rec["product_concept"]

        # Compute fuzzy similarity (cached per concept pair)
        pair = (ingredient, product_concept)
        if pair not in pair_scores:
            pair_scores[pair] = SequenceMatcher(None, ingredient, product_concept).ratio()
        score = pair_scores[pair]

        # Store result; fuzzy_score is the ratio rescaled to 0-100.
        fuzzy_matches.append({
            "row_id": row_id,
            "ingredient": ingredient,
            "product_article": product_rec["product_article"],
            "product_name": product_rec["product_name"],
            "product_concept": product_concept,
            "store": product_rec["store"],
            "fuzzy_score": round(score * 100, 2),
            "match_source": "fuzzy_only"
        })


In [27]:
# Fuzzy similarity function
def fuzzy_similarity(a, b):
    """Return the difflib similarity ratio between two text values.

    Args:
        a: First sequence to compare (a string in this notebook).
        b: Second sequence to compare (a string in this notebook).

    Returns:
        float: ``SequenceMatcher(None, a, b).ratio()``, a value between 0.0
        and 1.0 where 1.0 means the two sequences are identical. Returns 0.0
        when either argument is ``None`` or a float NaN (the values pandas
        uses for missing text), since missing text cannot match anything.
    """
    for value in (a, b):
        # `value != value` is true only for NaN; avoids importing math.
        if value is None or (isinstance(value, float) and value != value):
            return 0.0
    return SequenceMatcher(None, a, b).ratio()

# Minimum similarity ratio (0-1 scale, exclusive) a pair must exceed to be kept.
FUZZY_THRESHOLD = 0.5  # adjustable threshold

# Materialize both frames as plain dicts once. A nested `iterrows()` would
# rebuild a Series for every store-product row on every recipe iteration.
recipe_records = df_recipes[["row_id", "recipe", "ingredient_concept"]].to_dict("records")
product_records = df_store_products[
    ["store", "product_article", "product_name", "product_concept"]
].to_dict("records")

# The score depends only on the (ingredient, product concept) pair, and many
# store-product rows share a concept, so each distinct pair is scored once.
concept_scores = {}

# Collect matches
matches = []

for recipe_rec in recipe_records:
    ingredient = recipe_rec["ingredient_concept"]

    for product_rec in product_records:
        product = product_rec["product_concept"]

        pair = (ingredient, product)
        if pair not in concept_scores:
            concept_scores[pair] = fuzzy_similarity(ingredient, product)
        score = concept_scores[pair]

        if score > FUZZY_THRESHOLD:
            # fuzzy_score is the ratio rescaled to 0-100.
            matches.append({
                "row_id": recipe_rec["row_id"],
                "recipe": recipe_rec["recipe"],
                "ingredient": ingredient,
                "store": product_rec["store"],
                "product_article": product_rec["product_article"],
                "product_name": product_rec["product_name"],
                "product_concept": product,
                "fuzzy_score": round(score * 100, 2)
            })

print("Fuzzy matches found:", len(matches))


Fuzzy matches found: 21


In [28]:
# Columns of the saved matrix, in the order the match dicts are built.
FUZZY_MATRIX_COLUMNS = [
    "row_id",
    "recipe",
    "ingredient",
    "store",
    "product_article",
    "product_name",
    "product_concept",
    "fuzzy_score",
]

# Save fuzzy match matrix.
# The columns are passed explicitly so that an empty `matches` list still
# yields a frame (and a CSV header row) with the expected columns; without
# them an empty list produces a column-less frame and a header-less file.
df_fuzzy = pd.DataFrame(matches, columns=FUZZY_MATRIX_COLUMNS)
output_file = os.path.join(output_folder, "matching_matrix_fuzzy.csv")
df_fuzzy.to_csv(output_file, index=False)

print("Saved fuzzy matching matrix to:", output_file)
df_fuzzy.head()


Saved fuzzy matching matrix to: matching_scored\matching_matrix_fuzzy.csv


Unnamed: 0,row_id,recipe,ingredient,store,product_article,product_name,product_concept,fuzzy_score
0,0,Strawberry Smoothie,strawberries,5147,247743,Aardbeien,strawberries,100.0
1,1,Banana Yogurt Bowl,banana,4278,144576,Wolkentoetje banaan,banana,100.0
2,1,Banana Yogurt Bowl,banana,5070,144576,Wolkentoetje banaan,banana,100.0
3,2,Greek Yogurt & Honey,yogurt,1024,438226,Roeryoghurt,yogurt,100.0
4,2,Greek Yogurt & Honey,yogurt,1058,427454,Volle yoghurt,yogurt,100.0
