# Notebook 18 - Construct Final Matching Matrix

### Purpose
This notebook constructs a complete many-to-many matching matrix between recipes and available store products using ontology concepts and variant-expanded representations. It consolidates previous annotation layers to build a flexible foundation for downstream scoring, prioritization, and optimization.

### Objectives
- Merge recipes and products on shared `concept` and `variant` terms
- Allow many-to-many alignment between recipes and products, covering fuzzy and semantic matches
- Annotate matrix with metadata: source (exact vs. variant), type, and store-level flags
- Output comprehensive candidate match table for future scoring

### Inputs
- `recipes_with_variants.csv` - Ingredient concepts and variants per recipe
- `products_with_variants.csv` - Product concepts and variants per store-product
- `variant_map.csv` - Variant to concept mapping (including types)

### Outputs
- `matching_matrix_candidates.csv` - Matrix of recipe-product pairings with match metadata
- Console previews of match types and summary statistics


In [1]:
import os
import pandas as pd

# Input and output folders (the output folder is created on demand)
input_folder, output_folder = "variant_exports", "matching_matrix"
os.makedirs(output_folder, exist_ok=True)

# Resolve each input CSV relative to the input folder
recipes_file, products_file, variant_map_file = (
    os.path.join(input_folder, csv_name)
    for csv_name in (
        "recipes_with_variants.csv",
        "products_with_variants.csv",
        "variant_map.csv",
    )
)


In [2]:
# Read the three input tables in one pass
df_recipes, df_products, df_variants = (
    pd.read_csv(csv_path)
    for csv_path in (recipes_file, products_file, variant_map_file)
)

# Report (rows, columns) for each loaded table
print("Loaded:")
print(
    *(
        f"- {label}: {frame.shape}"
        for label, frame in (
            ("Recipes", df_recipes),
            ("Products", df_products),
            ("Variant Map", df_variants),
        )
    ),
    sep="\n",
)


Loaded:
- Recipes: (6, 8)
- Products: (126919, 37)
- Variant Map: (7, 3)


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [19]:
# Add unique row ID to each recipe row.
# Any existing "row_id" column is dropped first so this cell is idempotent:
# without that, re-running it would append a second "row_id" column and
# row["row_id"] would then return a Series instead of a scalar downstream.
df_recipes = (
    df_recipes
    .drop(columns=["row_id"], errors="ignore")
    .reset_index(drop=False)
    .rename(columns={"index": "row_id"})
)

# Quick check
print(df_recipes.columns)


Index(['row_id', 'recipe_row_id', 'recipe_row_id', 'recipe', 'ingredient',
       'ingredient_normalized', 'ingredient_en', 'ingredient_embedding',
       'ingredient_concept', 'ingredient_variants',
       'ingredient_concept_variant'],
      dtype='object')


In [20]:
def expand_variants(row):
    """Expand one recipe row into one output row per matchable term.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series yielded by ``DataFrame.iterrows``)
        Must provide ``row_id``, ``ingredient``, ``ingredient_concept`` and
        ``ingredient_variants``. The variants value may be a list/set or a
        string holding the literal repr of one.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id``, ``ingredient``, ``match_term``, ``match_type``.
        The concept (when not missing) comes first with type ``"concept"``,
        followed by every variant with type ``"variant"``. The frame is empty
        when there is neither a concept nor any variant.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    base = row["ingredient_concept"]
    variants = row["ingredient_variants"]

    # Parse variant column if it's a stringified list.
    # literal_eval only accepts Python literals, so file content can never
    # execute code (eval() would run arbitrary expressions).
    if isinstance(variants, str):
        try:
            variants = ast.literal_eval(variants)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best effort: an unparsable cell contributes no variants.
            variants = []

    # Anything that is not a list/set (NaN, scalar, dict, ...) is ignored.
    if not isinstance(variants, (set, list)):
        variants = []
    variants = list(variants)

    has_base = pd.notna(base)
    all_terms = ([base] if has_base else []) + variants
    match_type = (["concept"] if has_base else []) + ["variant"] * len(variants)

    return pd.DataFrame({
        "row_id": [row["row_id"]] * len(all_terms),
        "ingredient": [row["ingredient"]] * len(all_terms),
        "match_term": all_terms,
        "match_type": match_type
    })

# Build one small frame per recipe row, then stack them into a single table
recipe_rows = [
    expand_variants(recipe_row)
    for _row_label, recipe_row in df_recipes.iterrows()
]
df_recipe_matches = pd.concat(recipe_rows, ignore_index=True)

print("Exploded recipe match terms:", df_recipe_matches.shape)
df_recipe_matches.head()


Exploded recipe match terms: (12, 4)


Unnamed: 0,row_id,ingredient,match_term,match_type
0,0,strawberries,strawberries,concept
1,1,banana,banana,concept
2,2,yogurt,yogurt,concept
3,3,honey,honey,concept
4,3,honey,flower honey,variant


In [15]:
# Explode product concept and variant terms for matching
def expand_product_variants(row):
    """Expand one product row into one output row per matchable term.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series yielded by ``DataFrame.iterrows``)
        Must provide ``article``, ``product_name_clean``, ``store``,
        ``product_concept`` and ``product_variants``. The variants value may
        be a list/set or a string holding the literal repr of one.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_article``, ``product_name``, ``match_term``,
        ``match_type``, ``store``. The concept (when not missing) comes first
        with type ``"concept"``, followed by every variant with type
        ``"variant"``. The frame is empty when there is neither.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    base = row["product_concept"]
    variants = row["product_variants"]

    # literal_eval only accepts Python literals, so file content can never
    # execute code (eval() would run arbitrary expressions).
    if isinstance(variants, str):
        try:
            variants = ast.literal_eval(variants)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best effort: an unparsable cell contributes no variants.
            variants = []

    # Anything that is not a list/set (NaN, scalar, dict, ...) is ignored.
    if not isinstance(variants, (set, list)):
        variants = []
    variants = list(variants)

    has_base = pd.notna(base)
    all_terms = ([base] if has_base else []) + variants
    match_type = (["concept"] if has_base else []) + ["variant"] * len(variants)

    return pd.DataFrame({
        "product_article": [row["article"]] * len(all_terms),
        "product_name": [row["product_name_clean"]] * len(all_terms),
        "match_term": all_terms,
        "match_type": match_type,
        "store": [row["store"]] * len(all_terms)
    })

# Build one small frame per product row, then stack them into a single table
product_rows = [
    expand_product_variants(product_row)
    for _row_label, product_row in df_products.iterrows()
]
df_product_matches = pd.concat(product_rows, ignore_index=True)

print("Exploded product match terms:", df_product_matches.shape)
df_product_matches.head()


Exploded product match terms: (26, 5)


Unnamed: 0,product_article,product_name,match_term,match_type,store
0,438226,roeryoghurt,yogurt,concept,1024.0
1,427454,volle yoghurt,yogurt,concept,1058.0
2,438226,roeryoghurt,yogurt,concept,1090.0
3,105755,kwark aardbei,yogurt,concept,1160.0
4,315170,kwark aardbei,yogurt,concept,3123.0


In [21]:
# Candidate matrix: inner join of recipe terms and product terms on match_term.
# Overlapping column names (match_type) get a _recipe / _product suffix.
df_matches = pd.merge(
    left=df_recipe_matches,
    right=df_product_matches,
    how="inner",
    on="match_term",
    suffixes=("_recipe", "_product"),
)

print("Joined match matrix:", df_matches.shape)
df_matches.head()


Joined match matrix: (26, 8)


Unnamed: 0,row_id,ingredient,match_term,match_type_recipe,product_article,product_name,match_type_product,store
0,0,strawberries,strawberries,concept,247743,aardbeien,concept,5147.0
1,1,banana,banana,concept,144576,wolkentoetje banaan,concept,4278.0
2,1,banana,banana,concept,144576,wolkentoetje banaan,concept,5070.0
3,2,yogurt,yogurt,concept,438226,roeryoghurt,concept,1024.0
4,2,yogurt,yogurt,concept,427454,volle yoghurt,concept,1058.0


In [22]:
# Match source labels:
#   concept + concept -> "exact"
#   concept + variant -> "variant_mixed" (either side may be the concept)
#   anything else     -> "variant_only"
def classify_match(row):
    """Label a recipe/product pairing by how many of its two sides are concepts.

    Reads ``row["match_type_recipe"]`` and ``row["match_type_product"]`` and
    returns ``"exact"`` when both equal ``"concept"``, ``"variant_mixed"``
    when exactly one does, and ``"variant_only"`` otherwise.
    """
    sides = [row["match_type_recipe"], row["match_type_product"]]
    concept_sides = sides.count("concept")

    if concept_sides == 2:
        return "exact"
    if concept_sides == 1:
        return "variant_mixed"
    return "variant_only"

# Label every candidate pairing, one row at a time
df_matches["match_source"] = df_matches.apply(classify_match, axis="columns")

# Show how many pairings fall under each label
print(
    "Match source distribution:",
    df_matches["match_source"].value_counts(),
    sep="\n",
)


Match source distribution:
exact           22
variant_only     4
Name: match_source, dtype: int64


In [23]:
# Destination of the candidate matrix inside the output folder
match_output = os.path.join(output_folder, "matching_matrix_candidates.csv")

# Write the matrix without the positional index column, then report the path
df_matches.to_csv(path_or_buf=match_output, index=False)
print(f"Matching matrix saved to: {match_output}")


Matching matrix saved to: matching_matrix\matching_matrix_candidates.csv
