In [1]:
import pandas as pd
import numpy as np
import re
from typing import Optional

# Load the pre-processed Phase 1 food-facts table, then report its shape and
# the full list of column names.
df = pd.read_csv(filepath_or_buffer='preprocessedPhase1FoodFacts.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names:\n{df.columns.tolist()}")

Dataset shape: (3185, 47)

Column names:
['url', 'product_name', 'barcode', 'brand', 'quantity', 'serving_size', 'nutriscore_letter', 'nova_group', 'ingredients_text', 'allergens', 'traces', 'energy_kcal_100g', 'fat_100g', 'saturated_fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'main_image_url', 'categories', 'contains_palm_oil', 'vegetarian_status', 'vegan_status', 'nutrient_level_fat', 'nutrient_level_saturated_fat', 'nutrient_level_sugars', 'nutrient_level_salt', 'additives', 'packaging', 'stores', 'countries', 'origins', 'manufacturing_places', 'ecoscore_grade', 'ecoscore_score', 'carbon_footprint_100g', 'additives_count', 'sugar_ratio', 'energy_density', 'protein_ratio', 'macro_balance', 'healthy_score', 'log_energy_kcal_100g', 'log_fat_100g', 'log_sugars_100g', 'log_salt_100g']


In [2]:
# Text columns profiled before cleaning.
target_columns = ['brand', 'allergens', 'ingredients_text', 'countries', 'additives']

# Print a small profile per column: dtype, row/null/unique counts and the
# first five non-null values (each repr truncated to 100 characters).
for col in target_columns:
    series = df[col]
    banner = '=' * 60

    print(f"\n{banner}")
    print(f"Column: {col}")
    print(f"{banner}")
    print(f"Data type: {series.dtype}")
    print(f"Total rows: {len(df)}")
    print(f"Non-null count: {series.notna().sum()}")
    print(f"Null count: {series.isna().sum()}")
    print(f"Unique values: {series.nunique()}")
    print(f"\nSample values (first 5 non-null):")
    for rank, sample in enumerate(series.dropna().head(5).values, start=1):
        print(f"  {rank}. {repr(sample)[:100]}")


Column: brand
Data type: object
Total rows: 3185
Non-null count: 3039
Null count: 146
Unique values: 1478

Sample values (first 5 non-null):
  1. 'La Casetta di Campagna'
  2. 'H-E-B Organics'
  3. 'DmBio'
  4. 'Diamond of california'
  5. 'Tree Of Life  Inc.'

Column: allergens
Data type: object
Total rows: 3185
Non-null count: 2321
Null count: 864
Unique values: 243

Sample values (first 5 non-null):
  1. 'Nuts'
  2. 'Nuts'
  3. 'Nuts, Peanuts, Soybeans'
  4. 'Nuts'
  5. 'Nuts'

Column: ingredients_text
Data type: object
Total rows: 3185
Non-null count: 3048
Null count: 137
Unique values: 2820

Sample values (first 5 non-null):
  1. 'Italian: Mais'
  2. 'German: 99,5% Linsenmehl*, 0,5 % Meersalz. aus biologischer Landwirtschaft Kann Spuren von Soja und
  3. 'Almonds'
  4. 'Organic whole raw almonds'
  5. 'Almonds . soybean and/or peanut oil. sea salt.'

Column: countries
Data type: object
Total rows: 3185
Non-null count: 3181
Null count: 4
Unique values: 490

Sample values (first 5 

In [3]:
# Maps lower-cased country spellings/aliases to one canonical lower-case name.
# Keys must be lower-case: lookups are performed on lower-cased text.
COUNTRY_NORMALIZATION = {
    # United States
    "usa": "united states",
    "u.s.a.": "united states",
    "us": "united states",
    "united states of america": "united states",
    # United Kingdom (constituent countries are folded into the UK)
    "uk": "united kingdom",
    "u.k.": "united kingdom",
    "england": "united kingdom",
    "scotland": "united kingdom",
    "wales": "united kingdom",
    "gb": "united kingdom",
    # Germany
    "germany": "germany",
    "deutschland": "germany",
    # Austria
    "austria": "austria",
    "österreich": "austria",
    # Legacy mis-encoded spelling of "österreich" (UTF-8 bytes read as
    # Latin-1). Kept so existing lookups keep working; note that its capital
    # "Ã" cannot match text that has been lower-cased.
    "Ã¶sterreich": "austria",
    # France / Spain / Italy
    "france": "france",
    "espagne": "spain",
    "spain": "spain",
    "italy": "italy",
}

# Lower-case legal-entity suffixes that are dropped from brand names.
COMPANY_SUFFIXES = {
    "ag",
    "co.",
    "company",
    "gmbh",
    "inc",
    "inc.",
    "kg",
    "ltd",
    "ltd.",
    "s.a.",
    "s.r.l.",
    "sa",
    "srl",
}


In [4]:
def _basic_clean(text: str) -> str:
    """Common text cleaning applied to all columns."""
    text = re.sub(r'http\S+|www\S+', '', text)

    text = re.sub(r'\S+@\S+', '', text)

    text = re.sub(r'&[a-z]+;', '', text)

    text = re.sub(
        r'^(german|french|italian|spanish|english|portuguese|dutch|swedish|danish|'
        r'norwegian|polish|czech|hungarian|romanian|bulgarian|greek|russian|ukrainian|'
        r'turkish|arabic|hebrew|japanese|chinese|korean):\s*',
        '',
        text,
        flags=re.IGNORECASE
    )

    text = re.sub(r'\[|\]', '', text)

    text = re.sub(r'\s*[,;]\s*', ', ', text)
    text = re.sub(r'\s*\.\s*', '. ', text)

    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'\*{2,}', '*', text)

    text = re.sub(r'\s+', ' ', text)

    text = text.lower().strip()

    return text



In [None]:
def _clean_ingredients(text: str) -> str:
    """
    Ingredients: keep main content, drop parentheses with 'may contain' / 'traces' / 'contains'
    because they are often cross-contamination metadata.
    """
    text = re.sub(r'\([^)]*(may contain|traces|contains)[^)]*\)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [9]:
def _normalize_brand(text: str) -> str:
    """
    Brand: split on commas, remove duplicates and company suffixes.
    Return a pipe-separated list of normalized brand names.
    """
    if not text:
        return ''

    parts = [p.strip() for p in re.split(r'[,;/]+', text) if p.strip()]
    cleaned_brands = []

    for p in parts:
        tokens = [t for t in p.split() if t not in COMPANY_SUFFIXES]
        if not tokens:
            continue
        brand_name = ' '.join(tokens)
        cleaned_brands.append(brand_name)

    seen = set()
    unique_brands = []
    for b in cleaned_brands:
        if b not in seen:
            seen.add(b)
            unique_brands.append(b)

    return ' | '.join(unique_brands)

In [10]:
def _normalize_allergens(text: str) -> str:
    """
    Allergens: split on commas/semicolons, normalize phrases into canonical tags.
    Output is a pipe-separated list of allergen tags for easy ML use.
    """
    if not text:
        return ''

    raw_tokens = re.split(r'[;,/]+', text)
    normalized = []

    for tok in raw_tokens:
        tok = tok.strip().lower()
        if not tok:
            continue

        tok = re.sub(r'\bmay contain\b', '', tok)
        tok = re.sub(r'\btraces of\b', '', tok)
        tok = re.sub(r'\bcontains\b', '', tok)
        tok = tok.strip()

        if not tok:
            continue

        if 'sulphur dioxide' in tok or 'sulfites' in tok or 'sulphites' in tok:
            label = 'sulphur_dioxide_sulphites'
        elif 'peanut' in tok:
            label = 'peanuts'
        elif 'nut' in tok and 'peanut' not in tok:
            label = 'tree_nuts'
        elif 'milk' in tok or 'lactose' in tok or 'dairy' in tok:
            label = 'milk'
        elif 'egg' in tok:
            label = 'eggs'
        elif 'soy' in tok or 'soya' in tok:
            label = 'soybeans'
        elif 'gluten' in tok or 'wheat' in tok or 'barley' in tok or 'rye' in tok:
            label = 'gluten'
        elif 'sesame' in tok:
            label = 'sesame'
        elif 'fish' in tok:
            label = 'fish'
        elif 'crustacean' in tok or 'shrimp' in tok or 'prawn' in tok or 'crab' in tok:
            label = 'crustaceans'
        elif 'mustard' in tok:
            label = 'mustard'
        elif 'celery' in tok:
            label = 'celery'
        elif 'lupin' in tok:
            label = 'lupin'
        else:
            label = tok

        normalized.append(label)

    seen = set()
    unique_labels = []
    for a in normalized:
        if a not in seen:
            seen.add(a)
            unique_labels.append(a)

    return ' | '.join(unique_labels)




In [11]:
def _normalize_countries(text: str) -> str:
    """
    Countries: split, normalize variants (usa -> united states), drop 'world',
    deduplicate. Return pipe-separated list.
    """
    if not text:
        return ''

    parts = [p.strip().lower() for p in re.split(r'[;,/]+', text) if p.strip()]
    normalized = []

    for p in parts:
        p = re.sub(r'\s+', ' ', p)
        p = COUNTRY_NORMALIZATION.get(p, p)

        if p in {'world', 'en:world'}:
            continue

        normalized.append(p)

    seen = set()
    unique = []
    for c in normalized:
        if c not in seen:
            seen.add(c)
            unique.append(c)

    return ' | '.join(unique)
