In [1]:
import spacy
import pandas as pd
from collections import Counter
import re

In [2]:
df = pd.read_csv('/ghome/c5mcv04/MCV-C5-2025-Team4/dataset/food_dataset_split/train.csv')  
titles = df['Title'].tolist() 

In [3]:
nlp = spacy.load("en_core_web_lg")

food_terms = ["risotto", "pad thai", "bulgogi", "pho", "goulash",
    'chicken', 'beef', 'pork', 'fish', 'shrimp', 'rice', 'pasta', 
    'tomato', 'onion', 'garlic', 'cheese', 'egg', 'potato', 
    'carrot', 'broccoli', 'spinach', 'mushroom', 'bell pepper'
] 
for term in food_terms:
    lex = nlp.vocab[term]
    lex.is_stop = False  



In [4]:
common_ingredients = [
    'chicken', 'beef', 'pork', 'fish', 'shrimp', 'rice', 'pasta', 
    'tomato', 'onion', 'garlic', 'cheese', 'egg', 'potato', 
    'carrot', 'broccoli', 'spinach', 'mushroom', 'bell pepper'
]

common_cuisines = [
    'italian', 'mexican', 'chinese', 'japanese', 'indian', 
    'thai', 'french', 'mediterranean', 'american', 'korean',
    'turkish', 'vietnamese'
]
def extract_ingredients(text):
    doc = nlp(text.lower())
    ingredients = []
    
    for token in doc:
        # nouns that are likely ingredients
        if (token.pos_ in ["NOUN", "PROPN"] and 
            not token.is_stop and 
            len(token.text) > 2):
            
            # exclude non-ingredient nouns
            if not token.text in ["dinner", "lunch", "recipe", "dish"]:
                ingredients.append(token.text)
    
    # Handle compound ingredients like "bell pepper"
    for chunk in doc.noun_chunks:
        if any(tok.text in common_ingredients for tok in chunk):
            ingredients.append(chunk.text)
    
    return list(set(ingredients))  

def extract_cuisines(text):
    doc = nlp(text.lower())
    cuisines = []
    
    # Look for adjectives that might indicate cuisine (Italian, Mexican)
    for token in doc:
        if token.pos_ == "PROPN" and token.text in common_cuisines:
            cuisines.append(token.text)
    
    # Handle cuisine mentions like "Thai style"
    for i, token in enumerate(doc[:-1]):
        next_token = doc[i+1]
        if token.text in common_cuisines and next_token.text == "style":
            cuisines.append(f"{token.text} {next_token.text}")
    
    return list(set(cuisines))

In [5]:
ingredient_list = []
cuisine_list = []

for title in titles:
    ingredient_list.extend(extract_ingredients(title))
    cuisine_list.extend(extract_cuisines(title))

# Count occurrences
ingredient_counts = Counter(ingredient_list)
cuisine_counts = Counter(cuisine_list)

In [6]:
def get_lemmatized_counts(term_list):
    lemmatized_counts = Counter()
    for term in term_list:
        doc = nlp(term)
        # Get base form (lemma) of each word in the term
        lemma = " ".join([token.lemma_ for token in doc])
        lemmatized_counts[lemma] += 1
    return lemmatized_counts

lemmatized_ingredients = get_lemmatized_counts(ingredient_list)
lemmatized_cuisines = get_lemmatized_counts(cuisine_list)

In [16]:
def create_stats_df(counter, total_dishes):
    stats = []
    for term, count in counter.most_common():
        percentage = (count / total_dishes) * 100
        if percentage < 0.4:
            representation = 'discarded'
        elif percentage < 1:
            representation = 'underrepresented'
        else:
            representation = 'common'
        stats.append({
            'term': term,
            'count': count,
            'percentage': round((count / total_dishes) * 100, 2),
            'representation': representation
        })
    return pd.DataFrame(stats)

total_dishes = len(titles)
ingredient_df = create_stats_df(ingredient_counts, total_dishes)
cuisine_df = create_stats_df(cuisine_counts, total_dishes)

In [17]:
ingredient_df.to_csv('enhanced_ingredient_analysis_withgenerated.csv', index=False)
cuisine_df.to_csv('enhanced_cuisine_analysis_withgenerated.csv', index=False)

print("Underrepresented Ingredients:")
print(ingredient_df[ingredient_df['representation'] == 'underrepresented'].sort_values('percentage'))

print("\nUnderrepresented Cuisines:")
print(cuisine_df[cuisine_df['representation'] == 'underrepresented'].sort_values('percentage'))

Underrepresented Ingredients:
            term  count  percentage    representation
183        olive     51        0.41  underrepresented
179         bars     51        0.41  underrepresented
182  goat cheese     51        0.41  underrepresented
181       noodle     51        0.41  underrepresented
180       citrus     51        0.41  underrepresented
..           ...    ...         ...               ...
69      rosemary    123        0.98  underrepresented
68       mustard    124        0.99  underrepresented
67       carrots    124        0.99  underrepresented
66     cranberry    125        1.00  underrepresented
65         chile    125        1.00  underrepresented

[119 rows x 4 columns]

Underrepresented Cuisines:
Empty DataFrame
Columns: [term, count, percentage, representation]
Index: []


In [None]:
ingredient_df = pd.read_csv('enhanced_ingredient_analysis_withgenerated.csv')
LOWER_THRESHOLD = 0.42
UPPER_THRESHOLD = 1

target_ingredients = ingredient_df[
    (ingredient_df['percentage'] >= LOWER_THRESHOLD) & 
    (ingredient_df['percentage'] <= UPPER_THRESHOLD)
].sort_values('percentage', ascending=False)

print(f"Ingredients appearing in {LOWER_THRESHOLD}% to {UPPER_THRESHOLD}% of dishes:")
print(target_ingredients)

target_ingredients.to_csv('target_ingredients_1_to_0.42_percent.csv', index=False)

Ingredients appearing in 0.42% to 1% of dishes:
            term  count  percentage    representation
59        greens    108        1.00  underrepresented
60         style    107        0.99  underrepresented
61       spinach    106        0.98  underrepresented
62          herb    104        0.97  underrepresented
63       sausage    100        0.93  underrepresented
..           ...    ...         ...               ...
173  goat cheese     46        0.43  underrepresented
174      bourbon     46        0.43  underrepresented
175      walnuts     46        0.43  underrepresented
176       relish     45        0.42  underrepresented
177          rib     45        0.42  underrepresented

[119 rows x 4 columns]


In [12]:
print(target_ingredients['term'].tolist())

['greens', 'style', 'spinach', 'herb', 'sausage', 'mushrooms', 'ricotta', 'kale', 'chops', 'avocado', 'maple', 'sandwiches', 'strawberry', 'cauliflower', 'goat', 'zucchini', 'vanilla', 'eggplant', 'arugula', 'cucumber', 'vegetables', 'parmesan', 'feta', 'asparagus', 'pot', 'cabbage', 'peanut', 'carrot', 'fruit', 'buttermilk', 'raspberry', 'walnut', 'sugar', 'pear', 'carrots', 'summer', 'sesame', 'ribs', 'basil', 'oil', 'egg', 'peas', 'spice', 'broccoli', 'pecan', 'slaw', 'curry', 'stew', 'glaze', 'mango', 'crust', 'pineapple', 'beet', 'ham', 'pesto', 'olives', 'celery', 'sage', 'crab', 'pizza', 'vegetable', 'baby', 'banana', 'noodles', 'blueberry', 'tacos', 'peppers', 'fish', 'pistachio', 'hazelnut', 'chipotle', 'sandwich', 'miso', 'butternut', 'spring', 'chili', 'thyme', 'herbs', 'ingredient', 'salt', 'dip', 'cheesecake', 'pancakes', 'gravy']


In [16]:
len([ 'spinach', 'herb', 'sausage', 'mushrooms', 'ricotta', 'kale', 'chops', 'avocado', 'maple', 'sandwiches', 'strawberry', 'cauliflower', 'goat', 'zucchini', 'vanilla', 'eggplant', 'arugula', 'cucumber', 'vegetables', 'parmesan', 'feta', 'asparagus', 'cabbage', 'peanut', 'carrot', 'fruit', 'buttermilk', 'raspberry', 'walnut', 'sugar', 'pear', 'carrots', 'sesame', 'ribs', 'basil', 'egg', 'peas', 'spice', 'broccoli', 'pecan', 'curry', 'stew', 'glaze', 'mango', 'pineapple', 'beet', 'ham', 'pesto', 'olives', 'celery', 'sage', 'crab', 'pizza', 'banana', 'noodles', 'blueberry', 'tacos', 'peppers', 'fish', 'pistachio', 'hazelnut', 'chipotle', 'miso', 'butternut', 'chili', 'thyme', 'herbs', 'dip', 'cheesecake', 'pancakes']
)

70

In [3]:
print(target_ingredients['term'].tolist())

['greens', 'style', 'spinach', 'herb', 'sausage', 'mushrooms', 'ricotta', 'kale', 'chops', 'avocado', 'maple', 'sandwiches', 'strawberry', 'cauliflower', 'goat', 'zucchini', 'vanilla', 'eggplant', 'arugula', 'cucumber', 'vegetables', 'parmesan', 'feta', 'asparagus', 'pot', 'cabbage', 'peanut', 'carrot', 'fruit', 'buttermilk', 'raspberry', 'walnut', 'sugar', 'pear', 'carrots', 'summer', 'sesame', 'ribs', 'basil', 'oil', 'egg', 'peas', 'spice', 'broccoli', 'pecan', 'slaw', 'curry', 'stew', 'glaze', 'mango', 'crust', 'pineapple', 'beet', 'ham', 'pesto', 'olives', 'celery', 'sage', 'crab', 'pizza', 'vegetable', 'baby', 'banana', 'noodles', 'blueberry', 'tacos', 'peppers', 'fish', 'pistachio', 'hazelnut', 'chipotle', 'sandwich', 'miso', 'butternut', 'spring', 'chili', 'thyme', 'herbs', 'ingredient', 'salt', 'dip', 'cheesecake', 'pancakes', 'gravy', 'steaks', 'wine', 'syrup', 'meatballs', 'cinnamon', 'milk', 'burgers', 'rosemary', 'parsley', 'cakes', 'nut', 'toast', 'pea', 'cheddar', 'dill',

In [8]:
list_ingredients=[ 'spinach', 'herb', 'sausage', 'mushrooms', 'ricotta', 'kale', 'chops', 'avocado', 'maple', 'sandwiches', 'strawberry', 'cauliflower', 'goat', 'zucchini', 'vanilla', 'eggplant', 'arugula', 'cucumber', 'vegetables', 'parmesan', 'feta', 'asparagus', 'cabbage', 'peanut', 'carrot', 'fruit', 'buttermilk', 'raspberry', 'walnut', 'sugar', 'pear', 'carrots', 'sesame', 'ribs', 'basil', 'egg', 'peas', 'spice', 'broccoli', 'pecan', 'curry', 'stew', 'glaze', 'mango', 'pineapple', 'beet', 'ham', 'pesto', 'olives', 'celery', 'sage', 'crab', 'pizza', 'banana', 'noodles', 'blueberry', 'tacos', 'peppers', 'fish', 'pistachio', 'hazelnut', 'chipotle', 'miso', 'butternut', 'chili', 'thyme', 'dip', 'cheesecake', 'pancakes','gravy', 'steaks', 'syrup', 'meatballs', 'milk', 'burgers', 'nut', 'toast', 'pea', 'cheddar', 'peach', 'brussels', 'prosciutto', 'apples', 'duck', 'tenderloin','spaghetti', 'horseradish', 'walnuts', 'rib']
len(list_ingredients)

89

In [9]:
def remove_plural_duplicates(ingredients):
    cleaned = set()
    for item in ingredients:
        singular = item.rstrip('s') if item.endswith('s') else item
        # Solo agregamos el plural si no existe la forma singular
        if singular not in cleaned:
            cleaned.add(item)
    return list(cleaned)


cleaned_items = remove_plural_duplicates(list_ingredients)
print(len(cleaned_items))
print(cleaned_items)

87
['pea', 'cheesecake', 'pear', 'peach', 'glaze', 'raspberry', 'spinach', 'meatballs', 'nut', 'buttermilk', 'strawberry', 'chops', 'apples', 'burgers', 'egg', 'celery', 'ricotta', 'cauliflower', 'sugar', 'avocado', 'spaghetti', 'gravy', 'crab', 'brussels', 'feta', 'asparagus', 'herb', 'pancakes', 'peppers', 'sandwiches', 'sausage', 'dip', 'curry', 'horseradish', 'ham', 'mango', 'pizza', 'maple', 'carrot', 'hazelnut', 'ribs', 'rib', 'miso', 'goat', 'olives', 'vegetables', 'broccoli', 'duck', 'prosciutto', 'fish', 'peas', 'chipotle', 'sesame', 'zucchini', 'spice', 'kale', 'syrup', 'chili', 'walnut', 'eggplant', 'tenderloin', 'pineapple', 'sage', 'banana', 'cucumber', 'peanut', 'toast', 'arugula', 'cheddar', 'stew', 'tacos', 'vanilla', 'noodles', 'mushrooms', 'pesto', 'steaks', 'milk', 'beet', 'blueberry', 'butternut', 'basil', 'parmesan', 'thyme', 'fruit', 'pecan', 'pistachio', 'cabbage']


In [9]:
import csv
from collections import Counter

# Path to your CSV file
csv_path = "/ghome/c5mcv04/MCV-C5-2025-Team4/w5/enhanced_ingredient_analysis.csv"  # Change this to your actual filename

# Read data from CSV
items = []
with open(csv_path, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)  # Using DictReader to handle columns by name
    for row in reader:
        try:
            term = row['term'].strip()
            count = int(row['count'])
            percentage = float(row['percentage'])
            # We'll calculate representation ourselves, ignoring the CSV's representation column
            items.append((term, count, percentage))
        except (ValueError, KeyError):
            continue  # Skip rows with missing/malformed data

if not items:
    print("No valid items found in the CSV file.")
    exit()

# Total count across all items
total_count = sum(count for _, count, _ in items)

if total_count == 0:
    print("Total count is zero - cannot calculate category sums.")
    exit()

# Category-wise sum of counts
representation_sums = defaultdict(int)

# Classify and sum based on percentage values
for term, count, percentage in items:
    if percentage < 0.40:
        category = 'discarded'
    elif percentage < 1:
        category = 'underrepresented'
    else:
        category = 'common'
    representation_sums[category] += count

# Display final results
print("\nRepresentation analysis Before Generation:")
print(f"Total items processed: {len(items)}")
print(f"Total count: {total_count:,}\n")

print("Percentage of total counts by representation category:")
for category in ['discarded', 'underrepresented', 'common']:
    category_sum = representation_sums.get(category, 0)
    pct_of_total = (category_sum / total_count) * 100
    print(f"{category.capitalize():<15}: {pct_of_total:.2f}% ({category_sum:,}/{total_count:,})")

# Bonus: Show top 3 items in each category
print("\nTop items by category:")
category_items = {'discarded': [], 'underrepresented': [], 'common': []}
for term, count, percentage in items:
    if percentage < 0.40:
        category_items['discarded'].append((term, count, percentage))
    elif percentage < 1:
        category_items['underrepresented'].append((term, count, percentage))
    else:
        category_items['common'].append((term, count, percentage))

for category in ['common', 'underrepresented', 'discarded']:
    print(f"\n{category.capitalize()} (top 3):")
    sorted_items = sorted(category_items[category], key=lambda x: x[1], reverse=True)[:3]
    for term, count, percentage in sorted_items:
        print(f"  {term:<15}: {count:,} ({percentage:.2f}%)")



Representation analysis Before Generation:
Total items processed: 5716
Total count: 38,007

Percentage of total counts by representation category:
Discarded      : 46.42% (17,644/38,007)
Underrepresented: 21.83% (8,296/38,007)
Common         : 31.75% (12,067/38,007)

Top items by category:

Common (top 3):
  salad          : 788 (7.32%)
  chicken        : 698 (6.48%)
  sauce          : 549 (5.10%)

Underrepresented (top 3):
  style          : 107 (0.99%)
  spinach        : 106 (0.98%)
  herb           : 104 (0.97%)

Discarded (top 3):
  quinoa         : 42 (0.39%)
  verde          : 42 (0.39%)
  watermelon     : 42 (0.39%)


In [8]:
import csv
from collections import Counter
from collections import defaultdict
# Path to your CSV file
csv_path = "/ghome/c5mcv04/MCV-C5-2025-Team4/w5/enhanced_ingredient_analysis_withgenerated.csv"  # Change this to your actual filename

# Read data from CSV
items = []
with open(csv_path, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)  # Using DictReader to handle columns by name
    for row in reader:
        try:
            term = row['term'].strip()
            count = int(row['count'])
            percentage = float(row['percentage'])
            # We'll calculate representation ourselves, ignoring the CSV's representation column
            items.append((term, count, percentage))
        except (ValueError, KeyError):
            continue  # Skip rows with missing/malformed data

if not items:
    print("No valid items found in the CSV file.")
    exit()

# Total count across all items
total_count = sum(count for _, count, _ in items)

if total_count == 0:
    print("Total count is zero - cannot calculate category sums.")
    exit()

# Category-wise sum of counts
representation_sums = defaultdict(int)

# Classify and sum based on percentage values
for term, count, percentage in items:
    if percentage < 0.40:
        category = 'discarded'
    elif percentage < 1:
        category = 'underrepresented'
    else:
        category = 'common'
    representation_sums[category] += count

# Display final results
print("\nRepresentation analysis after Generation:")
print(f"Total items processed: {len(items)}")
print(f"Total count: {total_count:,}\n")

print("Percentage of total counts by representation category:")
for category in ['discarded', 'underrepresented', 'common']:
    category_sum = representation_sums.get(category, 0)
    pct_of_total = (category_sum / total_count) * 100
    print(f"{category.capitalize():<15}: {pct_of_total:.2f}% ({category_sum:,}/{total_count:,})")

# Bonus: Show top 3 items in each category
print("\nTop items by category:")
category_items = {'discarded': [], 'underrepresented': [], 'common': []}
for term, count, percentage in items:
    if percentage < 0.40:
        category_items['discarded'].append((term, count, percentage))
    elif percentage < 1:
        category_items['underrepresented'].append((term, count, percentage))
    else:
        category_items['common'].append((term, count, percentage))

for category in ['common', 'underrepresented', 'discarded']:
    print(f"\n{category.capitalize()} (top 3):")
    sorted_items = sorted(category_items[category], key=lambda x: x[1], reverse=True)[:3]
    for term, count, percentage in sorted_items:
        print(f"  {term:<15}: {count:,} ({percentage:.2f}%)")




Representation analysis after Generation:
Total items processed: 6055
Total count: 44,918

Percentage of total counts by representation category:
Discarded      : 44.09% (19,805/44,918)
Underrepresented: 21.37% (9,600/44,918)
Common         : 34.54% (15,513/44,918)

Top items by category:

Common (top 3):
  salad          : 1,016 (8.12%)
  chicken        : 867 (6.93%)
  sauce          : 684 (5.47%)

Underrepresented (top 3):
  carrots        : 124 (0.99%)
  mustard        : 124 (0.99%)
  rosemary       : 123 (0.98%)

Discarded (top 3):
  apples         : 49 (0.39%)
  tofu           : 49 (0.39%)
  barbecue       : 48 (0.38%)
