In [86]:
# %% [markdown]
# # 1. Imports and Initial Setup
#
# Import necessary libraries and download NLTK data.

# %%
import nltk
import pandas as pd
import numpy as np
import re
import ast  # To safely evaluate string literals
from tqdm.notebook import tqdm

# NLP Preprocessing
from nltk.corpus import stopwords
from nltk import word_tokenize

# Word2Vec
from gensim.models import Word2Vec

# Similarity
from sklearn.metrics.pairwise import cosine_similarity


# NLTK Data Download (run once if needed)
def download_nltk_resource(resource_id, resource_name):
    try:
        nltk.data.find(resource_name)
    except LookupError:
        print(f"Downloading NLTK resource: {resource_id}")
        nltk.download(resource_id)


download_nltk_resource('stopwords', 'corpora/stopwords')
download_nltk_resource('punkt', 'tokenizers/punkt')

# Initialize global resources
stop_words = set(stopwords.words('english'))
# Compile regex for efficiency
non_alpha_regex = re.compile('[^a-zA-Z ]')

# Configure tqdm for pandas
tqdm.pandas()

In [87]:
# %% [markdown]
# # 2. Preprocessing Functions (Phrase-Aware for Word2Vec)
#
# Define preprocessing functions. The key change is treating multi-word ingredients and techniques as single tokens for Word2Vec by joining them with underscores.

# %%
def preprocess_item_list_for_w2v(item_list: list) -> list:
    """
    Preprocesses a list of ingredient or technique strings for Word2Vec.
    - Lowercase
    - Removes non-alphabetic characters (keeping spaces initially)
    - Replaces spaces within items with underscores to treat them as single tokens.
    - Filters out very short items and stopwords (optional, here we keep stopwords
      to preserve phrases like 'cream_of_tartar').
    Returns a list of processed, unique strings (tokens/phrases).
    """
    processed_items = set()
    for item_phrase in item_list:
        # Basic cleaning: lowercase, remove non-alpha (keep spaces for now)
        cleaned_phrase = non_alpha_regex.sub('', item_phrase.lower()).strip()
        # Remove extra spaces
        cleaned_phrase = re.sub(r'\s+', ' ', cleaned_phrase).strip()

        if cleaned_phrase and len(cleaned_phrase) > 1:
            # Replace spaces with underscores to form single tokens
            token_phrase = cleaned_phrase.replace(' ', '_')
            # Optional: Add further filtering if needed (e.g., length)
            # if len(token_phrase) > 2: # Example filter
            processed_items.add(token_phrase)

    return list(processed_items)


# --- Helper Function for String List Evaluation ---
def safe_eval_list_string(list_string: str) -> list:
    """Safely evaluate a string representation of a list."""
    if not isinstance(list_string, str): return []
    try:
        evaluated = ast.literal_eval(list_string)
        return evaluated if isinstance(evaluated, list) else []
    except (ValueError, SyntaxError, TypeError):
        return []

In [88]:
# %% [markdown]
# # 3. Data Loading and Preprocessing Execution
#
# Load the dataset and apply the new phrase-aware preprocessing.

# %%
DATASET_PATH = '../dataset/RAW_merged.csv'  # Make sure this path is correct
df = pd.DataFrame()  # Initialize df

try:
    print(f"Loading dataset from: {DATASET_PATH}")
    df = pd.read_csv(DATASET_PATH)
    print(f"Dataset loaded with {len(df)} rows.")

    # Drop rows with missing essential data
    df.dropna(subset=['ingredients', 'techniques_list'], inplace=True)
    print(f"Rows after removing NaNs in ingredients/techniques: {len(df)}")

    print("Converting string columns to lists...")
    # Use progress_apply for visual feedback with tqdm
    df['ingredients_list'] = df['ingredients'].progress_apply(safe_eval_list_string)
    df['techniques_list_eval'] = df['techniques_list'].progress_apply(safe_eval_list_string)

    print("Applying phrase-aware preprocessing for Word2Vec...")
    # Apply the NEW preprocessing to both ingredients and techniques
    df['ingredients_processed_w2v'] = df['ingredients_list'].progress_apply(preprocess_item_list_for_w2v)
    df['techniques_processed_w2v'] = df['techniques_list_eval'].progress_apply(preprocess_item_list_for_w2v)

    # Display processed data preview
    print("\nPreview of the processed DataFrame:")
    print(df[['ingredients_list', 'ingredients_processed_w2v', 'techniques_list_eval',
              'techniques_processed_w2v']].head())

except FileNotFoundError:
    print(f"ERROR: Dataset file not found at '{DATASET_PATH}'")
except Exception as e:
    print(f"ERROR during data loading/preprocessing: {e}")


Loading dataset from: dataset/RAW_merged.csv
Dataset loaded with 178265 rows.
Rows after removing NaNs in ingredients/techniques: 178265
Converting string columns to lists...


  0%|          | 0/178265 [00:00<?, ?it/s]

  0%|          | 0/178265 [00:00<?, ?it/s]

Applying phrase-aware preprocessing for Word2Vec...


  0%|          | 0/178265 [00:00<?, ?it/s]

  0%|          | 0/178265 [00:00<?, ?it/s]


Preview of the processed DataFrame:
                                    ingredients_list  \
0  [winter squash, mexican seasoning, mixed spice...   
1  [prepared pizza crust, sausage patty, eggs, mi...   
2  [spreadable cheese with garlic and herbs, new ...   
3  [milk, vanilla ice cream, frozen apple juice c...   
4  [fennel seeds, green olives, ripe olives, garl...   

                           ingredients_processed_w2v  \
0  [olive_oil, winter_squash, honey, mexican_seas...   
1  [eggs, milk, sausage_patty, prepared_pizza_cru...   
2  [shallots, parsley, olive_oil, new_potatoes, r...   
3  [apple, vanilla_ice_cream, frozen_apple_juice_...   
4  [garlic, orange_juice, orange_rind, green_oliv...   

                    techniques_list_eval  \
0                    [bake, grate, melt]   
1                    [bake, pour, whisk]   
2              [bake, boil, dice, drain]   
3               [blend, combine, smooth]   
4  [crush, marinate, refrigerate, toast]   

                techniqu

In [89]:
# %% [markdown]
# # 4. Word2Vec Model Training
#
# Train the Word2Vec model using the processed ingredients and techniques, where multi-word items are treated as single tokens (e.g., 'tomato_puree').

# %%
model_w2v = None  # Initialize model variable

if not df.empty and 'ingredients_processed_w2v' in df.columns and 'techniques_processed_w2v' in df.columns:
    # Word2Vec Parameters
    VECTOR_SIZE = 100  # Dimensionality of the word vectors
    WINDOW_SIZE = 5  # Context window size
    MIN_COUNT = 3  # Ignore words/phrases with frequency lower than this
    SG_MODEL = 1  # 1 for Skip-gram, 0 for CBOW
    EPOCHS = 5  # Number of training iterations (increase for potentially better results)
    WORKERS = -1  # Use all available CPU cores

    print("Preparing data for Word2Vec training...")
    # Combine ingredient and technique lists for each recipe
    # Each item in the list is now a potential phrase (e.g., 'tomato_puree' or 'bake')
    corpus_for_w2v = df['ingredients_processed_w2v'] + df['techniques_processed_w2v']
    # Filter out any potentially empty lists that resulted from preprocessing
    corpus_for_w2v = [item_list for item_list in corpus_for_w2v if item_list]
    if corpus_for_w2v:
        print(f"Training Word2Vec model on {len(corpus_for_w2v)} documents...")
        # Train the model - corpus_for_w2v is already a list of lists of "tokens" (words or underscore_phrases)
        model_w2v = Word2Vec(sentences=corpus_for_w2v,
                             vector_size=VECTOR_SIZE,
                             window=WINDOW_SIZE,
                             min_count=MIN_COUNT,
                             sg=SG_MODEL,
                             epochs=EPOCHS,
                             workers=WORKERS)
        print("Word2Vec training completed.")
        print(f"Vocabulary size: {len(model_w2v.wv.index_to_key)}")

        # Optional: Save the model for later use
        # model_w2v.save("cooking_word2vec.model")
        # print("Model saved to cooking_word2vec.model")

    else:
        print("ERROR: No valid textual data found to train Word2Vec after preprocessing.")
else:
    print("ERROR: DataFrame is empty or required processed columns are missing. Cannot train Word2Vec.")


Preparing data for Word2Vec training...
Training Word2Vec model on 178265 documents...
Word2Vec training completed.
Vocabulary size: 8716


In [90]:
# %% [markdown]
# # 5. Prepare Technique Vectors for Prediction
#
# Extract the vectors for all unique processed techniques from the trained Word2Vec model.

# %%
techniques_vectors = {}  # Dictionary to store {technique_phrase: vector}

if model_w2v:
    print("Extracting technique vectors from the trained Word2Vec model...")
    # Get unique techniques from the processed column
    if 'techniques_processed_w2v' in df.columns:
        # Explode the list column, drop NaNs, get unique values
        unique_processed_techniques = df['techniques_processed_w2v'].explode().dropna().unique()
        print(f"Found {len(unique_processed_techniques)} unique processed techniques.")

        count_found = 0
        count_missing = 0
        missing_techniques = []

        for technique in unique_processed_techniques:
            if technique in model_w2v.wv:
                techniques_vectors[technique] = model_w2v.wv[technique]
                count_found += 1
            else:
                # This happens if a technique appeared less than MIN_COUNT times
                count_missing += 1
                missing_techniques.append(technique)

        print(f"Successfully extracted vectors for {count_found} techniques.")
        if count_missing > 0:
            print(
                f"WARNING: {count_missing} techniques were not found in the Word2Vec vocabulary (likely due to min_count={MIN_COUNT}).")
            # print(f"Missing techniques sample: {missing_techniques[:20]}") # Optional: show missing ones
    else:
        print("ERROR: Column 'techniques_processed_w2v' not found in DataFrame.")
else:
    print("ERROR: Word2Vec model ('model_w2v') is not available. Cannot extract technique vectors.")

# Optional: Inspect some extracted vectors
if techniques_vectors:
    print("\nSample of techniques with extracted vectors:")
    print(list(techniques_vectors.keys())[:15])


Extracting technique vectors from the trained Word2Vec model...
Found 57 unique processed techniques.
Successfully extracted vectors for 57 techniques.

Sample of techniques with extracted vectors:
['melt', 'bake', 'grate', 'pour', 'whisk', 'dice', 'boil', 'drain', 'combine', 'blend', 'smooth', 'crush', 'refrigerate', 'toast', 'marinate']


In [91]:
# %% [markdown]
# # 6. Prediction Function (Average Vector Similarity)
#
# Define the prediction function using the average vector approach. It calculates a single vector for the input ingredients and finds the closest technique vectors.

# %%
def predict_cooking_methods_avg_vector(ingredients_str: str,
                                       technique_vectors_dict: dict,
                                       w2v_model: Word2Vec,
                                       top_n: int = 5,
                                       debug: bool = False) -> list or str:
    """
    Predicts cooking techniques based on similarity to the average vector of input ingredients.
    """
    if not w2v_model: return "Error: Word2Vec model not available."
    if not technique_vectors_dict: return "Error: No technique vectors available."
    if not ingredients_str: return "Error: Input ingredients string is empty."

    # --- 1. Preprocess input ingredients (using the same method as training) ---
    ingredient_input_list = [ing.strip() for ing in ingredients_str.split(',') if ing.strip()]
    if not ingredient_input_list: return "Input ingredients list is empty after cleaning."
    processed_input_ingredients = preprocess_item_list_for_w2v(ingredient_input_list)

    if debug: print(f"DEBUG: Processed input ingredients (for W2V lookup): {processed_input_ingredients}")

    # --- 2. Get vectors for valid input ingredients and calculate average vector ---
    valid_ingredient_vectors = []
    not_found_ingredients = []
    for ingredient_token in processed_input_ingredients:
        if ingredient_token in w2v_model.wv:
            valid_ingredient_vectors.append(w2v_model.wv[ingredient_token])
        else:
            not_found_ingredients.append(ingredient_token)

    if debug:
        print(f"DEBUG: Found vectors for {len(valid_ingredient_vectors)} ingredients.")
        if not_found_ingredients: print(f"DEBUG: Ingredients not found in W2V vocab: {not_found_ingredients}")

    if not valid_ingredient_vectors:
        return "None of the input ingredients were found in the model's vocabulary."

    # Calculate the average vector for the input ingredients
    average_ingredient_vector = np.mean(valid_ingredient_vectors, axis=0).reshape(1, -1)

    if debug: print(f"DEBUG: Calculated average ingredient vector shape: {average_ingredient_vector.shape}")

    # --- 3. Calculate similarity between average ingredient vector and all technique vectors ---
    technique_similarities = {}
    tech_items = list(technique_vectors_dict.items())  # List of (technique_name, vector)

    for technique, tech_vector in tech_items:
        try:
            # Ensure tech_vector is valid numpy array for cosine_similarity
            if isinstance(tech_vector, np.ndarray) and tech_vector.shape[0] == w2v_model.vector_size:
                similarity = cosine_similarity(average_ingredient_vector, tech_vector.reshape(1, -1))[0][0]
                if not np.isnan(similarity):  # Check for NaN results
                    technique_similarities[technique] = similarity
            # else: # Optional: Log invalid technique vectors
            #    if debug: print(f"DEBUG: Skipping invalid vector for technique: {technique}")
        except Exception as e:
            if debug: print(f"DEBUG: Error calculating similarity for {technique}: {e}")

    if not technique_similarities:
        return "Could not calculate similarities for any techniques."

    # --- 4. Sort techniques by similarity and return top N ---
    # Sort the dictionary by similarity scores in descending order
    sorted_techniques = sorted(technique_similarities.items(), key=lambda item: item[1], reverse=True)

    if debug: print(f"DEBUG: Top 10 techniques by similarity: {sorted_techniques[:10]}")

    # Extract just the names of the top N techniques
    top_predicted_techniques = [tech_name for tech_name, score in sorted_techniques[:top_n]]

    return top_predicted_techniques


In [92]:
# %% [markdown]
# # 7. Example Usage of the Prediction Function
#
# Test the prediction function with a sample list of ingredients.

# %%
if model_w2v and techniques_vectors:
    # Example input
    new_ingredients_example = "tomato puree, lemon juice, salt, oregano, basil, thyme, garlic powder"
    print(f"\nPredicting techniques for ingredients: '{new_ingredients_example}'")
    print("(Using average ingredient vector similarity approach)")

    # Call the prediction function
    predicted_methods = predict_cooking_methods_avg_vector(
        ingredients_str=new_ingredients_example,
        technique_vectors_dict=techniques_vectors,
        w2v_model=model_w2v,
        top_n=5,
        debug=True  # Enable debug messages
    )

    print("\n-------------------------------------")
    if isinstance(predicted_methods, list):
        print(f"Predicted cooking techniques (Top {len(predicted_methods)}):")
        # Replace underscores with spaces for better readability in the final output
        for i, method in enumerate(predicted_methods):
            print(f"{i + 1}. {method.replace('_', ' ')}")
    else:
        # Print error message if prediction failed
        print(f"Prediction result: {predicted_methods}")
    print("-------------------------------------")

else:
    print("\nCannot run prediction example: Word2Vec model or technique vectors are not available.")
    print(f"Is model_w2v trained? {'Yes' if model_w2v else 'No'}")
    print(f"Are techniques_vectors populated? {'Yes' if techniques_vectors else 'No'}")


Predicting techniques for ingredients: 'tomato puree, lemon juice, salt, oregano, basil, thyme, garlic powder'
(Using average ingredient vector similarity approach)
DEBUG: Processed input ingredients (for W2V lookup): ['basil', 'oregano', 'lemon_juice', 'garlic_powder', 'thyme', 'salt', 'tomato_puree']
DEBUG: Found vectors for 7 ingredients.
DEBUG: Calculated average ingredient vector shape: (1, 100)
DEBUG: Top 10 techniques by similarity: [('poach', 0.21975678), ('griddle', 0.17659804), ('microwave', 0.1519689), ('blend', 0.14328659), ('melt', 0.13427587), ('smooth', 0.13234997), ('brine', 0.13034436), ('drain', 0.13020164), ('combine', 0.12415667), ('shred', 0.108764276)]

-------------------------------------
Predicted cooking techniques (Top 5):
1. poach
2. griddle
3. microwave
4. blend
5. melt
-------------------------------------
