In [43]:
import nltk
import spacy
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, word_tokenize, pos_tag
import re
import pandas as pd
from tqdm import tqdm  # Importa tqdm per la barra di progresso

# Scarica i dati necessari di NLTK
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [44]:
# Inizializza le stopwords di NLTK
stop_words = set(stopwords.words('english'))

# Funzione di pulizia e preprocessing avanzato dei dati per gli ingredienti
def preprocess_ingredients(ingredients):
    ingredients_list = eval(ingredients)
    processed_ingredients = []
    regex = re.compile('[^a-zA-Z ]')
    lemmatizer = WordNetLemmatizer()

    # POS tags that represent nouns
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']

    # Define the words to be dropped
    #words_to_drop = {"powder", "brown", "salt", "water", "sugar", "onion", "butter", "pepper", "ground", "cream"} 

    for ingr in ingredients_list:
        ingr = regex.sub(' ', ingr.lower()).strip()
        components = [comp.strip() for comp in ingr.split('and')]
        
        for comp in components:
            sentence = ""
            tokens = word_tokenize(comp)  # Tokenize each component
            tagged_tokens = pos_tag(tokens)  # Perform POS tagging
            
            # Extract main nouns while handling compound nouns
            nouns = []
            current_noun = ""
            for word, tag in tagged_tokens:
                word = lemmatizer.lemmatize(word.strip())
                if len(word) > 2 and word not in stop_words and tag in noun_tags: #and word not in words_to_drop
                    if current_noun:
                        nouns.append(current_noun)
                        current_noun = ""
                    current_noun = word
            
            # Add last current noun if exists
            if current_noun:
                nouns.append(current_noun)            
            
            for word in nouns:
                singular_comp = lemmatizer.lemmatize(word.strip())
                if singular_comp not in stop_words and len(singular_comp) > 2:
                    sentence += singular_comp + " "
                    
            if sentence.strip():
                processed_ingredients.append(sentence.strip())

    return list(set(processed_ingredients))

# Funzione di preprocessing per le tecniche
def preprocess_techniques(techniques):
    techniques_list = eval(techniques)
    processed_techniques = []

    for technique in techniques_list:
        technique = technique.lower().strip()
        tokens = word_tokenize(technique)
        tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
        processed_techniques.append(' '.join(tokens))

    return processed_techniques

In [45]:
# Carica il dataset
df = pd.read_csv('dataset/RAW_merged.csv')

tqdm.pandas(desc="Processing Ingredients")
df['ingredients_processed'] = df['ingredients'].progress_apply(preprocess_ingredients)
# Applica il preprocessing sugli ingredienti
#df['ingredients_processed_spacy'] = df['ingredients'].progress_apply(preprocess_ingredients_spacy)

tqdm.pandas(desc="Processing Techniques")
df['techniques_processed'] = df['techniques_list'].progress_apply(preprocess_techniques)

print(df[['ingredients', 'ingredients_processed', 'techniques_list', 'techniques_processed']].head())

Processing Ingredients: 100%|██████████| 178265/178265 [02:22<00:00, 1247.34it/s]
Processing Techniques: 100%|██████████| 178265/178265 [00:08<00:00, 20537.93it/s]

                                         ingredients  \
0  ['winter squash', 'mexican seasoning', 'mixed ...   
1  ['prepared pizza crust', 'sausage patty', 'egg...   
2  ['spreadable cheese with garlic and herbs', 'n...   
3  ['milk', 'vanilla ice cream', 'frozen apple ju...   
4  ['fennel seeds', 'green olives', 'ripe olives'...   

                               ingredients_processed  \
0  [seasoning, salt, honey, spice, oil, winter sq...   
1  [egg, milk, salt, sausage patty, pizza crust, ...   
2  [potato, salt, shallot, wine vinegar, tarragon...   
3  [milk, apple, apple juice concentrate, vanilla...   
4  [fennel seed, garlic, orange juice, peppercorn...   

                                 techniques_list  \
0                      ['bake', 'grate', 'melt']   
1                      ['bake', 'pour', 'whisk']   
2              ['bake', 'boil', 'dice', 'drain']   
3                 ['blend', 'combine', 'smooth']   
4  ['crush', 'marinate', 'refrigerate', 'toast']   

             




In [46]:
from gensim.models import Word2Vec

# df['ingredients'] = df['ingredients'].apply(eval)
# df['techniques'] = df['techniques_list'].apply(lambda x: eval(x) if pd.notnull(x) else [])

all_texts = df['ingredients_processed'] + df['techniques_processed']

model = Word2Vec(all_texts, vector_size=100, window=5, min_count=1, sg=1, epochs=10)

ingredients_vectors = {ingredient: model.wv[ingredient] for ingredient in model.wv.index_to_key}
techniques_vectors = {technique: model.wv[technique] for technique in df['techniques_processed'].explode().dropna().unique() if technique in model.wv}

In [54]:
import pickle
import os # Import the os module

# ... (your existing code up to the creation of techniques_vectors)

# Create the directory if it doesn't exist
model_dir = "models"
os.makedirs(model_dir, exist_ok=True) # exist_ok=True prevents error if directory already exists

# Save the Word2Vec model (as you already do)
model.save(os.path.join(model_dir, "word2vec_ingredients_techniques2.model"))
print(f"Word2Vec model saved to {os.path.join(model_dir, 'word2vec_ingredients_techniques2.model')}")

# Save the techniques_vectors dictionary using pickle
techniques_vectors_path = os.path.join(model_dir, "techniques_vectors2.pkl")
with open(techniques_vectors_path, 'wb') as f:
    pickle.dump(techniques_vectors, f)
print(f"Techniques vectors dictionary saved to {techniques_vectors_path}")

# ... (rest of your script, like the prediction example)

Word2Vec model saved to models/word2vec_ingredients_techniques2.model
Techniques vectors dictionary saved to models/techniques_vectors.pkl


In [47]:
def preprocess_ingredients(ingredients):
    processed_ingredients = []
    regex = re.compile('[^a-zA-Z ]')
    lemmatizer = WordNetLemmatizer()

    # POS tags that represent nouns
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']

    # Define the words to be dropped
    #words_to_drop = {"powder", "brown", "salt", "water", "sugar", "onion", "butter", "pepper", "ground", "cream"} 

    for ingr in ingredients:
        ingr = regex.sub(' ', ingr.lower()).strip()
        components = [comp.strip() for comp in ingr.split('and')]
        
        for comp in components:
            sentence = ""
            tokens = word_tokenize(comp)  # Tokenize each component
            tagged_tokens = pos_tag(tokens)  # Perform POS tagging
            
            # Extract main nouns while handling compound nouns
            nouns = []
            current_noun = ""
            for word, tag in tagged_tokens:
                word = lemmatizer.lemmatize(word.strip())
                if len(word) > 2 and word not in stop_words and tag in noun_tags: #and word not in words_to_drop
                    if current_noun:
                        nouns.append(current_noun)
                        current_noun = ""
                    current_noun = word
            
            # Add last current noun if exists
            if current_noun:
                nouns.append(current_noun)            
            
            for word in nouns:
                singular_comp = lemmatizer.lemmatize(word.strip())
                if singular_comp not in stop_words and len(singular_comp) > 2:
                    sentence += singular_comp + " "
                    
            if sentence.strip():
                processed_ingredients.append(sentence.strip())

    return list(set(processed_ingredients))

In [48]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def predict_cooking_methods(ingredients, techniques_vectors, model, top_n=3):
    ingredients = [ingredient.strip() for ingredient in ingredients.split(",")]
    print(ingredients)
    ingredient_list = preprocess_ingredients(ingredients)
    print(ingredient_list)
    ingredient_vectors = [model.wv[ingredient] for ingredient in ingredient_list if ingredient in model.wv]
    
    if len(ingredient_vectors) == 0:
        return "Ingredienti non trovati nel vocabolario."

    avg_ingredient_vector = np.mean(ingredient_vectors, axis=0).reshape(1, -1)

    similarities = {}
    for technique, technique_vector in techniques_vectors.items():
        normalized_technique_vector = normalize(np.array(technique_vector).reshape(1, -1))
        similarities[technique] = cosine_similarity(avg_ingredient_vector, normalized_technique_vector)[0][0]

    sorted_techniques = sorted(similarities, key=similarities.get, reverse=True)
    
    return sorted_techniques[:top_n] if len(sorted_techniques) > 0 else "Nessuna tecnica di cottura trovata."

In [52]:
# Lista di ingredienti di esempio
new_ingredients = "pasta, onion, tomato, olive oil, salt"

# Predizione
predicted_methods = predict_cooking_methods(new_ingredients, techniques_vectors, model)
print("Metodi di cottura predetti:", predicted_methods)

['pasta', 'onion', 'tomato', 'olive oil', 'salt']
['tomato', 'pasta', 'salt', 'onion', 'oil']
Metodi di cottura predetti: ['parboil', 'dice', 'drain']
