In [237]:
import csv
import math
import random
import re

import nltk
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config
from transformers import get_linear_schedule_with_warmup

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [238]:
# Training hyperparameters.
epochs = 3
learning_rate = 1e-4
# Number of warmup steps handed to the LR scheduler. Kept as an int because
# the scheduler documents `num_warmup_steps` as an integer step count.
warmup_steps = 100
# Passed to the optimizer as its `eps` argument.
epsilon = 1e-8
model_name = "gpt2"
batch_size = 2


# Log the current batch loss every `sample_every` training steps.
sample_every = 1000
# Save a model checkpoint every `save_every` training steps.
save_every = 5000
# Location passed to `save_pretrained` for checkpoints and the final model.
save_model = "trial_2"

In [239]:
# load and also preprocess the raw data
def load_preprocess_raw_data(raw_data, max_chars=2000):
    """Read a recipe CSV and build one training string per row.

    Each row must provide an ``ingredients`` and a ``steps`` column. Both are
    lower-cased, stripped of single quotes and square brackets, and joined as
    ``[BOS]<ingredients>[STEPS]<steps>[EOS]``.

    Args:
        raw_data: Path to the CSV file (read as UTF-8).
        max_chars: Recipes whose formatted string is longer than this many
            characters are dropped. ``None`` disables the length filter.

    Returns:
        List of formatted recipe strings, in file order.

    Raises:
        KeyError: If a row lacks the ``ingredients`` or ``steps`` column.
    """
    # Presumably both columns hold the repr of a Python list; removing the
    # quotes and brackets leaves a plain comma-separated string.
    list_markup = str.maketrans('', '', "'[]")

    recipe_instances = []
    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields are parsed correctly on every platform.
    with open(raw_data, 'r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            ingredients = row['ingredients'].lower().translate(list_markup)
            instructions = row['steps'].lower().translate(list_markup)

            recipe_instance = '[BOS]' + ingredients + '[STEPS]' + instructions + '[EOS]'

            if max_chars is None or len(recipe_instance) <= max_chars:
                recipe_instances.append(recipe_instance)

    return recipe_instances

In [240]:
# create text list for dataset
# https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions/data
recipe_list = load_preprocess_raw_data("dataset/RAW_recipes.csv")

# Draw the 10% subsample from a dedicated, seeded RNG so the subset (and the
# train/test split derived from it) is the same on every fresh run; the global
# seed is only set in a later cell and would not cover this call.
reduced_recipe_list = random.Random(42).sample(recipe_list, int(0.1 * len(recipe_list)))
print(reduced_recipe_list[:1])

# 80/20 split by plain list slicing: the recipes stay ordinary Python strings
# instead of being copied into a fixed-width NumPy unicode array by np.split.
train_cutoff = int(.8*len(reduced_recipe_list))
train_list = reduced_recipe_list[:train_cutoff]
test_list = reduced_recipe_list[train_cutoff:]
print('\nNumber of train data: ', len(train_list))
print('Number of test data: ', len(test_list))

['[BOS]vanilla ice cream, brandy, white creme de cacao, black coffee[STEPS]place all ingredients in a blender, blend on high speed until smooth, refrigerate at least 2 hours, "dont skip this step !", blend quickly on high just before serving[EOS]']

Number of train data:  18304
Number of test data:  4576


In [241]:
# Build the GPT-2 tokenizer with the recipe start/end/padding markers
# registered as its BOS, EOS and PAD tokens.
tokenizer = GPT2Tokenizer.from_pretrained(
    model_name,
    bos_token='[BOS]',
    eos_token='[EOS]',
    pad_token='[PAD]',
)
# '[STEPS]' separates the ingredient list from the instructions
# ('[INGREDIENTS]' is currently not used).
special_tokens_dict = dict(additional_special_tokens=['[STEPS]'])
# Register the separator and report how many tokens were added.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

We have added 1 tokens


In [242]:
class GPT2Dataset(Dataset):
  """Pre-tokenised text dataset for language-model fine-tuning.

  Every text is preprocessed, tokenised, truncated/padded to ``max_length``
  and stored as a pair of tensors (token ids, attention mask).
  """

  def __init__(self, txt_list, tokenizer, max_length=768):
    """Tokenise all texts up front.

    Args:
      txt_list: Iterable of raw text strings.
      tokenizer: Callable tokenizer returning ``input_ids`` and
        ``attention_mask`` for a string.
      max_length: Fixed sequence length used for truncation and padding.
    """
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:
        txt = self.custom_preprocessing(txt)

        encodings_dict = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")

        self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
        self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def custom_preprocessing(self, text):
    """Lowercase ``text`` while leaving the tokenizer's special tokens intact.

    A plain ``text.lower()`` would turn markers such as ``[BOS]`` into
    ``[bos]``, which the tokenizer no longer recognises as special tokens.
    Only the text between special tokens is lowercased.
    """
    special_tokens = [tok for tok in getattr(self.tokenizer, 'all_special_tokens', []) if tok]
    if not special_tokens:
        return text.lower()

    # Longest tokens first so a token that is a prefix of another cannot shadow it.
    ordered = sorted(special_tokens, key=len, reverse=True)
    # The capture group makes re.split keep the matched special tokens.
    pattern = '(' + '|'.join(re.escape(tok) for tok in ordered) + ')'
    pieces = re.split(pattern, text)
    return ''.join(piece if piece in special_tokens else piece.lower() for piece in pieces)

  def __len__(self):
    """Return the number of stored examples."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask)`` for example ``idx``."""
    return self.input_ids[idx], self.attn_masks[idx]

In [243]:
dataset = GPT2Dataset(train_list, tokenizer, max_length=200)

# Split into training and validation sets (80% / 20%).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Use a dedicated seeded generator: the global torch seed is only set in a
# later cell, so without it the split would differ on every fresh run.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

14,643 training samples
3,661 validation samples


In [244]:
# Batch iterators for training and validation.
# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# Validation order is irrelevant, so the samples are read sequentially.
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)

In [245]:
# Seed every RNG *before* the model is built, so that anything initialised
# while loading/resizing the model (e.g. embedding rows for the newly added
# tokens) is covered by the seed as well.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# `device` is a torch.device object, so test its `.type`; comparing the object
# itself with the string "cuda" does not match and left this branch dead.
if device.type == "cuda":
    torch.cuda.manual_seed_all(seed_val)

# The configuration is loaded unchanged apart from disabling hidden-state output.
configuration = GPT2Config.from_pretrained(model_name, output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)

# this step is necessary because tokens (bos_token, etc.) were added to the
# tokenizer; otherwise the tokenizer and the model embeddings won't match up
model.resize_token_embeddings(len(tokenizer))

# With several GPUs, replicate the model across them.
# NOTE(review): the DataParallel wrapper does not expose `save_pretrained` /
# `generate` directly; callers need `model.module` in that case.
if device.type == "cuda" and torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)



In [246]:
# Note: this is PyTorch's own AdamW implementation (torch.optim.AdamW), not a
# class from the Hugging Face library. It is configured with the learning rate
# and epsilon defined in the parameters cell.
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

In [247]:
# The scheduler is stepped once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps -- this counts batches, not samples.
total_steps = len(train_dataloader) * epochs
print('Total number of steps: ', total_steps)
# Learning-rate schedule with `warmup_steps` warmup steps over `total_steps`
# training steps; it is advanced inside the training loop.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

Total number of steps:  21966


In [248]:
# Fine-tuning loop: one training pass and one validation pass per epoch.
# Per-epoch loss and perplexity are collected in `training_stats`.
training_stats = []
print("Currently using device type: ", device)

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    losses = []

    total_train_loss = 0

    model.train()

    loop = tqdm(train_dataloader, leave=True)
    for step, batch in enumerate(loop):

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Labels are the inputs themselves, except at padded positions: those
        # are set to -100, the index the LM loss ignores. Otherwise the loss
        # (and the perplexity) would mostly measure how well '[PAD]' is predicted.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()

        outputs = model(  b_input_ids,
                          labels=b_labels,
                          attention_mask =b_masks,
                          token_type_ids=None
                        )

        # .mean() is a no-op for a scalar loss and collapses the per-replica
        # losses returned when the model is wrapped in DataParallel.
        loss = outputs[0].mean()

        batch_loss = loss.item()
        total_train_loss += batch_loss
        losses.append(batch_loss)

        # Log the loss every `sample_every` batches.
        if step % sample_every == 0 and not step == 0:
            print('Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.'.format(step, len(train_dataloader), batch_loss))

        loss.backward()

        optimizer.step()

        scheduler.step()

        if step % save_every == 0:
            # DataParallel exposes the wrapped model as `.module`; only the
            # underlying model provides save_pretrained.
            getattr(model, "module", model).save_pretrained(save_model)

        loop.set_postfix(loss=batch_loss)

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Perplexity is the exponential of the mean batch loss.
    losses = torch.tensor(losses)
    train_perplexity = math.exp(torch.mean(losses))

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Perplexity: {0:.2f}".format(train_perplexity))
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    model.eval()

    losses = []
    total_eval_loss = 0
    nb_eval_steps = 0


    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Same padding mask as in training so both losses are comparable.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        with torch.no_grad():

            outputs  = model(b_input_ids,
                             attention_mask = b_masks,
                             labels=b_labels)

            loss = outputs[0].mean()

        batch_loss = loss.item()
        losses.append(batch_loss)
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Perplexity is the exponential of the mean batch loss.
    losses = torch.tensor(losses)
    val_perplexity = math.exp(torch.mean(losses))

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation perplexity: {0:.2f}".format(val_perplexity))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Perplexity': train_perplexity,
            'Valid. Perplexity': val_perplexity,
        }
    )

print("")
print("Training complete!")

Currently using device type:  cpu

Training...


  0%|          | 24/7322 [00:27<2:21:46,  1.17s/it, loss=5.59]


KeyboardInterrupt: 

In [None]:
# Save the fine-tuned weights together with the tokenizer: the vocabulary was
# extended with special tokens, so reloading the model later needs the
# matching tokenizer files from the same location.
# getattr(..., "module", ...) unwraps torch.nn.DataParallel when it was applied.
getattr(model, "module", model).save_pretrained(save_model)
tokenizer.save_pretrained(save_model)

In [None]:
# Build the held-out test dataset from test_list.
# NOTE(review): max_length is 768 here, while the training dataset is built
# with max_length=200 -- confirm the difference is intended.
test_dataset = GPT2Dataset(test_list, tokenizer, max_length=768)

In [None]:
# Batch iterator over the test set; samples are read in their stored order.
test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=batch_size,
)

In [None]:
def evaluate_model(model, dataloaded):
    """Compute the average loss and perplexity of ``model`` over a dataloader.

    Args:
        model: Language model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose first output is the loss.
        dataloaded: Sized iterable of ``(input_ids, attention_mask)`` batches.

    Returns:
        Tuple ``(average_loss, perplexity)``; perplexity is the exponential
        of the mean batch loss.

    Raises:
        ZeroDivisionError: If ``dataloaded`` contains no batches.
    """
    model = model.to(device)
    model.eval()

    losses = []
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in dataloaded:

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Padded positions get label -100, the index the LM loss ignores, so
        # the metric reflects real tokens only and does not depend on how much
        # padding the dataset's max_length introduces.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        with torch.no_grad():

            outputs  = model(b_input_ids,
                            attention_mask = b_masks,
                            labels=b_labels)

            # .mean() is a no-op for a scalar loss and collapses per-replica
            # losses when the model is wrapped in DataParallel.
            loss = outputs[0].mean()

        batch_loss = loss.item()
        losses.append(batch_loss)
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(dataloaded)

    # Calculate perplexity.
    losses = torch.tensor(losses)
    val_perplexity = math.exp(torch.mean(losses))

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation perplexity: {0:.2f}".format(val_perplexity))
    return avg_val_loss, val_perplexity

In [None]:
print('Testing...')
test_loss, test_perplexity = evaluate_model(model, test_dataloader)
# Build a one-row frame directly from the two scalars. Assigning a scalar to a
# column of an *empty* DataFrame leaves it with zero rows, which wrote a CSV
# containing only the header.
test_eval_df = pd.DataFrame(
    [{"test_loss": test_loss, "test_perplexity": test_perplexity}],
    columns=["test_loss", "test_perplexity"],
)
test_eval_df.to_csv("test_eval.csv")

In [None]:
# Load the trained GPT-2 model and tokenizer
#model = GPT2LMHeadModel.from_pretrained(save_model)
#tokenizer = GPT2Tokenizer.from_pretrained(save_model)

# Ensure the model is on the right device
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)

In [None]:
# Earlier beam-search-only draft of generate_recipe, disabled by wrapping it in a string literal; the active definition is in the next cell.
"""def generate_recipe(ingredients, model, tokenizer, max_length=400):
    # Prepare the input prompt with the list of ingredients
    input_text = ingredients
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    
    # Generate the recipe
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    
    # Decode the output to get the recipe text
    recipe = tokenizer.decode(output[0], skip_special_tokens=True)
    return recipe"""

In [None]:
def generate_recipe(ingredients, model, tokenizer, max_length=200, temperature=0.1, top_k=50, top_p=0.1):
    """Generate recipe text for a comma-separated ingredient string.

    The prompt is ``[BOS]<ingredients>[STEPS]``, the same layout the training
    strings use up to the instructions.

    Args:
        ingredients: Ingredient text, e.g. ``"flour, sugar"``.
        model: Model exposing ``generate`` (a DataParallel wrapper is unwrapped).
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Passed to ``generate`` as ``max_length``.
        temperature: Sampling temperature; lower values are less random.
        top_k: Number of highest-probability tokens kept for sampling.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The decoded first generated sequence, special tokens included.
    """
    # The training text is lowercased, so lowercase the prompt text as well;
    # the special-token markers themselves must keep their case.
    input_text = '[BOS]' + ingredients.lower() + '[STEPS]'
    encoded = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded.input_ids.to(device)
    # Pass the mask explicitly instead of letting generate() infer it.
    attention_mask = encoded.attention_mask.to(device)

    # DataParallel exposes the wrapped model as `.module`; only that has generate().
    generator = getattr(model, 'module', model)
    output = generator.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        temperature=temperature, # Lower values make the model more confident (less random), while higher values increase randomness.
        top_k=top_k,  # Increase to consider more tokens, decrease to restrict the model's choices.
        top_p=top_p,  # Increase to allow more diversity, decrease to make the model more conservative.
        num_beams=5,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True
    )

    recipe = tokenizer.decode(output[0], skip_special_tokens=False)

    return recipe

In [None]:
def custom_preprocessing(text):
    """Return ``text`` converted to lowercase.

    No punctuation is removed; further preprocessing steps can be added here.
    """
    lowered = text.lower()
    return lowered

In [None]:
def print_highlighted(generated_recipe, ingredients):
    """Return ``generated_recipe`` with every ingredient wrapped in red ANSI codes.

    Despite its name the function prints nothing; it returns the string.

    Args:
        generated_recipe: Text to annotate.
        ingredients: Comma-separated ingredient names; they are stripped and
            lowercased before matching. Empty entries (e.g. from a trailing
            comma) are ignored.

    Returns:
        The text with each occurrence wrapped in ``\\033[91m`` ... ``\\033[0m``.
    """
    names = {ing.strip().lower() for ing in ingredients.split(',')}
    # An empty name would match between every character of the text.
    names.discard('')
    if not names:
        return generated_recipe

    # One regex pass, longest names first: earlier insertions are never
    # re-scanned (so escape codes can't be corrupted) and a longer ingredient
    # wins over a shorter one it contains.
    ordered = sorted(names, key=lambda name: (-len(name), name))
    pattern = '|'.join(re.escape(name) for name in ordered)
    return re.sub(pattern, lambda match: f'\033[91m{match.group(0)}\033[0m', generated_recipe)

In [None]:
# Example usage: generate a recipe for a fixed ingredient list.
ingredients = "flour, sugar, cinnamon, carrot, apple, walnut"
#ingredients = custom_preprocessing(ingredients)

generated_recipe = generate_recipe(ingredients, model, tokenizer)

    
# Show the recipe with the requested ingredients highlighted, followed by the
# character-length difference between the generated text and the ingredient string.
print(print_highlighted(generated_recipe, ingredients))
print("\n", len(generated_recipe) - len(ingredients))

In [None]:
from rouge import Rouge
import numpy as np

# Shared Rouge scorer instance used by calculate_rouge_score below.
rouge = Rouge()

def calculate_rouge_score(text1, text2):
    """Return the ROUGE-L F1 score between two texts.

    Args:
        text1: Hypothesis text (first argument to ``rouge.get_scores``).
        text2: Reference text (second argument to ``rouge.get_scores``).

    Returns:
        The ROUGE-L F1 value, or ``0.0`` when either text is blank.
    """
    # A blank text shares nothing with the other one; return 0.0 directly
    # rather than handing an empty hypothesis/reference to the scorer.
    if not text1.strip() or not text2.strip():
        return 0.0

    scores = rouge.get_scores(text1, text2)
    rouge_l_f1 = scores[0]['rouge-l']['f']
    return rouge_l_f1

def evaluate_generated_recipe_with_rouge(generated_recipe, real_recipes):
    """Score ``generated_recipe`` against every real recipe with ROUGE-L F1.

    Returns a list of ``(score, real_recipe)`` pairs ordered from the highest
    score to the lowest.
    """
    scored_pairs = [
        (calculate_rouge_score(generated_recipe, candidate), candidate)
        for candidate in real_recipes
    ]
    # Highest ROUGE-L F1 first.
    return sorted(scored_pairs, key=lambda pair: pair[0], reverse=True)


# Rank every recipe of the reduced dataset by ROUGE-L F1 against the generated recipe.
rouge_scores = evaluate_generated_recipe_with_rouge(generated_recipe, reduced_recipe_list)

# Print the five best-matching real recipes with the requested ingredients highlighted.
for score, real_recipe in rouge_scores[:5]:
    print(f"ROUGE-L F1 Score: {score:.4f}")
    print("Real Recipe:", print_highlighted(real_recipe, ingredients))
    print()

In [None]:
def get_gpt2_embedding(text):
    """Return a mean-pooled GPT-2 embedding of ``text``.

    The text is encoded with the notebook's ``tokenizer``, run through
    ``model`` and the last layer's hidden states are averaged over the
    sequence dimension.

    Returns:
        CPU tensor with one pooled vector per input sequence.
    """
    # The ids must live on the same device as the model.
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    with torch.no_grad():
        # The first output of the LM-head model is the logits, not the hidden
        # states, so the hidden states are requested explicitly.
        outputs = model(input_ids, output_hidden_states=True)
    hidden_states = outputs.hidden_states[-1]  # last layer
    pooled_embedding = torch.mean(hidden_states, dim=1)  # Average pooling over the sequence length
    # Returned on the CPU so it can be handed to NumPy-based similarity code.
    return pooled_embedding.cpu()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(text1, text2):
    """Return the cosine similarity between the GPT-2 embeddings of two texts."""
    first_vec = get_gpt2_embedding(text1)
    second_vec = get_gpt2_embedding(text2)
    # cosine_similarity yields a 1x1 matrix here; .item() unwraps the scalar.
    return cosine_similarity(first_vec, second_vec).item()

In [None]:
def evaluate_generated_recipe(generated_recipe, real_recipes_df):
    """Return the highest similarity between the generated recipe and any real one.

    ``real_recipes_df`` is iterated directly; each item is passed to
    ``calculate_similarity`` as the second text.
    """
    scores = [
        calculate_similarity(generated_recipe, candidate)
        for candidate in real_recipes_df
    ]
    # Other statistics (e.g. the mean) could be returned instead.
    return max(scores)


In [None]:
def evaluate_generated_recipe2(generated_recipe, real_recipes_df):
    """Rank all real recipes by similarity to the generated recipe.

    Returns a list of ``(similarity, real_recipe)`` pairs ordered from the
    most similar to the least similar.
    """
    scored_pairs = [
        (calculate_similarity(generated_recipe, candidate), candidate)
        for candidate in real_recipes_df
    ]
    # Most similar first.
    return sorted(scored_pairs, key=lambda pair: pair[0], reverse=True)

In [None]:
def predict_rating(generated_recipe, real_recipes_df):
    """Predict a rating as the similarity-weighted mean of known ratings.

    Args:
        generated_recipe: Text of the generated recipe.
        real_recipes_df: Iterable of records; each record is passed to
            ``calculate_similarity`` and indexed with ``'rating'``.
            NOTE(review): each item is used both as the text to compare and as
            a mapping with a 'rating' key -- confirm the intended record type.

    Returns:
        The weighted average rating, or ``None`` when there are no records or
        the similarities sum to zero (the average is undefined then).
    """
    similarities = []
    ratings = []
    for real_recipe in real_recipes_df:
        similarities.append(calculate_similarity(generated_recipe, real_recipe))
        ratings.append(real_recipe['rating'])

    total_weight = sum(similarities)
    # Guard against an empty input and against a zero denominator.
    if not similarities or total_weight == 0:
        return None
    return sum(sim * rating for sim, rating in zip(similarities, ratings)) / total_weight


In [None]:
# Example usage
#print("Generated recipe:", generated_recipe)

#similarity_score = evaluate_generated_recipe(generated_recipe, reduced_recipe_list)
#print("\nSimilarity score:", similarity_score)

# Rank every recipe of the reduced dataset by GPT-2 embedding similarity to the
# generated recipe (one pair of model forward passes per recipe).
similarities = evaluate_generated_recipe2(generated_recipe, reduced_recipe_list)
# Optionally print the most similar recipe(s) and their similarity scores
#print("\nTop most similar recipes:")
#for i in range(min(1, len(similarities))):
#    print(f"\nRecipe {i + 1}:")
#    print("Similarity score:", similarities[i][0])
#    print("Recipe:", similarities[i][1])

In [None]:
from transformers import BertModel, BertTokenizer

# Load the pre-trained BERT model and its tokenizer; they are used below as a
# second embedding model for comparing recipes.
model_name_bert = 'bert-base-uncased'
tokenizer_bert = BertTokenizer.from_pretrained(model_name_bert)
model_bert = BertModel.from_pretrained(model_name_bert)

In [None]:
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Tokenize text for BERT
def encode_text(text):
    """Encode ``text`` with the BERT tokenizer as PyTorch ids, truncated to 512 tokens."""
    return tokenizer_bert.encode(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )

# Mean-pooled BERT embedding
def get_bert_embedding(input_ids):
    """Run BERT on ``input_ids`` and average its last hidden state over the sequence."""
    with torch.no_grad():
        token_states = model_bert(input_ids).last_hidden_state
        # Average pooling over the sequence dimension.
        return token_states.mean(dim=1)

# Cosine similarity between two embedding tensors
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of the first rows of two embedding tensors."""
    first, second = (emb.cpu().numpy() for emb in (embedding1, embedding2))
    # The result is a similarity matrix; take its top-left entry.
    return cosine_similarity(first, second)[0][0]

# Example usage: compare the generated recipe with the top-ranked real recipe
# from `similarities` using BERT embeddings.
print("\n", print_highlighted(generated_recipe, ingredients))
real_recipe = similarities[0][1]
print("\nmost sililar:\n", print_highlighted(real_recipe, ingredients))

# Tokenize and encode texts
input_generated = encode_text(generated_recipe)
input_real = encode_text(real_recipe)

# Get embeddings for generated and real recipes
embedding_generated = get_bert_embedding(input_generated)
embedding_real = get_bert_embedding(input_real)

# Calculate cosine similarity
similarity_score = calculate_cosine_similarity(embedding_generated, embedding_real)
print("Cosine Similarity Score:", similarity_score)

In [None]:
from rouge import Rouge

# Initialize Rouge object (re-created here; an identical scorer is already
# built in the earlier ROUGE cell)
rouge = Rouge()

# ROUGE-L F1 between the generated recipe and the selected real recipe
scores = rouge.get_scores(generated_recipe, real_recipe)
rouge_score = scores[0]['rouge-l']['f']
print(f"ROUGE Score between generated recipe and real recipe: {rouge_score}")

In [None]:
# Function to get GPT-2 embeddings
def get_embeddings(text):
    """Return a mean-pooled GPT-2 embedding of ``text`` on the model's device.

    The text is encoded with the notebook's ``tokenizer`` and run through
    ``model``; the last layer's hidden states are averaged over the sequence.
    """
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
    with torch.no_grad():
        # The LM-head model's output has no `last_hidden_state` attribute, so
        # request the per-layer hidden states and use the final layer.
        outputs = model(input_ids, output_hidden_states=True)
        pooled_embedding = torch.mean(outputs.hidden_states[-1], dim=1)  # Average pooling over sequence length
    return pooled_embedding

# Get GPT-2 embeddings for the generated and the real recipe
embedding_generated = get_embeddings(generated_recipe)
embedding_real = get_embeddings(real_recipe)

# Euclidean (L2) distance between the two pooled embeddings
embedding_distance = torch.norm(embedding_generated - embedding_real).item()
print(f"Euclidean Embedding Distance between generated recipe and real recipe: {embedding_distance}")

In [None]:
#real_recipe = "Start with a ball of pizza dough, roll it out into a thin crust. Spread a layer of tomato sauce evenly over the dough. Add a generous amount of shredded mozzarella cheese on top. Optionally, sprinkle with dried oregano and a pinch of salt. Preheat your oven to a high temperature, around 450°F (230°C). Place the pizza on a baking sheet or pizza stone and bake for 10-15 minutes, or until the crust is golden brown and the cheese is bubbly. Remove from the oven, let it cool slightly, then slice and enjoy your delicious homemade pizza!"

In [None]:
# Tokenize both recipes for GPT-2 and move the ids to the model's device
recipe1_input_ids = tokenizer(generated_recipe, return_tensors='pt').input_ids.to(device)
recipe2_input_ids = tokenizer(real_recipe, return_tensors='pt').input_ids.to(device)

# Run both recipes through the model without tracking gradients
with torch.no_grad():
    outputs1 = model(recipe1_input_ids)
    outputs2 = model(recipe2_input_ids)

# Use the model's output logits as the representation of each recipe
logits1 = outputs1.logits  # Adjust according to what you need (logits, hidden_states, etc.)
logits2 = outputs2.logits

# Average the logits over dim 1 (as an example pooling strategy)
pooled_logits1 = torch.mean(logits1, dim=1)  # Adjust pooling strategy as needed
pooled_logits2 = torch.mean(logits2, dim=1)

# Cosine similarity of the pooled logits, computed on the CPU
similarity_score = cosine_similarity(pooled_logits1.cpu(), pooled_logits2.cpu()).item()

print(f"Cosine Similarity Score between recipe1 and recipe2: {similarity_score}")