In [46]:
import pandas as pd
import numpy as np
import math
import random
import csv

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config
from transformers import get_linear_schedule_with_warmup

import nltk
nltk.download('punkt')

from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matteorigat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# some parameters
epochs = 1
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
model_name = "gpt2"


# this produces sample output every 100 steps
sample_every = 1000
# save the model every 5000 step
save_every = 5000
# save the model to this file name
save_model = "trial_2"

In [48]:
# load and also preprocess the raw data
def load_preprocess_raw_data(raw_data):
    recipe_instances = []

    with open(raw_data, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Extract relevant fields from CSV row
            #name = row['name'].lower().replace('"', '')  # Remove any extra quotes
            ingredients = row['ingredients'].lower().replace('\'', '').replace('[', '').replace(']', '')
            instructions = row['steps'].lower().replace('\'', '').replace('[', '').replace(']', '')
            
            # Prepare recipe instance string
            recipe_instance = '[BOS]'+ingredients+'[STEPS]'+instructions+'[EOS]' #+name+'[INGREDIENTS]'
            
            # Limit length to 2000 characters as per your function
            if len(recipe_instance) <= 3000:
                recipe_instances.append(recipe_instance)
    
    return recipe_instances

In [49]:
# create text list for dataset
# https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions/data
recipe_list = load_preprocess_raw_data("dataset/RAW_recipes.csv")

reduced_recipe_list = random.sample(recipe_list, int(0.01 * len(recipe_list)))
print(reduced_recipe_list[:1])

train_list, test_list = np.split(reduced_recipe_list, [int(.8*len(reduced_recipe_list))])
print('\nNumber of train data: ', len(train_list))
print('Number of test data: ', len(test_list))

['[BOS]sausages, tomato ketchup, clear honey, mild chili powder, garlic cloves, dried oregano[STEPS]heat the oven to 220c / gas mark 7, arrange the sausages in a roasting tin in a single layer and bake for 10 minutes, mix together the remaining ingredients with 1 tbsp water, pour the ketchup mixture over the sausages and mix well, bake for 30 minutes , turning and basting occasionally , until the sausages are golden[EOS]']

Number of train data:  1849
Number of test data:  463


In [50]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(model_name, bos_token='[BOS]', eos_token='[EOS]', pad_token='[PAD]')
# add special tokens for title, ingredients and instruction seperator
special_tokens_dict = {'additional_special_tokens': ['[STEPS]']} #'[INGREDIENTS]', 
# check the number of special tokens
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

We have added 1 tokens


In [51]:
class GPT2Dataset(Dataset):

  def __init__(self, txt_list, tokenizer, max_length=768):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:
        txt = self.custom_preprocessing(txt)
        
        encodings_dict = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")
    
        self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
        self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
          
  def custom_preprocessing(self, text):
    # Example preprocessing: Lowercase the text and remove punctuation
    text = text.lower()
    # Add more preprocessing steps as needed
    return text
    
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx] 

In [52]:
dataset = GPT2Dataset(train_list, tokenizer, max_length=200)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

1,479 training samples
  370 validation samples


In [53]:
batch_size = 2

In [54]:
# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [55]:
# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained(model_name, output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained(model_name, config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
else:
    device = torch.device("cpu")
    

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if device == "cuda":
    torch.cuda.manual_seed_all(seed_val)



In [56]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

In [57]:
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
print('Total number of steps: ', total_steps)
# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

Total number of steps:  740


In [58]:
training_stats = []
print("Currently using device type: ", device)

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    losses = []

    total_train_loss = 0

    model.train()
    
    loop = tqdm(train_dataloader, leave=True)
    for step, batch in enumerate(loop):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask =b_masks,
                          token_type_ids=None
                        )

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss
        losses.append(batch_loss)

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:
            print('Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.'.format(step, len(train_dataloader), batch_loss))

        loss.backward()

        optimizer.step()

        scheduler.step()

        if step % save_every == 0:
            model.save_pretrained(save_model)
            
        loop.set_postfix(loss=batch_loss)

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       
    
    # Calculate perplexity.
    losses = torch.tensor(losses)
    train_perplexity = math.exp(torch.mean(losses))

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Perplexity: {0:.2f}".format(train_perplexity))        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    model.eval()

    losses = []
    total_eval_loss = 0
    nb_eval_steps = 0


    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
#                            token_type_ids=None, 
                             attention_mask = b_masks,
                            labels=b_labels)
          
            loss = outputs[0]  
            
        batch_loss = loss.item()
        losses.append(batch_loss)
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Calculate perplexity.
    losses = torch.tensor(losses)
    val_perplexity = math.exp(torch.mean(losses))

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation perplexity: {0:.2f}".format(val_perplexity))        

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Perplexity': train_perplexity,
            'Valid. Perplexity': val_perplexity,
        }
    )

print("")
print("Training complete!")

Currently using device type:  cpu

Training...


100%|██████████| 740/740 [13:01<00:00,  1.06s/it, loss=1.29] 



  Average training loss: 2.50
  Perplexity: 12.14

Running Validation...
  Validation Loss: 1.63
  Validation perplexity: 5.12

Training complete!


In [59]:
model.save_pretrained(save_model)

In [60]:
# prepare datasets for dev_list and test_list
test_dataset = GPT2Dataset(test_list, tokenizer, max_length=768)

In [61]:
# load the datasets
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [62]:
def evaluate_model(model, dataloaded):
    model = model.to(device)
    model.eval()

    losses = []
    perplexity = []
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in dataloaded:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():        

            outputs  = model(b_input_ids, 
    #                            token_type_ids=None, 
                            attention_mask = b_masks,
                            labels=b_labels)

            loss = outputs[0]  

        batch_loss = loss.item()
        losses.append(batch_loss)
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(dataloaded)

    # Calculate perplexity.
    losses = torch.tensor(losses)
    val_perplexity = math.exp(torch.mean(losses))
    perplexity.append(val_perplexity)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation perplexity: {0:.2f}".format(val_perplexity))
    return avg_val_loss, val_perplexity

In [63]:
print('Testing...')
test_loss, test_perplexity = evaluate_model(model, test_dataloader)
test_eval_df = pd.DataFrame(columns = ["test_loss", "test_perplexity"])
test_eval_df['test_loss'] = test_loss
test_eval_df['test_perplexity'] = test_perplexity
test_eval_df.to_csv("test_eval.csv")

Testing...
  Validation Loss: 0.50
  Validation perplexity: 1.64


In [64]:
# Load the trained GPT-2 model and tokenizer
#model = GPT2LMHeadModel.from_pretrained(save_file)
#tokenizer = GPT2Tokenizer.from_pretrained(save_file

# Ensure the model is on the right device
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)

In [65]:
# Function to generate a recipe from a list of ingredients
"""def generate_recipe(ingredients, model, tokenizer, max_length=400):
    # Prepare the input prompt with the list of ingredients
    input_text = ingredients
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    
    # Generate the recipe
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    
    # Decode the output to get the recipe text
    recipe = tokenizer.decode(output[0], skip_special_tokens=True)
    return recipe"""

"def generate_recipe(ingredients, model, tokenizer, max_length=400):\n    # Prepare the input prompt with the list of ingredients\n    input_text = ingredients\n    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)\n    \n    # Generate the recipe\n    output = model.generate(\n        input_ids,\n        max_length=max_length,\n        num_beams=5,\n        no_repeat_ngram_size=2,\n        num_return_sequences=1,\n        pad_token_id=tokenizer.pad_token_id,\n        eos_token_id=tokenizer.eos_token_id\n    )\n    \n    # Decode the output to get the recipe text\n    recipe = tokenizer.decode(output[0], skip_special_tokens=True)\n    return recipe"

In [66]:
def generate_recipe(ingredients, model, tokenizer, max_length=400, temperature=0.1, top_k=100, top_p=0.2):
    input_text = ingredients
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    
    output = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature, # Lower values make the model more confident (less random), while higher values increase randomness.
        top_k=top_k,  #Increase to consider more tokens, decrease to restrict the model’s choices.
        top_p=top_p,  # Increase to allow more diversity, decrease to make the model more conservative.
        num_beams=20,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True
    )
    
    recipe = tokenizer.decode(output[0], skip_special_tokens=False)
    return recipe

In [67]:
def custom_preprocessing(self, text):
    # Example preprocessing: Lowercase the text and remove punctuation
    text = text.lower()
    text = text.replace(",", "").replace(".", "").replace("!", "").replace("?", "").replace("(", "").replace(")", "").replace(":", "").replace(";", "").replace("'", "").replace('"', "")
    # Add more preprocessing steps as needed
    return text

In [68]:
# Example usage
ingredients = "flour, sugar, cinnamon, carrot, apple, walnuts"

ingredients = '[BOS]'+ ingredients + '[STEPS]'
ingredients = custom_preprocessing(ingredients)
generated_recipe = generate_recipe(ingredients, model, tokenizer)

print(generated_recipe)
print("\n", len(generated_recipe) - len(ingredients))

TypeError: custom_preprocessing() missing 1 required positional argument: 'text'

In [None]:
def get_gpt2_embedding(text):
    input_ids = tokenizer(text, return_tensors='pt').input_ids
    with torch.no_grad():
        outputs = model(input_ids)
    # Average the embeddings over the sequence length
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(text1, text2):
    embedding1 = get_gpt2_embedding(text1)
    embedding2 = get_gpt2_embedding(text2)
    similarity = cosine_similarity(embedding1, embedding2).item()
    return similarity

In [None]:
def evaluate_generated_recipe(generated_recipe, real_recipes_df):
    similarities = []
    for _, real_recipe in real_recipes_df.iterrows():
        real_recipe_text = real_recipe['name'] + ' ' + real_recipe['ingredients'] + ' ' + real_recipe['steps']
        similarity = calculate_similarity(generated_recipe, real_recipe_text)
        similarities.append(similarity)

    return max(similarities)  # or return other statistics like mean similarity


In [None]:
def predict_rating(generated_recipe, real_recipes_df):
    similarities = []
    ratings = []
    for _, real_recipe in real_recipes_df.iterrows():
        real_recipe_text = real_recipe['name'] + ' ' + real_recipe['ingredients'] + ' ' + real_recipe['steps']
        similarity = calculate_similarity(generated_recipe, real_recipe_text)
        similarities.append(similarity)
        ratings.append(real_recipe['rating'])

    if similarities:
        predicted_rating = sum([sim * rating for sim, rating in zip(similarities, ratings)]) / sum(similarities)
        return predicted_rating
    return None


In [None]:
# Load your real recipes dataset
real_recipes_df = pd.read_csv("dataset/RAW_recipes.csv")

# Example usage
generated_recipe = "[INGREDIENTS] flour, sugar, cinnamon, carrot, apple, walnuts [STEPS] Mix all ingredients together. Bake at 350°F for 30 minutes."

similarity_score = evaluate_generated_recipe(generated_recipe, reduced_recipe_list)
print("Similarity score:", similarity_score)