In [121]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tqdm import tqdm

In [122]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05

In [123]:
df = pd.read_csv("dataset/RAW_recipes.csv")

# Sample a fraction of the dataset
sample_size = int(len(df) * 0.001)
new_df = df.sample(n=sample_size).reset_index(drop=True)

# Preprocess ingredients and steps
separator = ", "
new_df['ingredients'] = new_df['ingredients'].apply(lambda x: separator.join(eval(x)))
new_df['steps'] = new_df['steps'].apply(lambda x: separator.join(eval(x)))

print(new_df.shape)
#print(new_df.head())

print("\nName:", new_df.iloc[0]['name'])
print("\nIngredients:", new_df.iloc[0]['ingredients'])
print("\nSteps:", new_df.iloc[0]['steps'])

(231, 6)
Name: lamb stew

Ingredients: lamb, sugar, oil, salt, pepper, flour, water, red wine, garlic powder, worcestershire sauce, carrots, onions, celery ribs, potatoes

Steps: sprinkle lamb with sugar, brown in oil in skillet, remove lamb and place in slow cooker , reserving drippings, stir in salt , pepper , and flour into drippings until smooth, stir in water and wine , until smooth , stirring until broth simmers and thickens, pour into cooker, add remaining ingredients and stir until well mixed, cover, cook on low 8-10 hours, serve with crusty bread


In [124]:
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200).reset_index(drop=True)
test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)

print("\nFULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))


FULL Dataset: (231, 6)
TRAIN Dataset: (185, 6)
TEST Dataset: (46, 6)


In [125]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.name = dataframe.name
        self.ingredients = dataframe.ingredients
        self.steps = dataframe.steps
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        #ingredients = str(self.ingredients[index])
        #steps = str(self.steps[index])
        
        recipe = '[BOS]'+self.ingredients[index]+'[STEPS]'+self.steps[index]+'[EOS]' 
        #+self.name[index]+'[INGREDIENTS]'+
        
        # Tokenize ingredients
        inputs = self.tokenizer.encode_plus(
            recipe,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        }

In [126]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", bos_token='[BOS]', eos_token='[EOS]', pad_token='[PAD]')

# add special tokens for title, ingredients and instruction seperator
special_tokens_dict = {'additional_special_tokens': ['[STEPS]']}  #'[INGREDIENTS]', 

# check the number of special tokens
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

We have added 1 tokens


In [127]:
# Create instances of CustomDataset
train_dataset = CustomDataset(train_dataset, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_dataset, tokenizer, MAX_LEN)

In [128]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0}

test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 0}

training_loader = DataLoader(train_dataset, **train_params)
testing_loader = DataLoader(test_dataset, **test_params)

In [129]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#if torch.backends.mps.is_available():
#    device = torch.device("mps")
#    x = torch.ones(1, device=device)
#    print(x)
#else:
#    print("MPS device not found.")
print(f"Using {device} device")

Using cpu device


In [130]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

In [131]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [132]:
def train_loop(dataloader, model, optimizer):
    
    # set the model to training model
    model.train()
    
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=input_ids)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        
        loop.set_postfix(loss=loss.item())
        

def test_loop(dataloader, model):
    # set the model of evaluation
    model.eval()
    val_loss = 0
    
    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    with torch.no_grad():
        for batch in dataloader:
            # previous tokens
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            
           # get outputs from model
            outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=input_ids)

            # calculate loss
            val_loss += outputs.loss.item()
    
    # Print the validation loss for this epoch
    print(f"Validation Loss: {val_loss / len(dataloader)}")

In [133]:
# Training and validation
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loop(training_loader, model, optimizer)
    test_loop(testing_loader, model)

print("Training completed!")

Epoch 1/3


100%|██████████| 24/24 [05:03<00:00, 12.65s/it, loss=2.83]


Validation Loss: 3.5405129194259644
Epoch 2/3


100%|██████████| 24/24 [04:55<00:00, 12.33s/it, loss=2.2] 


Validation Loss: 2.794088453054428
Epoch 3/3


100%|██████████| 24/24 [04:57<00:00, 12.42s/it, loss=2.62]


Validation Loss: 2.576580067475637
Training completed!


In [134]:
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.json',
 'fine_tuned_model/merges.txt',
 'fine_tuned_model/added_tokens.json')

In [135]:
# Load the fine-tuned GPT-2 model and tokenizer
#model_name = "fine_tuned_model"
#model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
#tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [139]:
def generate_recipe(ingredients, model, tokenizer, max_length=400, temperature=0.1, top_k=100, top_p=0.2):
    input_text = ingredients
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    
    output = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature, # Lower values make the model more confident (less random), while higher values increase randomness.
        top_k=top_k,  #Increase to consider more tokens, decrease to restrict the model’s choices.
        top_p=top_p,  # Increase to allow more diversity, decrease to make the model more conservative.
        num_beams=20,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True
    )
    
    recipe = tokenizer.decode(output[0], skip_special_tokens=False)
    return recipe

In [140]:
# Define the question
ingredients = "flour, sugar, cinnamon, carrot, apple, walnuts"

ingredients = '[INGREDIENTS]'+ ingredients + '[STEPS]'
generated_recipe = generate_recipe(ingredients, model, tokenizer)

print(generated_recipe)
print("\n", len(generated_recipe) - len(ingredients))

[INGREDIENTS]flour, sugar, cinnamon, carrot, apple, walnuts [STEPS] [PAD][PAD][EOS]

 17
