In [1]:
import os
from pyprojroot import here
os.environ['TRANSFORMERS_CACHE'] = str(here("cache/transformers"))

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [2]:
# test one example
model.eval()
# Add padding token if not already present
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
model.resize_token_embeddings(len(tokenizer))

# Generate text
prompt = "The future of artificial intelligence is"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate with different parameters
with torch.no_grad():
  outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    temperature=0.7
  )

# Decode and display
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


The future of artificial intelligence is still in its infancy. But the future is bright, and we are already well on our way.


In [3]:
# dataset from https://recipenlg.cs.put.poznan.pl/
import pandas as pd
import duckdb as sql
data = sql.query(f"""
  select title, directions
  from read_csv(
    '{here('data/recipeNLG/recipeNLG.csv')}', 
    header=True,
    delim=',',
    types = {{
      'title': 'VARCHAR',
      'ingredients': 'VARCHAR[]',
      "directions": 'VARCHAR[]'
    }}
  )
""")
data

┌────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                 title                  │                          

In [4]:
# transform directions list to string using sql
datad = sql.query(f"""
  select title, array_to_string(directions, '\\n') as directions
  from data
""")
datad

┌────────────────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                 title                  │                                          

In [5]:
# Prepare the dataset for fine-tuning
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch
from tqdm import tqdm

# Convert data to pandas for easier manipulation
df = datad.df()

# Create formatted training text
def format_recipe(title, directions):
    return f"Recipe: {title}\n\nInstructions:\n{directions}\n<|endoftext|>"

# Format the data
formatted_texts = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Formatting recipes"):
    formatted_text = format_recipe(row['title'], row['directions'])
    formatted_texts.append(formatted_text)

print(f"Prepared {len(formatted_texts)} recipes for training")
print("\nExample formatted recipe:")
print(formatted_texts[0][:300] + "...")

Formatting recipes: 100%|██████████| 2231142/2231142 [01:12<00:00, 30913.30it/s]

Prepared 2231142 recipes for training

Example formatted recipe:
Recipe: No-Bake Nut Cookies

Instructions:
In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.\nStir over medium heat until mixture bubbles all over top.\nBoil and stir 5 minutes more. Take off heat.\nStir in vanilla and cereal; mix well.\nUsing 2 teaspoons, ...





In [6]:
# Custom Dataset class
class RecipeDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# Create dataset (using a subset for faster training)
subset_size = min(1000, len(formatted_texts))  # Use first 1000 recipes or all if less
train_texts = formatted_texts[:subset_size]

train_dataset = RecipeDataset(train_texts, tokenizer, max_length=256)
print(f"Created dataset with {len(train_dataset)} samples")

Created dataset with 1000 samples


In [7]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir=str(here('models/gpt2-recipe-finetuned')),
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=100,
    logging_steps=50,
    save_steps=500,
    eval_strategy="no",  # No validation set for simplicity
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir=str(here('logs')),
    report_to=None,  # Disable wandb logging
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # GPT-2 is not a masked language model
)

print("Training arguments configured")

Training arguments configured


In [8]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Trainer initialized")
print(f"Number of trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

  trainer = Trainer(


Trainer initialized
Number of trainable parameters: 124,440,576


In [9]:
# Start fine-tuning
print("Starting fine-tuning...")
trainer.train()

# Save the fine-tuned model
model_save_path = here('models/gpt2-recipe-finetuned')
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Fine-tuned model saved to: {model_save_path}")

Starting fine-tuning...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,3.1022
100,2.3546
150,2.1822
200,2.0654
250,2.1072
300,1.9972
350,1.9935


Fine-tuned model saved to: c:\Users\Plancha\Desktop\AML-homework\models\gpt2-recipe-finetuned


In [12]:
# Test the fine-tuned model
def generate_recipe(prompt, max_length=200, temperature=0.8, do_sample=True):
    model.eval()
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            temperature=temperature,
            do_sample=do_sample,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Test with different prompts
test_prompts = [
    "Recipe: Chocolate Chip Cookies",
    "Recipe: Beef Stew",
    "Recipe: Vegetable Soup"
]

print("Testing fine-tuned model:")
print("=" * 50)

for prompt in test_prompts:
    print(f"\n{prompt}")
    print("-" * 30)
    generated = generate_recipe(prompt)
    print(generated)
    print("=" * 50)

Testing fine-tuned model:

Recipe: Chocolate Chip Cookies
------------------------------
Recipe: Chocolate Chip Cookies

Instructions:
Spread peanut butter in a greased cookie sheet.\nChill until set, about 5 minutes.\nStir in sugar; beat 2 minutes.\nPour chocolate chips on top.\nMix remaining ingredients into a greased 9 x 13-inch loaf pan.\nPour into greased 9 x 13-inch pan. Bake in 350u00b0 oven for 45 minutes.


Recipe: Beef Stew
------------------------------
Recipe: Chocolate Chip Cookies

Instructions:
Spread peanut butter in a greased cookie sheet.\nChill until set, about 5 minutes.\nStir in sugar; beat 2 minutes.\nPour chocolate chips on top.\nMix remaining ingredients into a greased 9 x 13-inch loaf pan.\nPour into greased 9 x 13-inch pan. Bake in 350u00b0 oven for 45 minutes.


Recipe: Beef Stew
------------------------------
Recipe: Beef Stew

Instructions:
Combine all ingredients before making stew.\nMix well.\nAdd chicken in broth.\nAdd onion and carrots; stir well.\nAdd 

In [13]:
# Compare with original model (optional)
print("Comparison with original model:")
print("=" * 50)

# Load original model for comparison
original_model = AutoModelForCausalLM.from_pretrained(model_name)
original_model.resize_token_embeddings(len(tokenizer))

def generate_with_original(prompt, max_length=200, temperature=0.8):
    original_model.eval()
    inputs = tokenizer(prompt, return_tensors="pt")
    
    with torch.no_grad():
        outputs = original_model.generate(
            inputs.input_ids,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_return_sequences=1
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Compare on one example
test_prompt = "Recipe: Chocolate Chip Cookies"
print(f"Prompt: {test_prompt}\n")

print("FINE-TUNED MODEL:")
print(generate_recipe(test_prompt))
print("\nORIGINAL MODEL:")
print(generate_with_original(test_prompt))

Comparison with original model:
Prompt: Recipe: Chocolate Chip Cookies

FINE-TUNED MODEL:
Prompt: Recipe: Chocolate Chip Cookies

FINE-TUNED MODEL:
Recipe: Chocolate Chip Cookies

Instructions:
Cut the butter in half.\nMix the chocolate chips and graham crackers together.\nAdd the soda, water, syrup and flour.\nMix well.\nAdd the chopped marshmallows.


ORIGINAL MODEL:
Recipe: Chocolate Chip Cookies

Instructions:
Cut the butter in half.\nMix the chocolate chips and graham crackers together.\nAdd the soda, water, syrup and flour.\nMix well.\nAdd the chopped marshmallows.


ORIGINAL MODEL:
Recipe: Chocolate Chip Cookies Recipe Print Ingredients 2 1/2 cups flour

2 cups sugar

1/2 cup granulated sugar

1/4 teaspoon baking powder

1/2 tsp baking soda Instructions Preheat the oven to 325 degrees. (I used a 350 degree oven) In a large mixing bowl, blend the flour, sugar, sugar, baking powder and baking soda until crumbly. Add the flour mixture to the bowl and mix with a spoon until smooth. 