In [12]:
import pandas as pd
import ast
from pathlib import Path


DATA_PATH = Path("/kaggle/input/3a2mext/3A2M_EXTENDED.csv")
MAX_SAMPLES = 100_000


print("Loading dataset...")
df = pd.read_csv(DATA_PATH)
if MAX_SAMPLES:
    df = df.head(MAX_SAMPLES)
print(f"Loaded {len(df)} samples.")


def safe_parse_list(val):
    if isinstance(val, str) and val.strip().startswith("["):
        try:
            parsed = ast.literal_eval(val)
            if isinstance(parsed, list):
                return parsed
        except (ValueError, SyntaxError):
            return []
    return []

print("Parsing NER and directions columns...")
df['NER'] = df['NER'].apply(safe_parse_list)
df['directions'] = df['directions'].apply(safe_parse_list)

def format_example(title, ingredients, directions):
    ingredients_str = ", ".join(ingredients)
    directions_str = " ".join(directions)
    return f"Title: {title}\nIngredients: {ingredients_str}\nDirections: {directions_str}"

df['formatted_text'] = df.apply(
    lambda row: format_example(row['title'], row['NER'], row['directions']), axis=1
)


df = df[df['formatted_text'].str.len() > 50].reset_index(drop=True)


print("\nSample formatted entry:\n")
print(df['formatted_text'].iloc[0][:1000])  # show only the first 1000 chars
print("\nTotal valid samples:", len(df))


Loading dataset...
Loaded 100000 samples.
Parsing NER and directions columns...

Sample formatted entry:

Title: 	 Arugula Pomegranate Salad
Ingredients: baby spinach, baby arugula, pomegranate arils, persimmon, alfalfa sprouts
Directions: Toss together spinach and arugula, then place in your serving bowl. Remove the stem and leaves of the persimmon, then slice into thin wedges. Arrange the persimmon on top of the spinach and arugula. Garnish with pomegranate arils and alfalfa sprouts.

Total valid samples: 99997


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import ast 
import os
import time


DATASET_PATH = "/kaggle/input/3a2mext/3A2M_EXTENDED.csv"


OUTPUT_DIR = "/kaggle/working/"
OUTPUT_FILE_TRAIN = os.path.join(OUTPUT_DIR, "train_recipes.txt")
OUTPUT_FILE_VAL = os.path.join(OUTPUT_DIR, "val_recipes.txt")


BOS_TOKEN = "<|startofrecipe|>"
GENRE_TOKEN = "<|genre|>"
TITLE_TOKEN = "<|title|>"
INGREDIENTS_TOKEN = "<|ingredients|>"
STEPS_TOKEN = "<|steps|>"
EOS_TOKEN = "<|endofrecipe|>"


def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def safe_eval_list(raw_list_str):

    if not isinstance(raw_list_str, str) or not raw_list_str.startswith('['):
        return []
    try:
        return ast.literal_eval(raw_list_str)
    except (ValueError, SyntaxError):
        return []

def format_recipe(row):
    try:
        genre = clean_text(row['genre'])
        title = clean_text(row['title'])
        ner_list = safe_eval_list(row['NER'])
        ext_ner_list = safe_eval_list(row['Extended_NER'])
        combined_entities = sorted(list(set(ner_list + ext_ner_list)), key=str.lower)
        ingredients_str = ', '.join(combined_entities)
        directions_list = safe_eval_list(row['directions'])
        steps = ' '.join(f"{i+1}. {clean_text(step)}" for i, step in enumerate(directions_list))
        if not genre or not title or not ingredients_str or not steps:
            return None
            
        formatted_string = (
            f"{BOS_TOKEN}"
            f"{GENRE_TOKEN}{genre}"
            f"{TITLE_TOKEN}{title}"
            f"{INGREDIENTS_TOKEN}{ingredients_str}"
            f"{STEPS_TOKEN}{steps}"
            f"{EOS_TOKEN}\n" # Add newline to separate recipes
        )
        return formatted_string
        
    except Exception as e:
        return None

def run_data_prep():
    print(f"Loading dataset from {DATASET_PATH}...")
    start_time = time.time()
    try:
        columns_to_load = ['title', 'NER', 'Extended_NER', 'genre', 'directions']
        df = pd.read_csv(DATASET_PATH, usecols=columns_to_load)
        
    except FileNotFoundError:
        print(f"Error: Dataset file not found at {DATAFSET_PATH}")
        print("Please ensure your that the '3a2mext' dataset added as input.")
        return
    except ValueError as e:
        print(f"Error loading CSV.{e}")
        return

    df = df.dropna()
    load_time = time.time()
    print(f"Loaded {len(df)} non-null recipes in {load_time - start_time:.2f} seconds.")

    formatted_recipes = df.apply(format_recipe, axis=1).tolist()
    formatted_recipes = [r for r in formatted_recipes if r is not None]
    
    format_time = time.time()
    print(f"Successfully formatted {len(formatted_recipes)} recipes in {format_time - load_time:.2f} seconds.")

    if not formatted_recipes:
        print("No recipes were formatted. Please check dataset columns and helper functions.")
        return

    
    train_data, val_data = train_test_split(formatted_recipes, test_size=0.1, random_state=30)

    with open(OUTPUT_FILE_TRAIN, "w", encoding="utf-8") as f:
        f.writelines(train_data)
    
    with open(OUTPUT_FILE_VAL, "w", encoding="utf-8") as f:
        f.writelines(val_data)

    write_time = time.time()
    print(f"Successfully saved {len(train_data)} training recipes to {OUTPUT_FILE_TRAIN}")
    print(f"Successfully saved {len(val_data)} validation recipes to {OUTPUT_FILE_VAL}")
    print(f"Files written in {write_time - format_time:.2f} seconds.")
    print("\n Example of new formatted recipe ")
    print(train_data[0][:500] + "...")
    
    total_time = time.time() - start_time
    print(f"\nData Formatting is complete. Total time: {total_time:.2f} seconds.")


run_data_prep()

Loading dataset from /kaggle/input/3a2mext/3A2M_EXTENDED.csv...
Loaded 2231142 non-null recipes in 22.34 seconds.
Successfully formatted 1964305 recipes in 284.74 seconds.
Successfully saved 1767874 training recipes to /kaggle/working/train_recipes.txt
Successfully saved 196431 validation recipes to /kaggle/working/val_recipes.txt
Files written in 5.32 seconds.

 Example of new formatted recipe 
<|startofrecipe|><|genre|>bakery<|title|>Christmas Fudge<|ingredients|>1 minute, 4 to 5 minutes, 8-inch, Bring, butter, Combine, Cook, Cool, marshmallows, milk, salt, semi-sweet chocolate chips, semisweet chocolate chips, Stir, sugar, vanilla<|steps|>1. Cream sugar, syrup and salt; cook in large pan. 2. Cook until sugar dissolves (boil 1 minute covered). 3. Uncover; cook to soft ball stage. 4. Add vanilla; beat on high speed mixer until creamy. Add all other ingredients. 5. Pour into square pan....

Data Formatting is complete. Total time: 312.39 seconds.


In [None]:

!pip install -q "transformers==4.38.2" "datasets==2.18.0" "accelerate==0.27.2" "huggingface_hub==0.20.3" "peft==0.9.0"

print("Install complete.")

import os
os._exit(0)

In [1]:
import os
import torch
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel, 
    Trainer, 
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
print("Libraries imported successfully.")


INPUT_DIR = "/kaggle/working/"
TRAIN_FILE = os.path.join(INPUT_DIR, "train_recipes.txt")
VAL_FILE = os.path.join(INPUT_DIR, "val_recipes.txt")
OUTPUT_DIR = os.path.join(INPUT_DIR, "recipe-gpt2-model")
MODEL_NAME = "openai-community/gpt2" # Use the full, correct name


BOS_TOKEN = "<|startofrecipe|>"
GENRE_TOKEN = "<|genre|>"
TITLE_TOKEN = "<|title|>"
INGREDIENTS_TOKEN = "<|ingredients|>"
STEPS_TOKEN = "<|steps|>"
EOS_TOKEN = "<|endofrecipe|>"
PAD_TOKEN = "<|pad|>"

SPECIAL_TOKENS = {
    "bos_token": BOS_TOKEN,
    "eos_token": EOS_TOKEN,
    "pad_token": PAD_TOKEN,
    "additional_special_tokens": [GENRE_TOKEN, TITLE_TOKEN, INGREDIENTS_TOKEN, STEPS_TOKEN]
}

print("Loading tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME, use_fast=False)
print("Tokenizer loaded successfully.")

print(f"Original tokenizer vocab size: {len(tokenizer)}")


num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)

tokenizer.pad_token = PAD_TOKEN

print(f"Added {num_added_tokens} new tokens.")
print(f"New tokenizer vocab size: {len(tokenizer)}")
print(f"Tokenizer pad token set to: {tokenizer.pad_token}")

tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Tokenizer saved to {OUTPUT_DIR}")
print("\n--- SETUP COMPLETE ---")
print("All libraries and the tokenizer are now correctly loaded.")

2025-11-02 06:35:29.878941: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762065329.903494    3458 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762065329.910678    3458 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Libraries imported successfully.
Loading tokenizer...
Tokenizer loaded successfully.
Original tokenizer vocab size: 50257
Added 7 new tokens.
New tokenizer vocab size: 50264
Tokenizer pad token set to: <|pad|>
Tokenizer saved to /kaggle/working/recipe-gpt2-model

--- SETUP COMPLETE ---
All libraries and the tokenizer are now correctly loaded.


In [6]:
import os
from datasets import load_dataset, DatasetDict
print("Loading and tokenizing datasets...")

num_proc = os.cpu_count()
print(f"Using {num_proc} processes for tokenization.")

TRAIN_SUBSET_SIZE = 20000
VAL_SUBSET_SIZE = 2000
print(f"Loading full dataset to create a {TRAIN_SUBSET_SIZE}-sample subset...")

full_dataset = load_dataset('text', data_files={'train': TRAIN_FILE, 'validation': VAL_FILE})
train_subset = full_dataset['train'].shuffle(seed=30).select(range(TRAIN_SUBSET_SIZE))
val_subset = full_dataset['validation'].shuffle(seed=30).select(range(VAL_SUBSET_SIZE))

dataset = DatasetDict({
    'train': train_subset,
    'validation': val_subset
})

print(f"Dataset subset created: {TRAIN_SUBSET_SIZE} train, {VAL_SUBSET_SIZE} validation samples.")


block_size = 512
def tokenize_function(examples):

    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=block_size,
    )

tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=num_proc,
    remove_columns=["text"]
)

print("\nTokenization complete.")
print(f"Training dataset features: {tokenized_datasets['train'].features}")

Loading and tokenizing datasets...
Using 4 processes for tokenization.
Loading full dataset to create a 20000-sample subset...
Dataset subset created: 20000 train, 2000 validation samples.


Map (num_proc=4):   0%|          | 0/20000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]


Tokenization complete.
Training dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


In [7]:
print("Initializing model...")


model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Model embedding matrix resized to {len(tokenizer)} to match new tokens.")

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False
)

print("Model and Data Collator initialized successfully.")

Initializing model...
Model embedding matrix resized to 50264 to match new tokens.
Model and Data Collator initialized successfully.


In [8]:
print("Defining training arguments")


training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,       
    fp16=True,
    
  
    evaluation_strategy="steps",
    eval_steps=250,
    logging_steps=100,
    save_steps=500,
    
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="none",
    dataloader_num_workers=2,
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

print("Trainer initialized. Ready for training.")

Defining training arguments
Trainer initialized. Ready for training.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [9]:
print("--- Starting Model Training ---")
trainer.train()
print("--- Training Complete ---")
trainer.save_model()
print(f"Final model and tokenizer saved to {OUTPUT_DIR}")
import math
try:
    eval_results = trainer.evaluate()
    print(f"Final Validation Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
except Exception as e:
    print(f"Could not calculate final perplexity: {e}")

print(f"\nFine-Tuning is complete. Your model is saved in {OUTPUT_DIR}")

--- Starting Model Training ---




Step,Training Loss,Validation Loss
250,2.4598,2.260021
500,2.2912,2.185492


Checkpoint destination directory /kaggle/working/recipe-gpt2-model/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


--- Training Complete ---
Final model and tokenizer saved to /kaggle/working/recipe-gpt2-model




Final Validation Perplexity: 8.90

Fine-Tuning is complete. Your model is saved in /kaggle/working/recipe-gpt2-model


In [10]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

print("Loading fine-tuned model and tokenizer...")


MODEL_DIR = "/kaggle/working/recipe-gpt2-model"


tokenizer = GPT2Tokenizer.from_pretrained(MODEL_DIR)

model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval() 

print(f"Model loaded and moved to {device}.")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading fine-tuned model and tokenizer...
Model loaded and moved to cuda.


In [12]:

BOS_TOKEN = "<|startofrecipe|>"
GENRE_TOKEN = "<|genre|>"
TITLE_TOKEN = "<|title|>"
INGREDIENTS_TOKEN = "<|ingredients|>"
STEPS_TOKEN = "<|steps|>"
EOS_TOKEN = "<|endofrecipe|>"

def generate_recipe(prompt_title, prompt_genre):
    print(f"--- Generating recipe for: {prompt_title} ({prompt_genre}) ---")
    prompt_text = (
        f"{BOS_TOKEN}"
        f"{GENRE_TOKEN}{prompt_genre}"
        f"{TITLE_TOKEN}{prompt_title}"
        f"{INGREDIENTS_TOKEN}" 
    )
    
    input_ids = tokenizer.encode(prompt_text, return_tensors='pt').to(device)

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=512,                  
        temperature=1.0,                 
        top_k=50,                        
        top_p=0.95,                      
        do_sample=True,                  
        num_return_sequences=1,          
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    
    generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=False)
    
    
    clean_text = generated_text.replace(prompt_text, "").replace(EOS_TOKEN, "").strip()
    
    
    clean_text = clean_text.replace(STEPS_TOKEN, f"\n\n{STEPS_TOKEN}\n")
    clean_text = clean_text.replace(INGREDIENTS_TOKEN, f"{INGREDIENTS_TOKEN}\n")
    
    print(clean_text)

In [16]:
# Example 1: A dessert
generate_recipe(prompt_title="Black Bean And Turkey Chili", prompt_genre="sides")

print("\n" + "="*50 + "\n")

# Example 2: A side dish
generate_recipe(prompt_title="Spicy Garlic Potatoes", prompt_genre="sides")

print("\n" + "="*50 + "\n")

# Example 3: A main course
generate_recipe(prompt_title="Simple Chicken Curry", prompt_genre="non-veg")

--- Generating recipe for: Black Bean And Turkey Chili (sides) ---
<|startofrecipe|> <|genre|> sides <|title|> Black Bean And Turkey Chili <|ingredients|>
 bacon, cumin, garlic, ground beef, ground pepper, onion, pepper, water, taco seasoning, zucchini, tomatoes, yellow onion 

<|steps|>
 1. Mix all ingredients together. Cook over low heat, stirring constantly. 2. Sprinkle all over. 3. Serve at room temperature.


--- Generating recipe for: Spicy Garlic Potatoes (sides) ---
<|startofrecipe|> <|genre|> sides <|title|> Spicy Garlic Potatoes <|ingredients|>
 3, 2, 1/4 cup, 30 seconds, about 3/4, another 1/4 cup, at least 30 minutes, Bake, baking powder, garlic powder, ground cinnamon, ground nutmeg, ground ginger, ground sage, ground thyme, light cornmeal, pecans, olive oil, onion, salt, Season, soy sauce, water 

<|steps|>
 1. Heat oil in Dutch oven. 2. Add the garlic. 3. Brown evenly, about 3/4 full. 4. Add cornmeal and turn over to heat. 5. Add the onion and carrot and cook 5 min. 6. C

In [20]:
import evaluate
import re 
import math
try:
    ppl = math.exp(eval_results['eval_loss'])
except NameError:
    ppl = 8.90 # Fallback

print(f"--- Evaluating NEW Model (Perplexity {ppl:.2f}) ---")
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')
GENRE_TOKEN_ESC = re.escape(GENRE_TOKEN)
TITLE_TOKEN_ESC = re.escape(TITLE_TOKEN)
INGREDIENTS_TOKEN_ESC = re.escape(INGREDIENTS_TOKEN)


try:
    with open("/kaggle/working/val_recipes.txt", "r", encoding="utf-8") as f:
        reference_recipe_full = f.readline()
    
    ref_genre = re.search(f'{GENRE_TOKEN_ESC}(.*?){TITLE_TOKEN_ESC}', reference_recipe_full).group(1)
    ref_title = re.search(f'{TITLE_TOKEN_ESC}(.*?){INGREDIENTS_TOKEN_ESC}', reference_recipe_full).group(1)
    
    reference_text = INGREDIENTS_TOKEN + reference_recipe_full.split(INGREDIENTS_TOKEN)[1]
    reference_text = reference_text.replace(EOS_TOKEN, "").strip()

    print(f"--- Evaluating with reference recipe: {ref_title} ({ref_genre}) ---")

   
    prompt_text = (
        f"{BOS_TOKEN}"
        f"{GENRE_TOKEN}{ref_genre}"
        f"{TITLE_TOKEN}{ref_title}"
        f"{INGREDIENTS_TOKEN}"
    )
    input_ids = tokenizer.encode(prompt_text, return_tensors='pt').to(device)
    
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=512,
        do_sample=False, 
        num_beams=3,     
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=2  
    )
    

    prediction_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    clean_prompt = f"{ref_genre}{ref_title}"
    prediction_text = prediction_text.replace(clean_prompt, "").strip()

    
    reference_text_cleaned = reference_text.replace(INGREDIENTS_TOKEN, "").replace(STEPS_TOKEN, " ").strip()

    print("\n--- ROUGE Scores (Recall-Oriented) ---")
    rouge_results = rouge.compute(predictions=[prediction_text], references=[reference_text_cleaned])
    print(f"ROUGE-1 (Unigram): {rouge_results['rouge1']:.4f}")
    print(f"ROUGE-2 (Bigram): {rouge_results['rouge2']:.4f}")
    print(f"ROUGE-L (Longest Common Subsequence): {rouge_results['rougeL']:.4f}")
    
    print("\n--- BLEU Score (Precision-Oriented) ---")
    bleu_results = bleu.compute(predictions=[prediction_text], references=[[reference_text_cleaned]])
    print(f"BLEU: {bleu_results['bleu']:.4f}")
    
    print("\n--- COMPARISON ---")
    print("\n**REFERENCE (Ground Truth):**\n", reference_text_cleaned[:500] + "...")
    print("\n**PREDICTION (Generated):**\n", prediction_text[:500] + "...")

except Exception as e:
    print(f"Could not run evaluation: {e}")
    print("Please ensure 'val_recipes.txt' exists in /kaggle/working/")

--- Evaluating NEW Model (Perplexity 8.90) ---
--- Evaluating with reference recipe: The Best Tres Leches Cake (bakery) ---

--- ROUGE Scores (Recall-Oriented) ---
ROUGE-1 (Unigram): 0.3262
ROUGE-2 (Bigram): 0.0693
ROUGE-L (Longest Common Subsequence): 0.1803

--- BLEU Score (Precision-Oriented) ---
BLEU: 0.0000

--- COMPARISON ---

**REFERENCE (Ground Truth):**
 15-20 minute, 25-35 minute, 350 degrees, cold.(at least 1-2 hours, condensed milk, eggs, milk, overnight, three, toothpick, topping, water, yellow cake 1. Mix together the cake mix, eggs, and water in a large bowl. 2. Pour into a greased Large baking pan, and bake at 350 degrees for 25-35 minute or until toothpick comes out clean. 3. Meanwhile, in a large sauce pan heat up the three milks, and whisk until well combined. 4. Take the cake out of the oven, and let it cool for 15-20 minute Once cool...

**PREDICTION (Generated):**
 1/2, 1/4 cup, 30 minutes, 350\u00b0, Bake, baking powder, butter, eggs, flour, milk, salt, sugar, va