### data pre processing

In [1]:
import kagglehub
import pandas as pd
import ast

path = kagglehub.dataset_download("prashantsingh001/recipes-dataset-64k-dishes")
df = pd.read_csv(path + "/1_Recipe_csv.csv")
print("DataFrame cleaned and loaded successfully.")
# df.head()
# ----------------------------
# 2. Clean column formats
# ----------------------------
# Convert ingredients and directions from string → Python lists
for col in ["ingredients", "directions"]:
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

print(f'dataframe size before dropping missing values: {len(df)}')
# Drop rows with missing key fields
df = df.dropna(subset=["recipe_title", "ingredients", "directions", "category", "subcategory"])
print(f'dataframe size after dropping missing values : {len(df)}')

# ----------------------------
# 3. Create unified text fields
# ----------------------------
# Input text that ALL models use
df["ingredients_text"] = df["ingredients"].apply(lambda lst: ", ".join(lst))

# Target text for generation models (T5, BART, GPT-2)
df["directions_text"] = df["directions"].apply(lambda lst: " ".join(lst))

# ----------------------------
# 4. Keep only needed columns
# ----------------------------
df = df[[
    "recipe_title",
    "ingredients_text",
    "directions_text",
    "category",
    "subcategory"
]]

print("Preprocessing complete. Final shape:", df.shape)
df.head()

Using Colab cache for faster access to the 'recipes-dataset-64k-dishes' dataset.
DataFrame cleaned and loaded successfully.
dataframe size before dropping missing values: 62126
dataframe size after dropping missing values : 62126
Preprocessing complete. Final shape: (62126, 5)


Unnamed: 0,recipe_title,ingredients_text,directions_text,category,subcategory
0,Air Fryer Potato Slices with Dipping Sauce,"3/4 cup ketchup, 1/2 cup beer, 1 tablespoon Wo...","Combine ketchup, beer, Worcestershire sauce, o...",Air Fryer Recipes,Air Fryer Recipes
1,Gochujang Pork Belly Bites,"1 pound pork belly, 1/4 cup gochujang, 2 table...",Preheat an air fryer to 400 degrees F (200 deg...,Air Fryer Recipes,Air Fryer Recipes
2,3-Ingredient Air Fryer Everything Bagel Chicke...,"1 ¼ pounds chicken tenders, 1 tablespoon olive...",Gather all ingredients. Preheat an air fryer t...,Air Fryer Recipes,Air Fryer Recipes
3,Air Fryer Everything Bagel Chicken Cutlets,"4 chicken cutlets (about 1 pound total), salt ...",Preheat an air fryer to 400 degrees F (200 deg...,Air Fryer Recipes,Air Fryer Recipes
4,Air Fryer Honey Sriracha Salmon Bites,"1 tablespoon soy sauce, 1 tablespoon honey, 1 ...",Preheat an air fryer to 400 degrees F (200 deg...,Air Fryer Recipes,Air Fryer Recipes


### split between train/test

In [2]:
from sklearn.model_selection import train_test_split
from collections import Counter

# ----------------------------
# 5. Basic sanity checks
# ----------------------------
print("\nSample rows:")
print(df.head(3))

print("\nCategory counts (top 20):")
print(df["category"].value_counts().head(20))

print("\nSubcategory counts (top 20):")
print(df["subcategory"].value_counts().head(20))

# ----------------------------
# 6. Encode category & subcategory labels
# ----------------------------
# Make label → id mappings
category2id = {cat: i for i, cat in enumerate(sorted(df["category"].unique()))}
id2category = {i: cat for cat, i in category2id.items()}

subcategory2id = {sub: i for i, sub in enumerate(sorted(df["subcategory"].unique()))}
id2subcategory = {i: sub for sub, i in subcategory2id.items()}

df["category_id"] = df["category"].map(category2id)
df["subcategory_id"] = df["subcategory"].map(subcategory2id)

print("\nNumber of categories:", len(category2id))
print("Number of subcategories:", len(subcategory2id))



Sample rows:
                                        recipe_title  \
0         Air Fryer Potato Slices with Dipping Sauce   
1                         Gochujang Pork Belly Bites   
2  3-Ingredient Air Fryer Everything Bagel Chicke...   

                                    ingredients_text  \
0  3/4 cup ketchup, 1/2 cup beer, 1 tablespoon Wo...   
1  1 pound pork belly, 1/4 cup gochujang, 2 table...   
2  1 ¼ pounds chicken tenders, 1 tablespoon olive...   

                                     directions_text           category  \
0  Combine ketchup, beer, Worcestershire sauce, o...  Air Fryer Recipes   
1  Preheat an air fryer to 400 degrees F (200 deg...  Air Fryer Recipes   
2  Gather all ingredients. Preheat an air fryer t...  Air Fryer Recipes   

         subcategory  
0  Air Fryer Recipes  
1  Air Fryer Recipes  
2  Air Fryer Recipes  

Category counts (top 20):
category
Main Dishes              3387
Healthy Recipes          2237
Appetizers And Snacks    2084
Cakes            

In [3]:
# ----------------------------
# 7. Train / Val / Test split
# ----------------------------

# First split (stratified)
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    random_state=42,
    stratify=df["category_id"]
)

# Second split (NOT stratified)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42
)

print("\nSplit sizes:")
print("Train:", len(train_df))
print("Val:  ", len(val_df))
print("Test: ", len(test_df))



Split sizes:
Train: 43488
Val:   9319
Test:  9319


### Build the Dataset Class for DistilBERT (Single-Task Classification)

#### Step 3a — Import tokenizer + create Dataset class

In [4]:
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader

# Load the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Dataset for single-task CATEGORY classification
class RecipeCategoryDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df["ingredients_text"].tolist()   # inputs
        self.labels = df["category_id"].tolist()       # targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

#### Step 3b — Create the actual datasets

In [5]:
train_dataset = RecipeCategoryDataset(train_df, tokenizer)
val_dataset = RecipeCategoryDataset(val_df, tokenizer)
test_dataset = RecipeCategoryDataset(test_df, tokenizer)


#### Step 3c — Create DataLoaders

In [6]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


### Step 4 — Train the Single-Task DistilBERT Category Classifier

#### Step 4a — Import the model + training utilities

In [7]:
!pip install evaluate
import evaluate
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


#### Step 4b — Load the DistilBERT classifier


In [8]:
num_categories = df["category_id"].nunique()

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_categories
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Step 4c — Define accuracy metric


In [9]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy.compute(predictions=preds, references=labels)
    return {"accuracy": acc["accuracy"]}

Downloading builder script: 0.00B [00:00, ?B/s]

#### Step 4d — Define training arguments


In [14]:
# training_args = TrainingArguments(
#     output_dir="distilbert_category_classifier",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     logging_strategy="steps",
#     logging_steps=200,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=2,
#     learning_rate=5e-5,
#     load_best_model_at_end=True
# )

training_args = TrainingArguments(
    output_dir="distilbert_category_classifier",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    report_to="none"   # ← THIS DISABLES W&B, TensorBoard, ALL loggers
)


#### Step 4e — Create the Trainer


In [11]:
import os
os.environ["WANDB_DISABLED"] = "true"
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



#### Step 4f — Train the model


In [15]:

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,3.4093,3.243284,0.242945
2,2.9535,2.946483,0.276639


TrainOutput(global_step=5436, training_loss=3.4204668153113, metrics={'train_runtime': 2023.2049, 'train_samples_per_second': 42.989, 'train_steps_per_second': 2.687, 'total_flos': 5787966917541888.0, 'train_loss': 3.4204668153113, 'epoch': 2.0})

#### Step 4g — Evaluate on the test set


In [16]:
test_results = trainer.evaluate(test_dataset)
print(test_results)


{'eval_loss': 2.925588369369507, 'eval_accuracy': 0.28254104517652107, 'eval_runtime': 66.8917, 'eval_samples_per_second': 139.315, 'eval_steps_per_second': 8.716, 'epoch': 2.0}


### Step 5 — Evaluate the Single-Task DistilBERT Category Classifier

In [17]:
test_results = trainer.evaluate(test_dataset)
print(test_results)


{'eval_loss': 2.925588369369507, 'eval_accuracy': 0.28254104517652107, 'eval_runtime': 66.0101, 'eval_samples_per_second': 141.175, 'eval_steps_per_second': 8.832, 'epoch': 2.0}


### Step 6 — Train the Single-Task SUBCATEGORY Classifier (DistilBERT)

#### Step 6a — Create the Subcategory Dataset Class

In [18]:
class RecipeSubcategoryDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df["ingredients_text"].tolist()
        self.labels = df["subcategory_id"].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


#### Step 6b — Create train/val/test datasets for subcategory


In [19]:
train_sub_dataset = RecipeSubcategoryDataset(train_df, tokenizer)
val_sub_dataset = RecipeSubcategoryDataset(val_df, tokenizer)
test_sub_dataset = RecipeSubcategoryDataset(test_df, tokenizer)

#### Step 6c — Create a new DistilBERT model for subcategories

In [20]:
num_subcategories = df["subcategory_id"].nunique()

model_sub = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_subcategories
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Step 6d — Reuse the same metrics & training arguments

In [21]:
trainer_sub = Trainer(
    model=model_sub,
    args=training_args,     # same training args as before
    train_dataset=train_sub_dataset,
    eval_dataset=val_sub_dataset,
    compute_metrics=compute_metrics
)


#### Step 6e — Train the subcategory classifier

In [22]:
trainer_sub.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,5.9061,5.756142,0.028973
2,5.408,5.355568,0.066531


TrainOutput(global_step=5436, training_loss=5.946333558880467, metrics={'train_runtime': 2018.1192, 'train_samples_per_second': 43.098, 'train_steps_per_second': 2.694, 'total_flos': 5866558932123648.0, 'train_loss': 5.946333558880467, 'epoch': 2.0})

### Step 7 — Evaluate the Subcategory Classifier on the Test Set

In [23]:
test_results_sub = trainer_sub.evaluate(test_sub_dataset)
print(test_results_sub)


{'eval_loss': 5.403534412384033, 'eval_accuracy': 0.0666380512930572, 'eval_runtime': 66.5336, 'eval_samples_per_second': 140.065, 'eval_steps_per_second': 8.762, 'epoch': 2.0}


### Step 8 — Single-Task Text Generation Baseline (T5-small)

#### Step 8a — Prepare the Dataset for Seq2Seq Models

In [24]:
from transformers import T5TokenizerFast

t5_tokenizer = T5TokenizerFast.from_pretrained("t5-small")

class T5RecipeDataset(Dataset):
    def __init__(self, df, tokenizer, max_input_len=256, max_output_len=256):
        self.inputs = df["ingredients_text"].tolist()
        self.targets = df["directions_text"].tolist()
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        inp = self.inputs[idx]
        out = self.targets[idx]

        model_inp = self.tokenizer(
            inp,
            truncation=True,
            padding="max_length",
            max_length=self.max_input_len,
            return_tensors="pt"
        )

        model_out = self.tokenizer(
            out,
            truncation=True,
            padding="max_length",
            max_length=self.max_output_len,
            return_tensors="pt"
        )

        return {
            "input_ids": model_inp["input_ids"].squeeze(0),
            "attention_mask": model_inp["attention_mask"].squeeze(0),
            "labels": model_out["input_ids"].squeeze(0)
        }


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

#### Step 8b — Create T5 Datasets

In [25]:
train_t5_dataset = T5RecipeDataset(train_df, t5_tokenizer)
val_t5_dataset = T5RecipeDataset(val_df, t5_tokenizer)
test_t5_dataset = T5RecipeDataset(test_df, t5_tokenizer)


#### Step 8c — Load T5-small Model

In [26]:
from transformers import T5ForConditionalGeneration

t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

#### Step 8d — Training Arguments

In [27]:
training_args_t5 = TrainingArguments(
    output_dir="t5_recipe_generation",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    per_device_train_batch_size=4,   # T5 needs small batch size
    per_device_eval_batch_size=4,
    num_train_epochs=1,              # we start with 1 epoch (fast)
    learning_rate=5e-5,
    load_best_model_at_end=True,
    report_to="none"
)


#### Step 8e — Trainer for T5

In [28]:
trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=train_t5_dataset,
    eval_dataset=val_t5_dataset,
)


#### Step 8f — Train T5

In [29]:
trainer_t5.train()


Epoch,Training Loss,Validation Loss
1,1.3587,1.187049


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=10872, training_loss=1.4568656657529806, metrics={'train_runtime': 1959.3572, 'train_samples_per_second': 22.195, 'train_steps_per_second': 5.549, 'total_flos': 2942872131207168.0, 'train_loss': 1.4568656657529806, 'epoch': 1.0})

### Step 9 — Evaluate T5-small (BLEU, ROUGE, Examples)

#### Step 9a — Generate predictions on the test set

In [32]:
def generate_t5_predictions(model, tokenizer, dataset, num_samples=200):
    preds = []
    refs = []

    device = model.device     # <-- IMPORTANT

    for i in range(num_samples):
        item = dataset[i]

        input_ids = item["input_ids"].unsqueeze(0).to(device)
        attention_mask = item["attention_mask"].unsqueeze(0).to(device)

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=256
            )

        pred_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        ref_text = tokenizer.decode(item["labels"], skip_special_tokens=True)

        preds.append(pred_text)
        refs.append(ref_text)

    return preds, refs


In [33]:
preds, refs = generate_t5_predictions(t5_model, t5_tokenizer, test_t5_dataset, num_samples=200)


#### Step 9b — Compute BLEU

In [34]:
import evaluate
bleu_metric = evaluate.load("bleu")

bleu_score = bleu_metric.compute(predictions=preds, references=[[r] for r in refs])
print("BLEU score:", bleu_score)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

BLEU score: {'bleu': 0.15860667309914053, 'precisions': [0.5533721364874246, 0.2860689804069736, 0.18306324002837343, 0.13377834170022618], 'brevity_penalty': 0.6356275365850288, 'length_ratio': 0.6881637452688053, 'translation_length': 18727, 'reference_length': 27213}


#### Step 9c — Compute ROUGE-L

In [36]:
!pip install rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=51252e571f293b1de2ceca660f34773fee87a5533e9a4397bcbc5ef219dc6dff
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [37]:
rouge_metric = evaluate.load("rouge")

rouge_score = rouge_metric.compute(predictions=preds, references=refs)
print("ROUGE-L:", rouge_score["rougeL"])


ROUGE-L: 0.30580820802558323


#### Step 9d — Show example T5 generations


In [38]:
for i in range(5):
    print("\n=== Example", i+1, "===")
    print("Ingredients:", train_df["ingredients_text"].iloc[i])
    print("Generated:", preds[i])
    print("Reference:", refs[i])


=== Example 1 ===
Ingredients: 6 tablespoons unsalted butter, 1 ½ cups self-rising flour, 1 ½ cups white sugar, ⅔ cup milk, or more as needed, 1 teaspoon pure vanilla extract, 1 ½ cups packed light brown sugar, 1 tablespoon ground cinnamon, 1 cup pecan halves, 1 ½ cups hot water
Generated: Preheat the oven to 350 degrees F (175 degrees C). Line a baking sheet with parchment paper. Place lettuce, parsley, and dill in a large bowl. Stir ranch dressing, radishes, Red Onions, sunflower kernels, and pepper together in a large bowl. Place lettuce mixture into the prepared baking sheet. Bake in the preheated oven until lettuce is tender, about 30 minutes.
Reference: Put lettuce, parsley, and dill in a large bowl. Add 1/2 cup dressing; toss to coat. Top with radishes, pickled onions, sunflower kernels, and pepper. Serve with remaining dressing.

=== Example 2 ===
Ingredients: 1 ½ cups raw cleaned whole pumpkin seeds, ½ cup Worcestershire sauce, 1 tablespoon hot pepper sauce (such as Tabasco®)

### Step 10 — Train Single-Task BART-base Generation Model

#### Step 10a — Import BART tokenizer & model

In [39]:
from transformers import BartTokenizerFast, BartForConditionalGeneration

bart_tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

#### Step 10b — Create BART datasets using the SAME dataset class

In [40]:
train_bart_dataset = T5RecipeDataset(train_df, bart_tokenizer)
val_bart_dataset = T5RecipeDataset(val_df, bart_tokenizer)
test_bart_dataset = T5RecipeDataset(test_df, bart_tokenizer)


#### Step 10c — TrainingArguments for BART

In [41]:
training_args_bart = TrainingArguments(
    output_dir="bart_recipe_generation",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    per_device_train_batch_size=2,   # heavier than T5
    per_device_eval_batch_size=2,
    num_train_epochs=1,              # start with 1
    learning_rate=5e-5,
    load_best_model_at_end=True,
    report_to="none"
)


#### Step 10d — Create Trainer

In [42]:
trainer_bart = Trainer(
    model=bart_model,
    args=training_args_bart,
    train_dataset=train_bart_dataset,
    eval_dataset=val_bart_dataset,
)


#### Step 10e — Train BART

In [43]:
trainer_bart.train()


Epoch,Training Loss,Validation Loss
1,0.862,0.718744


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=21744, training_loss=0.9142108689870266, metrics={'train_runtime': 4370.0432, 'train_samples_per_second': 9.951, 'train_steps_per_second': 4.976, 'total_flos': 6629054773985280.0, 'train_loss': 0.9142108689870266, 'epoch': 1.0})

### Step 11 — Evaluate BART (BLEU, ROUGE-L, Example Generations)

#### Step 11a — Generate predictions with BART

In [44]:
def generate_bart_predictions(model, tokenizer, dataset, num_samples=200):
    preds = []
    refs = []

    device = model.device

    for i in range(num_samples):
        item = dataset[i]

        input_ids = item["input_ids"].unsqueeze(0).to(device)
        attention_mask = item["attention_mask"].unsqueeze(0).to(device)

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=256
            )

        pred_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        ref_text = tokenizer.decode(item["labels"], skip_special_tokens=True)

        preds.append(pred_text)
        refs.append(ref_text)

    return preds, refs


In [45]:
preds_bart, refs_bart = generate_bart_predictions(
    bart_model,
    bart_tokenizer,
    test_bart_dataset,
    num_samples=200
)


#### Step 11b — Compute BLEU

In [46]:
bleu_metric = evaluate.load("bleu")
bleu_bart = bleu_metric.compute(
    predictions=preds_bart,
    references=[[r] for r in refs_bart]
)
print("BART BLEU:", bleu_bart)


BART BLEU: {'bleu': 0.2323350074078887, 'precisions': [0.6505164781111658, 0.3770491803278688, 0.25785248369292524, 0.19472883933096807], 'brevity_penalty': 0.6974292832701883, 'length_ratio': 0.7351026901938097, 'translation_length': 20330, 'reference_length': 27656}


#### Step 11c — Compute ROUGE-L

In [47]:
rouge_metric = evaluate.load("rouge")
rouge_bart = rouge_metric.compute(
    predictions=preds_bart,
    references=refs_bart
)
print("BART ROUGE-L:", rouge_bart["rougeL"])


BART ROUGE-L: 0.40605818098738006


#### Step 11d — Show 5 example outputs

In [48]:
for i in range(5):
    print(f"\n=== BART Example {i+1} ===")
    print("Ingredients:", test_df["ingredients_text"].iloc[i])
    print("Generated:", preds_bart[i])
    print("Reference:", refs_bart[i])



=== BART Example 1 ===
Ingredients: 3 heads lettuce, 1/2 cup chopped fresh parsley, 1/2 up chopped fresh dill, 1 1/3 cups ranch dressing, 3/4 cup radishes, 1/3 cup Red Onions, 1/4 cup sunflower kernels, 1/4 teaspoon black pepper
Generated: Toss lettuce, parsley, dill, ranch dressing, radishes, red onions, sunflower kernels, and pepper together in a large bowl. Refrigerate salad until chilled, at least 30 minutes.
Reference: Put lettuce, parsley, and dill in a large bowl. Add 1/2 cup dressing; toss to coat. Top with radishes, pickled onions, sunflower kernels, and pepper. Serve with remaining dressing.

=== BART Example 2 ===
Ingredients: 1 tablespoon olive oil, 10 ounces spinach, 1/4 cup onion, 1/4 teaspoon garlic powder, 1 teaspoon kosher salt, 1/2 teaspoon pepper, 4 ounces cream cheese, 1/3 cup feta cheese, 2 tablespoons fresh dill, 2 teaspoons lemon juice, 1 teaspoon lemon zest, 1 (17.3 ounce) box puff pastry (2 sheets), 1 (1 1/2 pound) salmon fillet, 1 large egg
Generated: Gather 

### STEP 12 — GPT-2 Single-Task Generation Baseline

#### Step 12a — Import GPT-2

In [50]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

gpt_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [51]:
#add PAD manually:
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
gpt_model.config.pad_token_id = gpt_tokenizer.eos_token_id


#### Step 12b — Build the GPT-2 dataset

In [52]:
class GPT2RecipeDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        text = (
            "Ingredients: " + row["ingredients_text"] +
            "\nDirections: " + row["directions_text"]
        )

        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length"
        )

        input_ids = torch.tensor(enc["input_ids"])
        attention_mask = torch.tensor(enc["attention_mask"])

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids.clone()
        }


#### Step 12c — Create train/val/test datasets

In [53]:
train_gpt_dataset = GPT2RecipeDataset(train_df, gpt_tokenizer)
val_gpt_dataset = GPT2RecipeDataset(val_df, gpt_tokenizer)
test_gpt_dataset = GPT2RecipeDataset(test_df, gpt_tokenizer)


#### Step 12d — TrainingArguments

In [55]:
from transformers import Trainer, TrainingArguments

training_args_gpt = TrainingArguments(
    output_dir="gpt2_recipe_generation",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    per_device_train_batch_size=1,    # GPT2 is heavy
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    learning_rate=5e-5,
    report_to="none"
)


#### Step 12e — Create Trainer

In [56]:
trainer_gpt = Trainer(
    model=gpt_model,
    args=training_args_gpt,
    train_dataset=train_gpt_dataset,
    eval_dataset=val_gpt_dataset,
)


#### Step 12f — Start GPT-2 training

In [57]:
trainer_gpt.train()


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,0.5844,0.565148


TrainOutput(global_step=43488, training_loss=0.6650636395257216, metrics={'train_runtime': 8582.5534, 'train_samples_per_second': 5.067, 'train_steps_per_second': 5.067, 'total_flos': 1.1363067887616e+16, 'train_loss': 0.6650636395257216, 'epoch': 1.0})

### STEP 13 — Evaluate GPT-2 (BLEU, ROUGE, Example Outputs)

#### Step 13a — GPT-2 Generation Function

In [61]:
def generate_gpt2_predictions(model, tokenizer, dataset, num_samples=50):
    preds = []
    refs = []

    device = model.device

    for i in range(num_samples):
        item = dataset[i]

        input_ids = item["input_ids"].unsqueeze(0).to(device)
        attention_mask = item["attention_mask"].unsqueeze(0).to(device)

        # Stop generation at the end-of-sequence token
        with torch.no_grad():
            output_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # max_length=256,
                max_new_tokens=200,

                num_beams=3,
                early_stopping=True
            )

        full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Extract only the generated part after "Directions:"
        if "Directions:" in full_text:
            pred_text = full_text.split("Directions:")[1].strip()
        else:
            pred_text = full_text

        # Reference
        ref_text = tokenizer.decode(item["labels"], skip_special_tokens=True)
        ref_text = ref_text.split("Directions:")[1].strip()

        preds.append(pred_text)
        refs.append(ref_text)

    return preds, refs


#### Step 13b — Generate predictions (use ~50 samples)

In [62]:
preds_gpt, refs_gpt = generate_gpt2_predictions(
    gpt_model,
    gpt_tokenizer,
    test_gpt_dataset,
    num_samples=50
)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

#### Step 13c — Compute BLEU

In [63]:
bleu_metric = evaluate.load("bleu")
bleu_gpt = bleu_metric.compute(
    predictions=preds_gpt,
    references=[[r] for r in refs_gpt]
)
print("GPT-2 BLEU:", bleu_gpt)


GPT-2 BLEU: {'bleu': 0.9991542815123285, 'precisions': [0.999163179916318, 0.9991573033707866, 0.9991513437057992, 0.9991452991452991], 'brevity_penalty': 1.0, 'length_ratio': 1.0008375209380234, 'translation_length': 7170, 'reference_length': 7164}


#### Step 13d — Compute ROUGE-L

In [64]:
rouge_metric = evaluate.load("rouge")
rouge_gpt = rouge_metric.compute(
    predictions=preds_gpt,
    references=refs_gpt
)
print("GPT-2 ROUGE-L:", rouge_gpt["rougeL"])


GPT-2 ROUGE-L: 0.9998635743519781


#### Step 13e — Show example outputs

In [65]:
for i in range(5):
    print(f"\n=== GPT-2 Example {i+1} ===")
    print("Ingredients:", test_df["ingredients_text"].iloc[i])
    print("Generated:", preds_gpt[i])
    print("Reference:", refs_gpt[i])



=== GPT-2 Example 1 ===
Ingredients: 3 heads lettuce, 1/2 cup chopped fresh parsley, 1/2 up chopped fresh dill, 1 1/3 cups ranch dressing, 3/4 cup radishes, 1/3 cup Red Onions, 1/4 cup sunflower kernels, 1/4 teaspoon black pepper
Generated: Put lettuce, parsley, and dill in a large bowl. Add 1/2 cup dressing; toss to coat. Top with radishes, pickled onions, sunflower kernels, and pepper. Serve with remaining dressing.
Reference: Put lettuce, parsley, and dill in a large bowl. Add 1/2 cup dressing; toss to coat. Top with radishes, pickled onions, sunflower kernels, and pepper. Serve with remaining dressing.

=== GPT-2 Example 2 ===
Ingredients: 1 tablespoon olive oil, 10 ounces spinach, 1/4 cup onion, 1/4 teaspoon garlic powder, 1 teaspoon kosher salt, 1/2 teaspoon pepper, 4 ounces cream cheese, 1/3 cup feta cheese, 2 tablespoons fresh dill, 2 teaspoons lemon juice, 1 teaspoon lemon zest, 1 (17.3 ounce) box puff pastry (2 sheets), 1 (1 1/2 pound) salmon fillet, 1 large egg
Generated: H

### evaluation of all models

In [None]:
# ============================================
#   UNIVERSAL METRIC UTILITIES
# ============================================

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np
import torch
import evaluate

# ----------------------------
# CLASSIFICATION METRICS
# ----------------------------
def evaluate_classifier(trainer, dataset, true_labels):
    pred_output = trainer.predict(dataset)
    logits = pred_output.predictions
    y_pred = np.argmax(logits, axis=1)

    acc = accuracy_score(true_labels, y_pred)

    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, y_pred, average="macro", zero_division=0
    )

    # Softmax probabilities
    probs = torch.tensor(logits).softmax(dim=1).numpy()

    # ROC–AUC for multi-class (may fail for single-class)
    try:
        roc_auc = roc_auc_score(
            true_labels,
            probs,
            multi_class="ovr",
            average="macro"
        )
    except Exception:
        roc_auc = None

    return {
        "accuracy": float(acc),
        "precision_macro": float(precision),
        "recall_macro": float(recall),
        "f1_macro": float(f1),
        "roc_auc_macro_ovr": None if roc_auc is None else float(roc_auc)
    }


# ----------------------------
# GENERATION METRICS
# ----------------------------
def evaluate_generation(preds, refs):
    bleu = evaluate.load("bleu").compute(
        predictions=preds,
        references=[[r] for r in refs]
    )

    rouge = evaluate.load("rouge").compute(
        predictions=preds,
        references=refs
    )

    return {
        "bleu": bleu["bleu"],
        "rouge_l": rouge["rougeL"]
    }


# ----------------------------
# SEQ2SEQ GENERATORS (T5 / BART)
# ----------------------------
def generate_seq2seq(model, tokenizer, dataset, num_samples=200):
    preds, refs = [], []
    device = model.device

    for i in range(num_samples):
        item = dataset[i]
        input_ids = item["input_ids"].unsqueeze(0).to(device)
        attention_mask = item["attention_mask"].unsqueeze(0).to(device)

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=256
            )

        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        ref = tokenizer.decode(item["labels"], skip_special_tokens=True)

        preds.append(pred)
        refs.append(ref)

    return preds, refs


# ----------------------------
# GPT-2 GENERATOR
# ----------------------------
def generate_gpt2(model, tokenizer, dataset, num_samples=50):
    preds, refs = [], []
    device = model.device

    for i in range(num_samples):
        item = dataset[i]
        input_ids = item["input_ids"].unsqueeze(0).to(device)

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=input_ids,
                max_length=256,
                num_beams=3,
                early_stopping=True
            )

        full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        if "Directions:" in full_text:
            pred = full_text.split("Directions:")[1].strip()
        else:
            pred = full_text

        ref_full = tokenizer.decode(item["labels"], skip_special_tokens=True)
        ref = ref_full.split("Directions:")[1].strip()

        preds.append(pred)
        refs.append(ref)

    return preds, refs


# ============================================
#   RUN ALL METRICS ACROSS ALL MODELS
# ============================================

all_metrics = {}

# 1) DISTILBERT CATEGORY ← your trainer is "trainer"
all_metrics["distilbert_category"] = evaluate_classifier(
    trainer,
    test_dataset,
    test_df["category_id"].to_numpy()
)

# 2) DISTILBERT SUBCATEGORY ← your trainer is "trainer_sub"
all_metrics["distilbert_subcategory"] = evaluate_classifier(
    trainer_sub,
    test_sub_dataset,
    test_df["subcategory_id"].to_numpy()
)

# 3) T5 GENERATION
preds_t5, refs_t5 = generate_seq2seq(
    t5_model, t5_tokenizer, test_t5_dataset, num_samples=200
)
all_metrics["t5_generation"] = evaluate_generation(preds_t5, refs_t5)

# 4) BART GENERATION
preds_bart, refs_bart = generate_seq2seq(
    bart_model, bart_tokenizer, test_bart_dataset, num_samples=200
)
all_metrics["bart_generation"] = evaluate_generation(preds_bart, refs_bart)

# 5) GPT-2 GENERATION
preds_gpt, refs_gpt = generate_gpt2(
    gpt_model, gpt_tokenizer, test_gpt_dataset, num_samples=50
)
all_metrics["gpt2_generation"] = evaluate_generation(preds_gpt, refs_gpt)

# ============================================

print("\n===== FINAL METRICS FOR ALL MODELS =====\n")
for model_name, metrics in all_metrics.items():
    print(model_name.upper())
    print(metrics)
    print("----------------------------------------")
