In [2]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from tqdm import tqdm
import ast  # Library for parsing strings containing lists
import pandas as pd
import zipfile
import os

In [3]:
# Read the CSV file with the first column as index
df = pd.read_csv('/kaggle/input/inst-resp/inst-resp.csv', index_col=0)
df.head(2)

Unnamed: 0,id,Instruction,Response
0,137739,"Tags: ['60-minutes-or-less', 'time-to-make', '...",Name: arriba baked winter squash mexican sty...
1,31490,"Tags: ['30-minutes-or-less', 'time-to-make', '...",Name: a bit different breakfast pizza Minutes...


## Data Preprocessing

In [4]:
# Load the dataset
inst = df.Instruction.to_list()  # List of instructions strings
resp = df.Response.to_list()  # List of responses strings

In [5]:
class RecipeDataset(Dataset):
    """
    PyTorch dataset for recipe generation.

    Args:
        ingredients (list): List of ingredients for input.
        recipes (list): List of recipes for output.
        tokenizer (transformers.Tokenizer): Tokenizer for encoding text.
        max_length (int): Maximum length of input and output sequences.
    """
    
    def __init__(self, ingredients, recipes, tokenizer, max_length=512):
        """
        Initialize the RecipeDataset.

        Args:
            ingredients (list): List of ingredients for input.
            recipes (list): List of recipes for output.
            tokenizer (transformers.Tokenizer): Tokenizer for encoding text.
            max_length (int): Maximum length of input and output sequences.
        """
        self.ingredients = ingredients
        self.recipes = recipes
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """
        Returns the length of the dataset.
        """
        return len(self.ingredients)

    def __getitem__(self, idx):
        """
        Get an item from the dataset by index.

        Args:
            idx (int): Index of the item.

        Returns:
            dict: Dictionary containing the input_ids, attention_mask, and labels.
        """
        # Get the input text and target text
        input_text = str(self.ingredients[idx])
        target_text = str(self.recipes[idx])

        # Tokenize input text
        input_tokens = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize target text
        target_tokens = self.tokenizer.encode(
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Return input_ids, attention_mask, and labels
        return {
            "input_ids": input_tokens.input_ids.flatten(),
            "attention_mask": input_tokens.attention_mask.flatten(),
            "labels": target_tokens.flatten()
        }

In [6]:
# Split the dataset into train and validation sets
train_inst, val_test_inst, train_resp, val_test_resp = train_test_split(
    inst, resp, test_size=0.2, random_state=42
)

val_inst, test_inst, val_resp, test_resp = train_test_split(
    val_test_inst, val_test_resp, test_size=0.5, random_state=42
)

# Initialize the tokenizer and model
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Prepare the dataset and dataloaders
train_dataset = RecipeDataset(train_inst, train_resp, tokenizer)
val_dataset = RecipeDataset(val_inst, val_resp, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

## Training the Model

In [7]:
def train_recipe_model(model, device, train_dataloader, val_dataloader, optimizer, epochs=1):
    """
    Train the recipe generation model.

    Args:
        model (torch.nn.Module): The recipe generation model.
        train_dataloader (DataLoader): Dataloader for training data.
        val_dataloader (DataLoader): Dataloader for validation data.
        optimizer (torch.optim.Optimizer): Optimizer for training.
        epochs (int): Number of epochs for training. Default is 1.
    """
    # Define training parameters
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Training loop
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation loop
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc=f"Validation"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

                val_loss += loss.item()

        # Calculate average training and validation loss
        avg_train_loss = train_loss / len(train_dataloader)
        avg_val_loss = val_loss / len(val_dataloader)

        # Print epoch-wise training and validation loss
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss}, Val Loss: {avg_val_loss}")

    # Save the trained model
    torch.save(model.state_dict(), "recipe_generation_model.pth")
    return model

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 1
model = train_recipe_model(model, device, train_dataloader, val_dataloader, optimizer, epochs=1)

Epoch 1/1: 100%|██████████| 20000/20000 [2:03:00<00:00,  2.71it/s]  
Validation: 100%|██████████| 2500/2500 [04:47<00:00,  8.69it/s]


Epoch 1/1, Train Loss: 0.7127893011666835, Val Loss: 0.5979571074783802


## Saving the model

Saving in the Hugging Face format

In [15]:
# CONFIGURATION
# Load the model configuration
config = BartConfig.from_pretrained('facebook/bart-base', output_hidden_states=False)
# Save the configuration to disk
config.save_pretrained('Recipe_Generation_BART')

# MODEL
# Load your fine-tuned model weights into the predefined BART architecture
model = BartForConditionalGeneration(config)
state_dict = torch.load('/kaggle/working/recipe_generation_model.pth')
model.load_state_dict(state_dict)
# Save the model in the Hugging Face format
model.save_pretrained('Recipe_Generation_BART')

# TOKENIZER
# Load and save the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
tokenizer.save_pretrained('Recipe_Generation_BART')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('Recipe_Generation_BART/tokenizer_config.json',
 'Recipe_Generation_BART/special_tokens_map.json',
 'Recipe_Generation_BART/vocab.json',
 'Recipe_Generation_BART/merges.txt',
 'Recipe_Generation_BART/added_tokens.json')

In [16]:
def zip_directory(folder_path, output_path):
    # Create a zip file at the specified output path
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through all the directories and files in the folder_path
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                # Create a proper path for each file to be added
                file_path = os.path.join(root, file)
                # Create an archive name to store the file in the zip
                # This is the path within the zip file
                archive_name = os.path.relpath(file_path, os.path.dirname(folder_path))
                # Add the file to the zip file with its new archive name
                zipf.write(file_path, arcname=archive_name)
    print(f"Created zip file {output_path}")
    
# Directory to be zipped
directory_to_zip = "/kaggle/working/Recipe_Generation_BART"
# Output zip file path
zip_output_path = "/kaggle/working/Recipe_Generation_BART.zip"

# Call the function
zip_directory(directory_to_zip, zip_output_path)

Created zip file /kaggle/working/Recipe_Generation_BART.zip


Downloaded the zip & Manually uploaded to the Hugging Face Platform

Downloading test data for inference

In [21]:
# Saving for later inference
test_data = {"Instruction": test_inst, "Response": test_resp}

# Create DataFrame
test_df = pd.DataFrame(test_data)

# Save DataFrame as CSV
test_df.to_csv("test_data.csv", index=False)
