<a href="https://colab.research.google.com/github/oliviasteeed/ChefGPT/blob/main/trying_LLM_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Olivia's attempt at fine-tuning an LLM (GPT-2) on a recipes dataset

In [26]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

In [40]:
# Draw a reproducible 1000-row random sample from the full recipes dataset
# (roughly a million rows) and write it out as a smaller CSV.

import pandas as pd

# Location of the full recipes CSV
csv_file = '/Users/oliviasteed/Desktop/full_dataset.csv'

# Read the whole file into memory as a DataFrame
df = pd.read_csv(csv_file)

# A fixed random_state means the same 1000 rows are drawn on every run
sampled_df = df.sample(random_state=42, n=1000)

# Peek at the first rows of the sample
print(sampled_df.head())

# Persist the sample (without the DataFrame index) so the loading cell can read it
sampled_df.to_csv(index=False, path_or_buf='/Users/oliviasteed/Desktop/1000_sample_dataset.csv')

         Unnamed: 0                         title  \
2015528     2015528  Marinated Flank Steak Recipe   
1608734     1608734           French Chicken Stew   
778500       778500                Glazed Carrots   
1334975     1334975               Moms Pie Dough    
116562       116562      Pretzel Salad Or Dessert   

                                               ingredients  \
2015528  ["1 1/2 pound flank steak", "1/2 c. finely min...   
1608734  ["1 tablespoon rosemary", "1 teaspoon thyme", ...   
778500   ["3 to 4 carrots", "1 1/2 Tbsp. butter", "1/3 ...   
1334975  ["4.5 Cups Flour", "1.5 Tsp Salt", "Pinch Baki...   
116562   ["2 c. crushed small thin pretzels (sticks)", ...   

                                                directions  \
2015528  ["Remove tenderloin from steak.", "Score meat....   
1608734  ["combine all ingredients in slow cooker (6 qu...   
778500   ["Cook 3 to 4 carrots; cut crosswise in 1-inch...   
1334975  ["Mix all dry ingredients in a bowl.", "", "Ad...  

In [44]:
# IMPORT DATA

# Load the 1000-row recipe sample CSV into a Hugging Face `Dataset`
# (`Dataset` comes from the `datasets` import in the imports cell).

# The resulting `data` object is what the following cells clean, split and tokenize.
data = Dataset.from_csv("/Users/oliviasteed/Desktop/1000_sample_dataset.csv") #has recipes

In [45]:
# Display the dataset summary (column names and number of rows)
data

Dataset({
    features: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER'],
    num_rows: 1000
})

In [46]:
# Drop the columns that are not needed for fine-tuning.
# Only columns that are still present are removed, so re-running this cell
# after it has already run once does not raise on the missing columns.
columns_to_drop = ['link', 'source', 'Unnamed: 0']
present_columns = [col for col in columns_to_drop if col in data.column_names]
if present_columns:
    data = data.remove_columns(present_columns)

In [47]:
# SPLIT DATASET INTO TRAIN AND VALIDATION

# Split dataset into training and validation sets (80% for training, 20% for validation).
# The seed is fixed (same value as the sampling cell's random_state) so the
# split is identical on every run and results stay comparable.
train_test_split = data.train_test_split(test_size=0.2, seed=42)

# Create training and validation datasets
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [48]:
# PREPROCESS THE DATA

# Load the pretrained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Use the existing end-of-sequence token as the padding token
# (an existing token is reused; no new token is added to the vocabulary)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize one batch of recipes (used with `Dataset.map(..., batched=True)`)
def tokenize_function(examples):
    """Tokenize a batch of recipes for causal language-model fine-tuning.

    For every recipe the title, ingredients and directions are joined into a
    single string, followed by the tokenizer's end-of-sequence token so that
    each example carries an explicit end marker (unless truncation cuts it off).

    Args:
        examples: A batch with "title", "ingredients" and "directions"
            entries, each a list holding one item per recipe.

    Returns:
        The tokenizer output with an extra "labels" entry. Labels equal the
        input IDs at real-token positions and -100 at padded positions.
    """
    text = [
        f"{title} {ingredients} {directions}{tokenizer.eos_token}"
        for title, ingredients, directions in zip(
            examples["title"], examples["ingredients"], examples["directions"]
        )
    ]

    # No explicit max_length is passed, so the tokenizer's own default limit
    # is used for both padding and truncation.
    tokenized = tokenizer(text, padding="max_length", truncation=True)

    # Labels are the input IDs themselves (the model shifts them internally).
    # The pad token is the EOS token, so padded positions must be excluded
    # explicitly: -100 is the label value the loss ignores. Without this the
    # loss would be dominated by predicting padding.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized

# Apply the tokenizer to both splits in batches; each call returns a new dataset
tokenized_data = train_dataset.map(tokenize_function, batched=True) # Tokenize training data
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True) # Tokenize validation data

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [49]:
# LOAD GPT MODEL

# Load the pretrained GPT-2 language-model weights
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Size the embedding matrix to the tokenizer's vocabulary.
# NOTE(review): the tokenizer cell only reuses EOS as the pad token and adds no
# new tokens, so this is presumably a no-op here — confirm.
model.resize_token_embeddings(len(tokenizer))  # Resize if adding custom tokens

Embedding(50257, 768)

In [50]:
# SET UP TRAINING ARGUMENTS

training_args = TrainingArguments(
    # --- where results go ---
    output_dir="/Users/oliviasteed/Desktop/chefgpt/gpt2_recipe_model",  # checkpoints / model directory
    overwrite_output_dir=True,
    logging_dir="./logs",
    # --- training schedule ---
    num_train_epochs=5,  # number of passes over the training set
    per_device_train_batch_size=2,  # keep small enough to fit on the available device
    # --- logging, evaluation and checkpoint cadence (in optimizer steps) ---
    logging_steps=200,
    eval_strategy="steps",  # evaluate on the validation set during training
    eval_steps=500,  # run evaluation every 500 steps
    save_steps=500,
    save_total_limit=2,
)

comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key


In [None]:
# TRAIN MODEL

# Both datasets handed to the Trainer must be the tokenized versions: the raw
# validation split only holds text columns (title, ingredients, directions,
# NER), so evaluating on it would give the model no inputs.
# NOTE(review): newer transformers releases may prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss


In [None]:
# SAVE FINE TUNED MODEL

# Write the model weights/config and the tokenizer files to the same directory
# so they can be reloaded together.
# NOTE(review): this directory is spelled "ChefGPT" while the training
# output_dir uses "chefgpt"; on a case-sensitive filesystem these are
# different folders — confirm which is intended.
model.save_pretrained("/Users/oliviasteed/Desktop/ChefGPT/gpt2_recipe_model")
tokenizer.save_pretrained("/Users/oliviasteed/Desktop/ChefGPT/gpt2_recipe_model")

In [None]:
# GENERATE RECIPES WITH MODEL

# Load the fine-tuned model and its tokenizer back from disk.
# NOTE(review): the save cell writes to the absolute path
# "/Users/oliviasteed/Desktop/ChefGPT/gpt2_recipe_model", while this loads from
# the relative "./gpt2_recipe_model"; they only match when the notebook's
# working directory is the ChefGPT folder — confirm.
model = GPT2LMHeadModel.from_pretrained("./gpt2_recipe_model")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_recipe_model")

# Function to generate a recipe based on ingredients
def generate_recipe(ingredients, max_length=200):
    """Generate recipe text from a comma-separated ingredient string.

    Args:
        ingredients: Ingredient description inserted into the prompt.
        max_length: Upper bound passed to `model.generate` as `max_length`.
            Defaults to 200, the value previously hard-coded.

    Returns:
        The decoded generated text with special tokens stripped.
    """
    prompt = f"Recipe: {ingredients}\nIngredients: {ingredients}\nInstructions:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs["input_ids"],
        # Pass the mask and pad id explicitly: the pad token is the EOS token,
        # so generate() cannot tell padding from content by itself.
        attention_mask=inputs["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Example usage: generate and print a recipe for a carbonara-style ingredient list
ingredients = "spaghetti, eggs, pancetta, Parmesan, black pepper"
recipe = generate_recipe(ingredients)
# print() is used so the newlines in the generated text are rendered
print(recipe)