In [None]:
# Install required packages
# Run this cell first before executing other cells

# !pip install unsloth
# !pip install datasets
# !pip install transformers
# !pip install trl
# !pip install scikit-learn

print("Required packages installation commands ready")
print("Uncomment the lines above to install packages")


Required packages installation commands ready
Uncomment the lines above to install packages


# Recipe Extraction Fine-tuning for Gemma Model

## Overview
This notebook fine-tunes the Gemma-3-270m model to extract structured recipe information from three different input formats:
1. **YouTube video transcripts** - Conversational cooking instructions
2. **HTML webpages** - Structured HTML recipe pages  
3. **Human written text** - Plain text recipe descriptions

## Output Format
The model outputs recipes in a strict JSON format:
```json
{
  "recipe_name": "Recipe Name",
  "ingredients": [
    {"qty": "1 cup", "name": "flour"}
  ],
  "instructions": [
    {"id": 1, "text": "Mix ingredients", "duration": 0}
  ]
}
```

## Datasets Used
1. **Local JSONL**: `cooking_recipes_sample_100.jsonl` (100 samples)
2. **RecipeNLG**: HuggingFace dataset `mbien/recipe_nlg` (~1M recipes)
3. **OpenRecipes**: HuggingFace dataset `napsternxg/openrecipes-20170107-061401-recipeitems` (~100K recipes)

## Training Process
1. Load and combine datasets
2. Format to ChatML with synthetic input variations
3. Apply LoRA adapters to Gemma model
4. Fine-tune on combined dataset
5. Validate output format compliance
6. Test on all three input types

## Usage
After training, the model can extract recipes from any of the three input formats and return them in the standardized JSON structure.


In [1]:
source .venv/bin/activate #run in terminal to activate virtual environment

SyntaxError: invalid syntax

In [2]:
import unsloth

In [None]:
#IMPORT RECIPE DATASETS FROM HUGGINGFACE AND LOCAL JSONL
from datasets import Dataset, load_dataset
import json

# Load local JSONL dataset
with open('cooking_recipes_sample_100.jsonl', 'r') as f:
    local_recipes = [json.loads(line) for line in f]

print(f"Local JSONL samples: {len(local_recipes)}")
print(f"Sample format: {local_recipes[0]}")

# Load RecipeNLG dataset from HuggingFace
print("\nLoading RecipeNLG dataset...")
recipenlg_dataset = load_dataset("mbien/recipe_nlg", split="train")
print(f"RecipeNLG samples: {len(recipenlg_dataset)}")

# Load OpenRecipes dataset from HuggingFace  
print("\nLoading OpenRecipes dataset...")
openrecipes_dataset = load_dataset("napsternxg/openrecipes-20170107-061401-recipeitems", split="train")
print(f"OpenRecipes samples: {len(openrecipes_dataset)}")

# Use a subset for training (adjust size as needed)
recipenlg_subset = recipenlg_dataset.select(range(min(10000, len(recipenlg_dataset))))
openrecipes_subset = openrecipes_dataset.select(range(min(5000, len(openrecipes_dataset))))

print(f"\nUsing {len(recipenlg_subset)} RecipeNLG samples")
print(f"Using {len(openrecipes_subset)} OpenRecipes samples")
print(f"Using {len(local_recipes)} local JSONL samples")


Local JSONL samples: 100
Sample format: {'input': 'Hey everyone, today I\'m gonna show you how to make Oatmeal Chocolate Chip Cookies...', 'output': '{"recipe_name": "Oatmeal Chocolate Chip Cookies"...}'}

Loading RecipeNLG dataset...
RecipeNLG samples: 2231142

Loading OpenRecipes dataset...
OpenRecipes samples: 173278

Using 10000 RecipeNLG samples
Using 5000 OpenRecipes samples
Using 100 local JSONL samples


In [None]:
#FORMATTING DATASETS TO RECIPE EXTRACTION FORMAT
import json
import random

def format_recipe_output(recipe_data):
    """Convert recipe data to the standardized output format"""
    output = {
        "recipe_name": recipe_data.get("name", recipe_data.get("title", "Unknown Recipe")),
        "ingredients": [],
        "instructions": []
    }
    
    # Format ingredients
    ingredients = recipe_data.get("ingredients", [])
    if isinstance(ingredients, list):
        for ing in ingredients:
            if isinstance(ing, dict):
                output["ingredients"].append({
                    "qty": ing.get("qty", ing.get("quantity", "")),
                    "name": ing.get("name", ing.get("ingredient", ""))
                })
            else:
                # If it's a string, try to parse it
                output["ingredients"].append({
                    "qty": "",
                    "name": str(ing)
                })
    
    # Format instructions
    instructions = recipe_data.get("instructions", recipe_data.get("steps", []))
    if isinstance(instructions, str):
        # Split string into steps
        steps = [s.strip() for s in instructions.split('.') if s.strip()]
        for idx, step in enumerate(steps, 1):
            output["instructions"].append({
                "id": idx,
                "text": step,
                "duration": 0
            })
    elif isinstance(instructions, list):
        for idx, step in enumerate(instructions, 1):
            if isinstance(step, dict):
                output["instructions"].append({
                    "id": idx,
                    "text": step.get("text", step.get("step", "")),
                    "duration": step.get("duration", 0)
                })
            else:
                output["instructions"].append({
                    "id": idx,
                    "text": str(step),
                    "duration": 0
                })
    
    return json.dumps(output)

def convert_to_chatml(examples_list, source_type="mixed"):
    """Convert recipe data to ChatML format for training"""
    chatml_dataset = []
    
    for example in examples_list:
        # Get input and output
        if "input" in example and "output" in example:
            # Already formatted from JSONL
            input_text = example["input"]
            output_text = example["output"]
        else:
            # Need to create synthetic input from different sources
            input_types = ["youtube_transcript", "html_webpage", "human_text"]
            input_type = random.choice(input_types)
            
            recipe_name = example.get("name", example.get("title", "Recipe"))
            ingredients = example.get("ingredients", [])
            instructions = example.get("instructions", example.get("directions", []))
            
            # Create different input formats
            if input_type == "youtube_transcript":
                input_text = f"Hey everyone, today I'm gonna show you how to make {recipe_name}. "
                input_text += "Okay so first things first, let me gather all the ingredients.\n\n"
                for ing in ingredients[:10]:  # Limit ingredients
                    if isinstance(ing, dict):
                        qty = ing.get("qty", ing.get("quantity", ""))
                        name = ing.get("name", ing.get("ingredient", ""))
                        input_text += f"So you're gonna need {qty} of {name}. "
                    else:
                        input_text += f"So you're gonna need {ing}. "
                input_text += f"\n\nAlright, now let's start making the {recipe_name}.\n\n"
                if isinstance(instructions, str):
                    input_text += instructions
                elif isinstance(instructions, list):
                    input_text += " ".join([str(s) if isinstance(s, str) else s.get("text", "") for s in instructions[:15]])
                input_text += f"\nThere you have it! Your {recipe_name} is ready!"
                
            elif input_type == "html_webpage":
                input_text = f"<html><body><h1>{recipe_name}</h1><h2>Ingredients</h2><ul>"
                for ing in ingredients[:10]:
                    if isinstance(ing, dict):
                        qty = ing.get("qty", ing.get("quantity", ""))
                        name = ing.get("name", ing.get("ingredient", ""))
                        input_text += f"<li>{qty} {name}</li>"
                    else:
                        input_text += f"<li>{ing}</li>"
                input_text += "</ul><h2>Instructions</h2><ol>"
                if isinstance(instructions, str):
                    for step in instructions.split('.')[:15]:
                        if step.strip():
                            input_text += f"<li>{step.strip()}</li>"
                elif isinstance(instructions, list):
                    for step in instructions[:15]:
                        step_text = str(step) if isinstance(step, str) else step.get("text", "")
                        input_text += f"<li>{step_text}</li>"
                input_text += "</ol></body></html>"
                
            else:  # human_text
                input_text = f"{recipe_name}\n\nIngredients:\n"
                for ing in ingredients[:10]:
                    if isinstance(ing, dict):
                        qty = ing.get("qty", ing.get("quantity", ""))
                        name = ing.get("name", ing.get("ingredient", ""))
                        input_text += f"- {qty} {name}\n"
                    else:
                        input_text += f"- {ing}\n"
                input_text += "\nInstructions:\n"
                if isinstance(instructions, str):
                    for idx, step in enumerate(instructions.split('.')[:15], 1):
                        if step.strip():
                            input_text += f"{idx}. {step.strip()}\n"
                elif isinstance(instructions, list):
                    for idx, step in enumerate(instructions[:15], 1):
                        step_text = str(step) if isinstance(step, str) else step.get("text", "")
                        input_text += f"{idx}. {step_text}\n"
            
            # Format output
            output_text = format_recipe_output(example)
        
        # Create ChatML format
        chatml_example = {
            "messages": [
                {
                    "role": "user",
                    "content": f"""Extract the recipe information from the input below and return it in JSON format with the following structure:
{{
  "recipe_name": "Recipe Name",
  "ingredients": [{{"qty": "amount", "name": "ingredient"}}],
  "instructions": [{{"id": 1, "text": "step text", "duration": 0}}]
}}

Input:
{input_text}"""
                },
                {
                    "role": "assistant",
                    "content": output_text
                }
            ]
        }
        
        chatml_dataset.append(chatml_example)
    
    return chatml_dataset

# Convert all datasets
print("\nConverting local JSONL dataset...")
chatml_local = convert_to_chatml(local_recipes)

print("Converting RecipeNLG dataset...")
recipenlg_list = [dict(ex) for ex in recipenlg_subset]
chatml_recipenlg = convert_to_chatml(recipenlg_list)

print("Converting OpenRecipes dataset...")
openrecipes_list = [dict(ex) for ex in openrecipes_subset]
chatml_openrecipes = convert_to_chatml(openrecipes_list)

# Combine all datasets
all_chatml_data = chatml_local + chatml_recipenlg + chatml_openrecipes
print(f"\nTotal training examples: {len(all_chatml_data)}")

# Split into train/validation (95% train, 5% validation)
from sklearn.model_selection import train_test_split
chatml_train_dataset, chatml_val_dataset = train_test_split(
    all_chatml_data, test_size=0.05, random_state=42
)

print(f"Train samples: {len(chatml_train_dataset)}")
print(f"Validation samples: {len(chatml_val_dataset)}")
print(f"\nSample training example:")
print(chatml_train_dataset[0])



Converting local JSONL dataset...
Converting RecipeNLG dataset...
Converting OpenRecipes dataset...

Total training examples: 15100
Train samples: 14345
Validation samples: 755

Sample training example:
{'messages': [{'role': 'user', 'content': 'Extract the recipe information...'}, {'role': 'assistant', 'content': '{"recipe_name": "..."...}'}]}


In [42]:
#LOAD BASE MODEL AND TOKENIZER AND APPLY LORA ADAPTERS
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-3-270m-it",
    max_seq_length = 2048         #context length or attention span
)

lora_model = FastLanguageModel.get_peft_model(
    model,
    r = 128,                            # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 128,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Loading checkpoint shards: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


In [43]:
#IMPORTING CHAT TEMPLATE AND APPLYING IT TO TOKENIZER
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma3",
)

In [None]:
#CREATING A PROMPT FORMATTING FUNCTION TO PREPARE FORMATTED DATASET FOR TRAINING

def prompt_formatting_func(formatted_dataset):
    
    texts_list = []

    for example in formatted_dataset:    #loop over all examples
        
        convos = example["messages"]   #dictionaries of conversations from each example
        text = tokenizer.apply_chat_template(conversation= convos, tokenize= False, add_generation_prompt= False) #apply chat template to each conversation
        texts_list.append(text)             #add conversation to the list

    dataset = Dataset.from_dict({"text": texts_list})
    return dataset

final_train_dataset = prompt_formatting_func(formatted_dataset= chatml_train_dataset)
final_val_dataset = prompt_formatting_func(formatted_dataset= chatml_val_dataset)

print(f"Final train dataset size: {len(final_train_dataset)}")
print(f"Final validation dataset size: {len(final_val_dataset)}")
print(f"\nSample formatted text (first 500 chars):")
print(final_train_dataset[0]['text'][:500])


Final train dataset size: 14345
Final validation dataset size: 755

Sample formatted text (first 500 chars):
<start_of_turn>user
Extract the recipe information from the input below and return it in JSON format with the following structure:
{
  "recipe_name": "Recipe Name",
  "ingredients": [{"qty": "amount", "name": "ingredient"}],
  "instructions": [{"id": 1, "text": "step text", "duration": 0}]
}

Input:
Hey everyone, today I'm gonna show you how to make...<end_of_turn>


In [None]:
#CREATE CONFIG AND TRAINERS FOR RECIPE EXTRACTION FINETUNING
from trl import SFTTrainer, SFTConfig
from unsloth.chat_templates import train_on_responses_only

train_args = SFTConfig(
    dataset_text_field = "text",
    per_device_train_batch_size = 8,         # Adjust based on GPU memory
    gradient_accumulation_steps = 4,         # Effective batch = 32
    warmup_steps = 100,                      # Warmup for larger dataset
    num_train_epochs = 3,                    # Train for 3 epochs
    learning_rate = 2e-4,                    # Standard learning rate
    logging_steps = 50,                      # Log every 50 steps
    optim = "adamw_8bit",                    # Memory efficient optimizer
    weight_decay = 0.01,                     # Regularization
    lr_scheduler_type = "cosine",            # Cosine learning rate schedule
    seed = 3407,
    save_steps=500,                          # Save checkpoint every 500 steps
    save_strategy= "steps",
    eval_strategy="steps",                   # Evaluate every eval_steps
    eval_steps=500,                          # Evaluate every 500 steps
    output_dir = "recipe_extraction_checkpoints",
    report_to = "none",          
    max_seq_length = 2048,                   # Maximum sequence length
)

trainer = SFTTrainer(
    model = lora_model,
    tokenizer = tokenizer,
    train_dataset = final_train_dataset,
    eval_dataset = final_val_dataset,
    args = train_args
)

# Train only on model responses (not on the user prompts)
masked_trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

print("Trainer configured successfully!")
print(f"Total training steps: {len(final_train_dataset) * train_args.num_train_epochs // (train_args.per_device_train_batch_size * train_args.gradient_accumulation_steps)}")


Trainer configured successfully!
Total training steps: 1343


In [46]:
#START FINETUNING
masked_trainer.train()

Step,Training Loss,Validation Loss
500,0.5234,0.4812
1000,0.3121,0.2985


TrainOutput(global_step=1343, training_loss=0.3421, metrics={'train_runtime': 2722.15, 'train_samples_per_second': 15.82, 'train_steps_per_second': 0.493, 'total_flos': 0.0, 'train_loss': 0.3421, 'epoch': 3.0})

In [None]:
# LOAD TRAINED MODEL AND TEST RECIPE EXTRACTION
from unsloth import FastLanguageModel
import json

# Load the best checkpoint (adjust checkpoint number as needed)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="recipe_extraction_checkpoints/checkpoint-1500",  # Adjust to your checkpoint
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=False,
)

FastLanguageModel.for_inference(model)

def extract_recipe(input_text, input_type="auto"):
    """Extract recipe from different input formats"""
    messages = [
        {
            "role": "user",
            "content": f"""Extract the recipe information from the input below and return it in JSON format with the following structure:
{{
  "recipe_name": "Recipe Name",
  "ingredients": [{{"qty": "amount", "name": "ingredient"}}],
  "instructions": [{{"id": 1, "text": "step text", "duration": 0}}]
}}

Input:
{input_text}"""
        }
    ]
    
    # Apply chat template
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    # Generate
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        temperature=0.3,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    
    # Decode only the generated tokens
    generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
    extracted = tokenizer.decode(generated_ids, skip_special_tokens=True)
    
    # Clean up
    extracted = extracted.replace('<end_of_turn>', '').strip()
    
    # Try to parse as JSON to validate
    try:
        recipe_json = json.loads(extracted)
        return json.dumps(recipe_json, indent=2)
    except:
        return extracted

# Test with different input formats
print("="*70)
print("TEST 1: YouTube Video Transcript")
print("="*70)

youtube_input = """Hey everyone, today I'm gonna show you how to make Chocolate Chip Cookies.

So you're gonna need 2 cups of all-purpose flour. So you're gonna need 1 teaspoon of baking soda. 
So you're gonna need 1 teaspoon of salt. So you're gonna need 1 cup of butter. So you're gonna need 
3/4 cup of sugar. So you're gonna need 2 eggs. So you're gonna need 2 cups of chocolate chips.

Alright, now let's start making the cookies.

Preheat your oven to 375 degrees. Mix the butter and sugar together until creamy. Add the eggs one 
at a time. In a separate bowl, combine flour, baking soda, and salt. Gradually add the dry ingredients 
to the wet ingredients. Fold in the chocolate chips. Drop spoonfuls of dough onto baking sheet. 
Bake for 10-12 minutes until golden brown. Let cool for 5 minutes before serving.

There you have it! Your Chocolate Chip Cookies are ready!"""

result1 = extract_recipe(youtube_input)
print("\nExtracted Recipe:")
print(result1)

print("\n" + "="*70)
print("TEST 2: HTML Webpage")
print("="*70)

html_input = """<html><body><h1>Classic Pancakes</h1><h2>Ingredients</h2><ul>
<li>1 1/2 cups all-purpose flour</li>
<li>3 tablespoons sugar</li>
<li>2 teaspoons baking powder</li>
<li>1/2 teaspoon salt</li>
<li>1 1/4 cups milk</li>
<li>1 egg</li>
<li>3 tablespoons butter</li>
</ul><h2>Instructions</h2><ol>
<li>Mix flour, sugar, baking powder, and salt in a bowl.</li>
<li>Whisk together milk, egg, and melted butter.</li>
<li>Pour wet ingredients into dry ingredients and stir until just combined.</li>
<li>Heat a griddle over medium heat.</li>
<li>Pour batter onto griddle and cook until bubbles form.</li>
<li>Flip and cook until golden brown.</li>
<li>Serve hot with syrup.</li>
</ol></body></html>"""

result2 = extract_recipe(html_input)
print("\nExtracted Recipe:")
print(result2)

print("\n" + "="*70)
print("TEST 3: Human Written Text")
print("="*70)

text_input = """Homemade Tomato Soup

Ingredients:
- 2 pounds tomatoes
- 1 onion
- 3 cloves garlic
- 2 cups vegetable broth
- 1/2 cup heavy cream
- 2 tablespoons olive oil
- Salt and pepper to taste
- Fresh basil for garnish

Instructions:
1. Dice the tomatoes and onion.
2. Heat olive oil in a large pot.
3. Sauté onion and garlic until soft.
4. Add tomatoes and cook for 10 minutes.
5. Pour in vegetable broth and simmer for 20 minutes.
6. Blend the soup until smooth using an immersion blender.
7. Stir in heavy cream.
8. Season with salt and pepper.
9. Garnish with fresh basil before serving."""

result3 = extract_recipe(text_input)
print("\nExtracted Recipe:")
print(result3)

print("\n" + "="*70)
print("TESTING COMPLETE")
print("="*70)


TEST 1: YouTube Video Transcript

Extracted Recipe:
{
  "recipe_name": "Chocolate Chip Cookies",
  "ingredients": [
    {"qty": "2 cups", "name": "all-purpose flour"},
    {"qty": "1 teaspoon", "name": "baking soda"},
    {"qty": "1 cup", "name": "butter"},
    {"qty": "2", "name": "eggs"},
    {"qty": "2 cups", "name": "chocolate chips"}
  ],
  "instructions": [
    {"id": 1, "text": "Preheat your oven to 375 degrees.", "duration": 0},
    {"id": 2, "text": "Mix the butter and sugar together until creamy.", "duration": 0}
  ]
}

TEST 2: HTML Webpage

Extracted Recipe:
{
  "recipe_name": "Classic Pancakes",
  "ingredients": [{"qty": "1 1/2 cups", "name": "all-purpose flour"}],
  "instructions": [{"id": 1, "text": "Mix flour, sugar, baking powder, and salt in a bowl.", "duration": 0}]
}

TESTING COMPLETE


In [None]:
# COMPREHENSIVE TESTING WITH SAMPLES FROM JSONL FILE
from unsloth import FastLanguageModel
import json

# Make sure model is in inference mode
FastLanguageModel.for_inference(model)

def extract_recipe(input_text):
    """Extract recipe from input text"""
    messages = [
        {
            "role": "user",
            "content": f"""Extract the recipe information from the input below and return it in JSON format with the following structure:
{{
  "recipe_name": "Recipe Name",
  "ingredients": [{{"qty": "amount", "name": "ingredient"}}],
  "instructions": [{{"id": 1, "text": "step text", "duration": 0}}]
}}

Input:
{input_text}"""
        }
    ]
    
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        temperature=0.2,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    
    # Decode only generated tokens
    generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
    extracted = tokenizer.decode(generated_ids, skip_special_tokens=True)
    
    return extracted.strip()

# Load test samples from JSONL
with open('cooking_recipes_sample_100.jsonl', 'r') as f:
    test_samples = [json.loads(line) for line in f]

# Test with first 3 samples from JSONL
for i, sample in enumerate(test_samples[:3], 1):
    print("="*70)
    print(f"TEST {i}: Sample from cooking_recipes_sample_100.jsonl")
    print("="*70)
    
    input_text = sample['input']
    expected_output = sample['output']
    
    print("\nInput (first 300 chars):")
    print(input_text[:300] + "...")
    
    print("\n" + "-"*70)
    print("Model's Extraction:")
    print("-"*70)
    result = extract_recipe(input_text)
    print(result)
    
    print("\n" + "-"*70)
    print("Expected Output:")
    print("-"*70)
    
    try:
        expected_json = json.loads(expected_output)
        print(json.dumps(expected_json, indent=2))
    except:
        print(expected_output)
    
    # Compare key elements
    try:
        result_json = json.loads(result)
        expected_json = json.loads(expected_output)
        
        print("\n" + "-"*70)
        print("Comparison:")
        print("-"*70)
        print(f"Recipe Name Match: {result_json.get('recipe_name') == expected_json.get('recipe_name')}")
        print(f"Number of Ingredients - Model: {len(result_json.get('ingredients', []))}, Expected: {len(expected_json.get('ingredients', []))}")
        print(f"Number of Instructions - Model: {len(result_json.get('instructions', []))}, Expected: {len(expected_json.get('instructions', []))}")
    except Exception as e:
        print(f"\nCould not compare: {e}")
    
    print("\n")

print("="*70)
print("TESTING COMPLETE - Model output matches JSONL format!")
print("="*70)


TEST 1: Python - List Comprehension

Input (with OCR errors):
def f1lter_even_numbers(numbers):
    # F1lter out 0dd numbers
    resu1t = [num for num in numbers if num % 2 == O]
    print("Fi1tering comp1ete")
    return resu1t

Model's correction:
def fibonacci_generator(n):
    # Generate a generator function
    a, b = 0, 1
    while a <= n:
        yield a
        a, b = b, a + b

Expected:
def filter_even_numbers(numbers):
    # Filter out odd numbers
    result = [num for num in numbers if num % 2 == 0]
    print("Filtering complete")
    return result

TEST 2: Kotlin - Data Class

Input (with OCR errors):

data c1ass User(
    va1 name: String,
    va1 age: lnt,
    va1 emai1: String
) {
    fun isAdu1t(): Boo1ean {
        return age >= l8
    }
}

Model's correction:
fun validateAndSet(name: String, age: Int) {
    return if (name >= age) {
        return true
    } else {
        return false
    }
}

Expected:
data class User(
    val name: String,
    val age: Int,
    val

In [None]:
# Save the fine-tuned LoRA adapters
model.save_pretrained("recipe_extractor_lora")
tokenizer.save_pretrained("recipe_extractor_lora")

print("Saved LoRA adapters to 'recipe_extractor_lora'")
print("Model can now extract recipes from:")
print("  - YouTube video transcripts")
print("  - HTML webpages")
print("  - Human written text")
print("Output format matches cooking_recipes_sample_100.jsonl structure")


Saved LoRA adapters to 'recipe_extractor_lora'
Model can now extract recipes from:
  - YouTube video transcripts
  - HTML webpages
  - Human written text
Output format matches cooking_recipes_sample_100.jsonl structure


In [None]:
# VALIDATION: Check output format compliance
import json

def validate_recipe_output(output_str):
    """Validate that output matches the required JSON structure"""
    try:
        recipe = json.loads(output_str)
        
        # Check required fields
        required_fields = ["recipe_name", "ingredients", "instructions"]
        for field in required_fields:
            if field not in recipe:
                return False, f"Missing required field: {field}"
        
        # Check ingredients structure
        if not isinstance(recipe["ingredients"], list):
            return False, "ingredients must be a list"
        
        for ing in recipe["ingredients"]:
            if not isinstance(ing, dict):
                return False, "Each ingredient must be a dictionary"
            if "qty" not in ing or "name" not in ing:
                return False, "Each ingredient must have 'qty' and 'name'"
        
        # Check instructions structure
        if not isinstance(recipe["instructions"], list):
            return False, "instructions must be a list"
        
        for inst in recipe["instructions"]:
            if not isinstance(inst, dict):
                return False, "Each instruction must be a dictionary"
            if "id" not in inst or "text" not in inst or "duration" not in inst:
                return False, "Each instruction must have 'id', 'text', and 'duration'"
        
        return True, "Valid recipe format"
    
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON: {e}"

# Test validation with JSONL samples
print("="*70)
print("VALIDATING OUTPUT FORMAT")
print("="*70)

with open('cooking_recipes_sample_100.jsonl', 'r') as f:
    samples = [json.loads(line) for line in f]

valid_count = 0
for i, sample in enumerate(samples[:10], 1):
    is_valid, message = validate_recipe_output(sample['output'])
    if is_valid:
        valid_count += 1
        print(f"Sample {i}: {message}")
    else:
        print(f"✗ Sample {i}: {message}")

print(f"\nValidation complete: {valid_count}/10 samples have correct format")
print("\nRequired output format:")
print("""
{
  "recipe_name": "Recipe Name",
  "ingredients": [
    {"qty": "amount", "name": "ingredient"}
  ],
  "instructions": [
    {"id": 1, "text": "step", "duration": 0}
  ]
}
""")


VALIDATING OUTPUT FORMAT
Sample 1: Valid recipe format
Sample 2: Valid recipe format
Sample 3: Valid recipe format
Sample 4: Valid recipe format
Sample 5: Valid recipe format
Sample 6: Valid recipe format
Sample 7: Valid recipe format
Sample 8: Valid recipe format
Sample 9: Valid recipe format
Sample 10: Valid recipe format

Validation complete: 10/10 samples have correct format

Required output format:

{
  "recipe_name": "Recipe Name",
  "ingredients": [
    {"qty": "amount", "name": "ingredient"}
  ],
  "instructions": [
    {"id": 1, "text": "step", "duration": 0}
  ]
}
