In [5]:
%pip install transformers[torch]

Collecting accelerate>=0.26.0 (from transformers[torch])
  Using cached accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.11.0-py3-none-any.whl (375 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.11.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

generator = pipeline("text-generation", model="gpt2")

def flatten_conversation(messages):
    prompt = ""
    for m in messages:
        if m["role"] == "user":
            prompt += f"User: {m['content']}\n"
        elif m["role"] == "assistant":
            prompt += f"Assistant: {m['content']}\n"
    if not messages or messages[-1]["role"] != "assistant":
        prompt += "Assistant:"
    return prompt

def run_baseline_inference(input_file, output_file):
    print(f"Running baseline inference on {input_file}...")
    with open(input_file, "r", encoding='utf-8') as f_in, open(output_file, "w", encoding='utf-8') as f_out:
        for line in f_in:
            example = json.loads(line)
            prompt = ""
            result_base = {}

            # multi turn format (has "messages")
            if "messages" in example and example["messages"]:
                messages = example["messages"]
                # Use the function to format the prompt
                prompt = flatten_conversation(messages[:-1]) 
                result_base = {
                    "messages": messages,
                    "reference_output": messages[-1]["content"],
                }

            # single turn format (has "instruction")
            elif "instruction" in example:
                instruction = example["instruction"]
                inp = example.get("input", "")
                
                prompt = instruction if not inp else f"{instruction}\n{inp}"
                result_base = {
                    "instruction": instruction,
                    "input": inp,
                    "reference_output": example.get("output", ""),
                }
            
            else:
                continue

            if "constraints" in example:
                result_base["constraints"] = example["constraints"]
            if "evidence_ids" in example:
                result_base["evidence_ids"] = example["evidence_ids"]

            generations = generator(
                prompt,
                max_new_tokens=100,
                num_return_sequences=1,
                do_sample=True,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                eos_token_id=generator.tokenizer.eos_token_id,
                pad_token_id=generator.tokenizer.eos_token_id
            )
            
            model_output = generations[0]["generated_text"][len(prompt):].strip()

            result = result_base
            result["model_output"] = model_output
            f_out.write(json.dumps(result, ensure_ascii=False) + "\n")
            
    print(f"Inference complete. Results saved to {output_file}\n")

run_baseline_inference(
    "single_turn_test.jsonl", 
    "baseline_single_turn_outputs.jsonl"
)
run_baseline_inference(
    "multi_turn_test.jsonl", 
    "baseline_multi_turn_outputs.jsonl"
)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


Running baseline inference on single_turn_test.jsonl...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Inference complete. Results saved to baseline_single_turn_outputs.jsonl

Running baseline inference on multi_turn_test.jsonl...
Inference complete. Results saved to baseline_multi_turn_outputs.jsonl



In [2]:
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

print("Loading datasets separately...")

st_train_ds = load_dataset('json', data_files="single_turn_train.jsonl", split="train")
mt_train_ds = load_dataset('json', data_files="multi_turn_train.jsonl", split="train")

st_val_ds = load_dataset('json', data_files="single_turn_val.jsonl", split="train")
mt_val_ds = load_dataset('json', data_files="multi_turn_val.jsonl", split="train")

print("Concatenating datasets...")
train_ds = concatenate_datasets([st_train_ds, mt_train_ds])
validation_ds = concatenate_datasets([st_val_ds, mt_val_ds])

datasets = DatasetDict({
    'train': train_ds,
    'validation': validation_ds
})

print(f"Loaded and combined dataset:")
print(datasets)
def preprocess_function(examples):
    prompts = []
    full_texts = []
    
    for i in range(len(examples['instruction'])):
        prompt_part = ""
        response_content = ""

        # multi-turn (messages is not None)
        if examples['messages'][i] is not None:
            messages = examples['messages'][i]
            if not messages: continue
            prompt_messages = messages[:-1]
            response_content = messages[-1]['content']
            prompt_part = flatten_conversation(prompt_messages)
        
        # single turn (instruction is not None)
        elif examples['instruction'][i] is not None:
            instruction = examples['instruction'][i]
            inp = examples['input'][i] if examples['input'][i] else ""
            output = examples['output'][i]
            
            prompt_content = instruction
            if inp:
                prompt_content += f"\n{inp}"
            
            prompt_part = f"User: {prompt_content}\nAssistant: "
            response_content = output
        
        else:
            continue 
            
        full_text = prompt_part + response_content + tokenizer.eos_token
        prompts.append(prompt_part)
        full_texts.append(full_text)

    model_inputs = tokenizer(
        full_texts, max_length=256, truncation=True, padding="max_length"
    )
    prompt_token_lengths = [
        len(tokenizer(p, max_length=256, truncation=True)["input_ids"]) 
        for p in prompts
    ]
    labels = [list(row) for row in model_inputs["input_ids"]]
    for i in range(len(labels)):
        prompt_len = prompt_token_lengths[i]
        actual_prompt_len = min(prompt_len, len(labels[i]))
        labels[i][:actual_prompt_len] = [-100] * actual_prompt_len
    
    model_inputs["labels"] = labels
    return model_inputs

print("Tokenizing combined dataset...")
tokenized_datasets = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=datasets["train"].column_names 
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-combined",
    overwrite_output_dir=True,
    num_train_epochs=3.0,                   
    per_device_train_batch_size=4,          
    per_device_eval_batch_size=4,
    learning_rate=5e-5,                     
    weight_decay=0.01,
    logging_dir='./logs-combined',
    logging_steps=100,                      
    eval_strategy="steps",                  
    eval_steps=200,
    save_steps=200,                         
    save_total_limit=2,                     
    fp16=torch.cuda.is_available(),         
    report_to="none"  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("Starting combined fine-tuning...")
trainer.train()

final_model_path = "./gpt2-finetuned-combined-final"
print(f"Training complete. Saving model to {final_model_path}")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print("Done")

Loading datasets separately...
Concatenating datasets...
Loaded and combined dataset:
DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'constraints', 'evidence_ids', 'messages'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output', 'constraints', 'evidence_ids', 'messages'],
        num_rows: 20
    })
})
Tokenizing combined dataset...


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Starting combined fine-tuning...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Training complete. Saving model to ./gpt2-finetuned-combined-final
Done


In [3]:
FINETUNED_MODEL_PATH = "./gpt2-finetuned-combined-final" 

print(f"Loading fine-tuned model from: {FINETUNED_MODEL_PATH}")
tokenizer = GPT2Tokenizer.from_pretrained(FINETUNED_MODEL_PATH)
generator = pipeline(
    "text-generation",
    model=FINETUNED_MODEL_PATH,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)
print("Model loaded.")

def run_finetuned_inference(input_file, output_file):
    print(f"Running fine-tuned inference on {input_file}...")
    with open(input_file, "r", encoding='utf-8') as f_in, open(output_file, "w", encoding='utf-8') as f_out:
        for line in f_in:
            example = json.loads(line)
            prompt = ""
            result_base = {}
            
            # Case 1: Multi-turn format (has "messages")
            if "messages" in example and example["messages"]:
                messages = example["messages"]
                prompt = flatten_conversation(messages[:-1]) 
                result_base = {
                    "messages": messages,
                    "reference_output": messages[-1]["content"],
                }

            # Case 2: Single-turn format (has "instruction")
            elif "instruction" in example:
                instruction = example["instruction"]
                inp = example.get("input", "")
                
                prompt_content = instruction if not inp else f"{instruction}\n{inp}"
                prompt = f"User: {prompt_content}\nAssistant:"
                result_base = {
                    "instruction": instruction,
                    "input": inp,
                    "reference_output": example.get("output", ""),
                }
            
            else:
                continue

            if "constraints" in example:
                result_base["constraints"] = example["constraints"]
            if "evidence_ids" in example:
                result_base["evidence_ids"] = example["evidence_ids"]

            generations = generator(
                prompt,
                max_new_tokens=100,
                num_return_sequences=1,
                do_sample=True,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id
            )
            
            model_output = generations[0]["generated_text"][len(prompt):].strip()
            
            result = result_base
            result["model_output"] = model_output
            f_out.write(json.dumps(result, ensure_ascii=False) + "\n")
            
    print(f"Inference complete. Results saved to {output_file}\n")


run_finetuned_inference(
    "single_turn_test.jsonl", 
    "combined_finetuned_single_turn_outputs.jsonl"
)

run_finetuned_inference(
    "multi_turn_test.jsonl", 
    "combined_finetuned_multi_turn_outputs.jsonl"
)

print("\nAll fine-tuned (combined model) inference complete.")

Device set to use cuda:0


Loading fine-tuned model from: ./gpt2-finetuned-combined-final
Model loaded.
Running fine-tuned inference on single_turn_test.jsonl...
Inference complete. Results saved to combined_finetuned_single_turn_outputs.jsonl

Running fine-tuned inference on multi_turn_test.jsonl...
Inference complete. Results saved to combined_finetuned_multi_turn_outputs.jsonl


All fine-tuned (combined model) inference complete.


In [4]:
from IPython.display import display
import pandas as pd
import json 

def compare_outputs(baseline_file, finetuned_file, num_examples=10):
    baseline_data = []
    finetuned_data = []

    with open(baseline_file, "r", encoding='utf-8') as f:
        for line in f:
            baseline_data.append(json.loads(line))

    with open(finetuned_file, "r", encoding='utf-8') as f:
        for line in f:
            finetuned_data.append(json.loads(line))

    comparisons = []
    for i in range(min(num_examples, len(baseline_data), len(finetuned_data))):
        baseline = baseline_data[i]
        finetuned = finetuned_data[i]
        
        # Handle single-turn format
        if "instruction" in baseline:
            instruction = baseline["instruction"]
            if baseline.get("input"):
                instruction += f"\nInput: {baseline['input']}"
        # Handle multi-turn format
        else:
            messages = baseline["messages"]
            instruction = "\n".join([f"{m['role']}: {m['content']}" for m in messages[:-1]])
        
        comparisons.append({
            "Example": i + 1,
            "Instruction": instruction,
            "Reference": baseline["reference_output"],
            "Baseline Output": baseline["model_output"],
            "Fine-tuned Output": finetuned["model_output"]
        })
    
    df = pd.DataFrame(comparisons)
    
    styled_df = df.style.set_properties(**{
        'text-align': 'left',
        'white-space': 'pre-wrap',
        'word-wrap': 'break-word',
        'max-width': '300px'
    }).set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center'), ('font-weight', 'bold')]},\
        {'selector': 'td', 'props': [('padding', '10px'), ('border', '1px solid #ddd')]}
    ])
    
    return styled_df

print("\n\n DETAILED SINGLE-TURN COMPARISON\n")
display(compare_outputs(
    "baseline_single_turn_outputs.jsonl",
    "combined_finetuned_single_turn_outputs.jsonl",
    num_examples=5
))

print("\n\n DETAILED MULTI-TURN COMPARISON\n")
display(compare_outputs(
    "baseline_multi_turn_outputs.jsonl",
    "combined_finetuned_multi_turn_outputs.jsonl",
    num_examples=5
))



 DETAILED SINGLE-TURN COMPARISON



Unnamed: 0,Example,Instruction,Reference,Baseline Output,Fine-tuned Output
0,1,Find four highly-rated gluten-free recipes with at least 5.0 stars and under 20 minutes.,1) Cottage-Cheese Stuffed Tomatoes—10 min; 2) Nutritious Food Enhancer—15 min; 3) Chili Lime BBQ Mangoes—15 min; 4) No-Bake Low-Carb Protein Bars—5 min,[image 1] [/img][/size],"1) Pumpkin Spice Cookies With Cream—39 min; 2). Szechuan Couscous Cheese (Chickpea)—5 hr 45 sec; 3); 4.) Cinnamon Toast Crunch Bars, Fresh & Raw Vegan Honeyed Peas in a Custard Sauce.—3 hrs 35 mins.; 6): Moroccan Red Onion Quiche Bake for Gluten Free Kids! Perfectly Spiced Pineapple Chives Filled Topping The Ingredients Under 230 kcal Author : Jennifer Binder Nutrition Information Yields"
1,2,Find four high-protein lunchs over 34g protein in under 35 minutes.,1) Awesome Steamed Cheeseburgers!—46.0 g protein—16 min; 2) Vegetarian Sandwich with Herbed Cream Cheese and Guacamole—40.5 g protein—10 min; 3) Barbecued Buffalo Wing Dip With a Twist—195.9 g protein—25 min; 4) Aussie Tuna Summer Salad—47.4 g protein—16 min,"In the first, you'll be given a 30 minute rest and then get your meal to work on 2nd Wednesday of each month for 4 weeks! This will allow it to take care not to eat too much or that morning afterwork food can come out quickly – especially when they're so hungry we don't have time (which is why this diet works). Once full again I'd recommend having 3 meals per day: 1 breakfast with pasta & potatoes; another one using breadsticks/cups","1) Feta Mackerel (Majesto)—40 min; 2—10 g carbs, 4–6 puffs.—4 mg sodium and 0% potassium.; 3). Thai Dessert for the Apple Potluck! Easy Bake With Sesame Seeds & Almonds And Goat Cheese Wraps!! Yes No Instructions Vegan or gluten free Allergy Warning — Under 119 kcal with less than 17mg sugar required.[/user]"
2,3,Find four low-carb dinners under 12g total carbohydrates with at least 34g protein.,1) Egyptian Red Snapper in Red Pepper Mint Sauce—60.1 g protein; 2) Mustard-Grilled Scandinavian Salmon—34.9 g protein; 3) Moroccan-Spiced Tuna—53.4 g protein; 4) Melodee's Hot Wings—207.7 g protein,"Dietary Approaches for Fasting: The following methods have been used in fasting or long term studies to produce a high percentage of lean muscle mass (i%E1): 1) NUTRITION & CHOICE, 2), PROTEIN, 3). For the first two diets that were considered ""fasting"", there was no need not only for blood glucose tests but also fat oxidation and lipids synthesis as well before supplementation by any foods besides fruits/proteins such As","1) Roast Chicken—494 g Protein; 2,146 min.; 3 ""Dirty Homemade Low Carb Chocolate Chip Cookies"" (Low Fat)—15 mg Fiber —5 minutes; 4). Oatmeal Crust Desserts With Cocoa Nuts and Cranberries—""Coconut Milk Pancakes""—27.8 G Total Carbs 1747 kcal.—1 hr 16 sec; 5.) Undercooked Pork Wings in Coconut Sandwiches Cake Sauce/Pumpkin Pie Mix,"""
3,4,Find four highly-rated vegan recipes with at least 5.0 stars and under 30 minutes.,1) Tomato Lentil Stew—25 min; 2) Vegan Waldorf Salad—20 min; 3) Quick Elephant Ears—10 min; 4) Summer Fruit & Vegetable Medley—18 min,"This post may contain links to Amazon or other partners; your purchases via these merchants are influenced by Skimlinks.""","1) Shrimp Omelette—8 min; 2). Chunky Italian Tenderloin With Cream Cheese & Mint Sauce (Amber's Cheddar)—10 g protein, less than 24g carbs; 3.) Smoked Salmon Stews! Seasoned Fish for the Summer —35 ml sodium buffer recommended; 4): The Big Easy Dessert Cake Bake Recipe Ever Crispy? Yes No Weight in grams 25 mg carbohydrate Carbohydrate Per Serving 0–13 wt Avg Protein 763 kcal"
4,5,Find four highly-rated vegetarian recipes with at least 4.0 stars and under 30 minutes.,"1) Super-Quick Brown Rice With Asparagus, Chickpeas, and Almonds—15 min; 2) The Best Cranberry Chutney—15 min; 3) Lychees & Ice Cream (With and Without Frangelico or Amaretto—5 min; 4) New-Fashioned Apple and Raisin Slaw (From Cooking Light)—10 min","The recipe will be ready in 15 to 40 minutes, so you can also prepare it on your own as a side dish or just enjoy eating the vegan food without worrying about not being able go back for more!","1) Grilled Onion Chops—25 min; 2). Gingerbread Cabbage (Vegan, No Calorie)—20 secs; 3); Thai Vegetarian Pita Salad With Tomato Avocado & Green Peppercorn Tarragon Sauce or Blackberry Lime Juice.—30 sb total time between prepandry/dinner breakfasts;4)(Ages 11+) Vegan Chicken Breast Soup—""6"" Mozzarella Cheese Risotto Sandwich""—17 fds net carbs (−"




 DETAILED MULTI-TURN COMPARISON



Unnamed: 0,Example,Instruction,Reference,Baseline Output,Fine-tuned Output
0,1,"user: I need lunch ideas. assistant: What's your time constraint and protein goal? user: Under 55 minutes, at least 17g.",1) Crispy Chicken Fingers with Spicy Honey Mustard—62.8 g protein—25 min; 2) California Avocado Monte Cristo—22.5 g protein—18 min; 3) Chicken Burrito Bundles—33.4 g protein—30 min; 4) Meatloaf Sandwiches Without the Loaf—34.2 g protein—40 min,Where do you work for people to find out what they're getting into when their body is eating the wrong way with too much fructose in it that makes them feel sick not just physically but psychologically as well! (See my blog post here). The last thing we want from a fat person are stomachs feeling like garbage or having some sort of pain-killer effect on us after consuming so little energy because our bodies have already made up all this sugar… How about someone who isn't skinny enough yet could,1) Chicken Soup—31 min; 2). Chunky Garlic-Roasted Peppers with Lime Dressing (Orange Juice)—26 g total carbs (−10 mg sodium); 3.) Apple Pie With Honey Salsa —18 pct in under 45 seconds; 4): Broccoli Curry Wrapped In Coconut Cream Sauce…29 mpg kcal−16 s—-0 hrs dia.—5–6 wk SDL2/4 hr.; 5)/In case you haven't noticed by
1,2,user: Find lunch ideas. assistant: Are you looking for low-carb options? user: Yes — under 12g carbs and at least 25g protein.,1) Crunchy Tuna Walnut Salad—25.0 g protein—4.9 g carbs; 2) Whole Foods' Tarragon Chicken Salad—26.1 g protein—8.0 g carbs; 3) Hackney's Inside-Out Burger—59.7 g protein—0.8 g carbs; 4) Appetizer or Snack Party Tray—27.7 g protein—7.2 g carbs,"Is there anything new that can help with your ketogenic diet? [sigh] And if so, what is it about carbohydrate or fat in general.? If not carbohydrates then we need to be more specific on how many calories each meal should contain. That would mean a lower number of grams per day plus lots less energy expenditure (see here ). Would carbohydrate intake have any effect when trying to achieve the same results as other foods by just eating an extra 3% gbs/day instead?? You","1) Chicken Parmesan With Basil—28 g total carbohydrates; 2). Spicy Thai Peanut Butter Cookies (Mushroom Crab)—34 moles sodium, 17 mg potassium; 3); Asian Fried Tuna Salad with Fresh Cilantro & Spinach Omelette.—14 fts carbohydrate reduction or less than 4 min time limit; fourteen minutes.; 4.) White Chocolate Coconut Mousse Mixed Creamy Curry Fettuccine Rice Sandwich Bites Easy Roast Pork Chop in French R"
2,3,"user: I need breakfast ideas. assistant: What's your time constraint and carbs goal? user: Under 55 minutes, under 24g.","1) Campbelled Eggs—7.1 g carbs—10 min; 2) Asparagus, Mushroom and Cheese Omelet With Herbs—3.7 g carbs—15 min; 3) Corsican Omelette—4.0 g carbs—6 min; 4) Vegan Rice Pudding—11.6 g carbs—15 min","How much does it cost to eat lunch a week in the U States compared with what you do at home (i.e., my typical day)? I'm on 5pm-6am EST every night during normal hours of 3AM - 2PM PST so there are no weekends off unless that is necessary for work or school! All tips used herein should be considered unofficial as these may vary by location but include any applicable information including when/where food was purchased from which store(s) please",1) Chicken Broccoli Salad—35 min; 2). Chocolate Chip Cookie Dough Sandwich With Peanut Butter & Jelly Crust (Vegan)—25 kcal.; 3); Banana Bread Bites with Cheddar Cheese Parmesan Salsa Tenderloin Roast Beef Steak-In Soup! Easy Vegan Version 1.4 g protein in less than 12 seconds — 2520 mg sodium • 4.) Strawberry Cheesecake Cream Pie Bake Recipe for Low Carb Breakfast Pretzelies or Ketchup Cookies**
3,4,"user: I want to make soup. assistant: Any dietary restrictions or sodium concerns? user: Yes, low sodium under 576 mg and vegetarian.",1) Zesty Mexican Tomato Soup—519.1 mg sodium; 2) Garlic Stock for Soups—129.2 mg sodium; 3) Lebanese Adas Bis Silq (Lentil & Swisschard Soup)—218.4 mg sodium; 4) Moroccan Harira Soup—182.0 mg sodium,"What is the actual nutritional value of this product/food (i.e., calcium)? Is it more beneficial than food grade organic protein etc.? The amount you use depends on your needs for specific nutrients through blood sugar control. Are there any other ways that they are utilized throughout their development? These include a number-crunching approach involving an assessment with family members as well as medical professionals who examine each individual patient at every visit based upon how much risk he poses in terms' physical health...","1) Garlic-Dipped Chicken—73! (28 g protein)—17 min; 2). Asian Shrimp Soup with Cucumber Cornbread Spreads & Almonds""—35 mL water/25 minutes; 3] Keto Lemonade With Carrot Parmesan Sauce on Top—60–90 G Protein Content.—3 h 45 sec; 4); Thai Style Salsa And Feta Roasted Red Velvet Lime Dressing —0 s 25 μm glucose tolerance limit ± 1 hr BP"
4,5,user: Find lunch ideas. assistant: Are you looking for low-carb options? user: Yes — under 18g carbs and at least 34g protein.,1) Grilled Blue Cheese Burgers—37.1 g protein—2.1 g carbs; 2) Barbecued Lobster Tails—58.6 g protein—8.3 g carbs; 3) Sandra Lee's Beef Kebabs—48.1 g protein—4.0 g carbs; 4) Any-Way-You-Want 'em Burgers—36.5 g protein—7.5 g carbs,"Have we mentioned that it's okay to eat less than 30% of your daily calories from carbohydrates in the morning, but if not eating a lot then don't have enough time on my end or I'll be late. Can someone tell me more about this? Email [email protected] If all else fails please send us an email with comments explaining why they're bad choices","1) Broccoli Shrimp—38 g Protein; 2 ) Pineapple Crisps With Cream Sauce (Pine Barbecue)—32 mg Carbohydrate, 17 min.; 3/4 Cup Chicken Breast Brunch with Cabbage And Basil Soup Mixing Lime Juice For Two Spices! Gluten Free & Low Cal Diet!""Content Warning–Reduced sodium is recommended.–564 kcal.—037 s.""Vegetarian""--65% less saturated fat than average intake of 2098mg cholesterol"


In [5]:
from collections import Counter
from typing import Dict

class SimpleEvaluator:
    
    def calculate_bleu(self, reference: str, hypothesis: str) -> float:
        """Simple unigram BLEU score"""
        ref_tokens = reference.lower().split()
        hyp_tokens = hypothesis.lower().split()
        
        if not hyp_tokens:
            return 0.0
        
        ref_counts = Counter(ref_tokens)
        hyp_counts = Counter(hyp_tokens)
        
        # Count matches
        matches = sum((hyp_counts & ref_counts).values())
        precision = matches / len(hyp_tokens) if hyp_tokens else 0.0
        
        # Brevity penalty
        bp = min(1.0, len(hyp_tokens) / len(ref_tokens)) if ref_tokens else 0.0
        
        return precision * bp
    
    def evaluate_file(self, output_file: str, is_multi_turn: bool = False) -> Dict:
        """Evaluate all examples in a file"""
        bleu_scores = []
        total = 0
        valid = 0
        
        with open(output_file, 'r') as f:
            for line in f:
                example = json.loads(line)
                total += 1
                model_output = example.get('model_output', '').strip()
        
                reference_output = example.get('reference_output', '').strip()
                
                if not model_output or not reference_output:
                    continue
                
                valid += 1
                bleu = self.calculate_bleu(reference_output, model_output)
                bleu_scores.append(bleu)
        
        return {
            'total_examples': total,
            'valid_outputs': valid,
            'avg_bleu': sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
        }

def compare_baseline_vs_finetuned():
    """Compare baseline and fine-tuned model outputs"""
    evaluator = SimpleEvaluator()
    
    print("=== EVALUATING SINGLE-TURN ===")
    baseline_st = evaluator.evaluate_file('baseline_single_turn_outputs.jsonl')
    finetuned_st = evaluator.evaluate_file('combined_finetuned_single_turn_outputs.jsonl')
    
    print(f"Baseline   - BLEU: {baseline_st['avg_bleu']:.3f} ({baseline_st['valid_outputs']}/{baseline_st['total_examples']} examples)")
    print(f"Fine-tuned - BLEU: {finetuned_st['avg_bleu']:.3f} ({finetuned_st['valid_outputs']}/{finetuned_st['total_examples']} examples)")
    
    if baseline_st['avg_bleu'] > 0:
        st_improvement = ((finetuned_st['avg_bleu'] - baseline_st['avg_bleu']) / baseline_st['avg_bleu']) * 100
        print(f"Improvement: {st_improvement:+.1f}%")
    
    print("\n=== EVALUATING MULTI-TURN ===")
    baseline_mt = evaluator.evaluate_file('baseline_multi_turn_outputs.jsonl')
    finetuned_mt = evaluator.evaluate_file('combined_finetuned_multi_turn_outputs.jsonl')
    
    print(f"Baseline   - BLEU: {baseline_mt['avg_bleu']:.3f} ({baseline_mt['valid_outputs']}/{baseline_mt['total_examples']} examples)")
    print(f"Fine-tuned - BLEU: {finetuned_mt['avg_bleu']:.3f} ({finetuned_mt['valid_outputs']}/{finetuned_mt['total_examples']} examples)")
    
    if baseline_mt['avg_bleu'] > 0:
        mt_improvement = ((finetuned_mt['avg_bleu'] - baseline_mt['avg_bleu']) / baseline_mt['avg_bleu']) * 100
        print(f"Improvement: {mt_improvement:+.1f}%")

compare_baseline_vs_finetuned()

=== EVALUATING SINGLE-TURN ===
Baseline   - BLEU: 0.019 (10/10 examples)
Fine-tuned - BLEU: 0.077 (10/10 examples)
Improvement: +310.2%

=== EVALUATING MULTI-TURN ===
Baseline   - BLEU: 0.010 (10/10 examples)
Fine-tuned - BLEU: 0.062 (10/10 examples)
Improvement: +496.3%
