In [6]:
!pip install -U \
  transformers \
  accelerate \
  peft \
  datasets \
  bitsandbytes \
  pandas \
  torch \
  rouge_score \
  numpy

Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12

In [None]:
import torch
import json
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
import pandas as pd
import gc

import wandb

print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))


CUDA available: True
GPU: Tesla T4


In [None]:
# MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.use_cache = False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


In [None]:
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    inference_mode=False,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


In [1]:
from google.colab import files

print("Upload your annotated JSONL file:")
uploaded = files.upload()

# Get filename (usually 'annotated_risks.jsonl')
filename = list(uploaded.keys())[0]
print(f"Uploaded: {filename}")

def load_training_data(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data.append(json.loads(line.strip()))
            except json.JSONDecodeError as e:
                print(f"Error loading line: {e}")
    return data

train_data = load_training_data(filename)

df = pd.DataFrame(train_data)

df = df.sample(frac=1).reset_index(drop=True) # Shuffle to avoid bias.

# Create Dataset
dataset = Dataset.from_pandas(df)

# Split into train/validation (80/20)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
print(f"Train size: {len(split_dataset['train'])}")
print(f"Validation size: {len(split_dataset['test'])}")

# Extract held-out eval set for evalaution.
remainder_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Create train/val split
split_dataset = remainder_dataset.train_test_split(test_size=0.2, seed=42)
print(f"Train size: {len(split_dataset['train'])}")
print(f"Validation size: {len(split_dataset['test'])}")

train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]


Upload your annotated JSONL file:


IndexError: list index out of range

In [None]:

MAX_LENGTH = 2048
RESERVED_FOR_OUTPUT = 128

def tokenize_function(example):
    prompt = f"{example['instruction']} input: {example['input']} output: "

    # Completion text
    completion_text = example["output"] + tokenizer.eos_token

    prompt_ids = tokenizer(
        prompt,
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LENGTH - RESERVED_FOR_OUTPUT,
    )["input_ids"]

    # Tokenize completion
    completion_ids = tokenizer(
        completion_text,
        add_special_tokens=False,
        truncation=True,
        max_length=RESERVED_FOR_OUTPUT,
    )["input_ids"]

    # Combine
    input_ids = prompt_ids + completion_ids
    labels = [-100] * len(prompt_ids) + completion_ids

    # Handle padding
    pad_len = MAX_LENGTH - len(input_ids)
    if pad_len > 0:
        input_ids += [tokenizer.pad_token_id] * pad_len
        labels += [-100] * pad_len
        attention_mask = [1] * (len(input_ids) - pad_len) + [0] * pad_len
    else:
        # Truncate if too long
        input_ids = input_ids[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
        attention_mask = [1] * MAX_LENGTH

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask,
    }

tokenized_train = train_dataset.map(
    tokenize_function,
    remove_columns=train_dataset.column_names
)

tokenized_val = val_dataset.map(
    tokenize_function,
    remove_columns=val_dataset.column_names
)


Map:   0%|          | 0/661 [00:00<?, ? examples/s]

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    output_dir="./tinyllama-risk-extractor",
    num_train_epochs=30,  # Keep high but with aggressive early stopping
    per_device_train_batch_size=2,  # Reduced batch size
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,  # Effective batch size = 32
    warmup_ratio=0.05,  # Reduced warmup
    logging_steps=10,
    save_steps=25,
    eval_steps=25,
    eval_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    report_to="wandb",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    group_by_length=False,
    weight_decay=0.1,  # Increased for more regularization
    lr_scheduler_type="constant_with_warmup",  # Simpler scheduler
    max_grad_norm=0.5,
    seed=42,
    data_seed=42,
)

In [None]:
def causal_lm_collator(features):
    batch = tokenizer.pad(
        features,
        padding=True,
        return_tensors="pt",
    )

    # Ensure labels padded with -100
    labels = batch["labels"]
    labels[labels == tokenizer.pad_token_id] = -100
    batch["labels"] = labels

    return batch



In [None]:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,  # Stop after 2 evals without improvement
    early_stopping_threshold=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=causal_lm_collator,
    callbacks=[early_stopping],
)

torch.cuda.empty_cache()
gc.collect()


  trainer = Trainer(


282

In [None]:
print("Starting training...")
print(f"Training on {len(tokenized_train)} examples")
print(f"Validating on {len(tokenized_val)} examples")

# Start training
train_result = trainer.train()

# Save final model
trainer.save_model("./tinyllama-risk-extractor-final")
tokenizer.save_pretrained("./tinyllama-risk-extractor-final")

print("Training completed!")
print(f"Training metrics: {train_result.metrics}")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Starting training...
Training on 661 examples
Validating on 166 examples


[34m[1mwandb[0m: Currently logged in as: [33mmcnamacl[0m ([33mmcnamacl-trinity-college-dublin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
25,1.7051,1.022547
50,0.6854,0.738074
75,0.5787,0.692981
100,0.4489,0.670067
125,0.3512,0.717999


KeyboardInterrupt: 

In [None]:
import glob
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os

# List all checkpoints
checkpoints = sorted(glob.glob("./tinyllama-risk-extractor/checkpoint-*"))
print("Available checkpoints:")
for cp in checkpoints:
    step = cp.split("-")[-1]
    print(f"  - {cp} (step {step})")

# Load base model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T")
tokenizer.pad_token = tokenizer.eos_token

# Load a specific checkpoint (choose the one with lowest eval_loss)
def load_best_checkpoint(checkpoints):
    """Find and load checkpoint with lowest eval_loss"""
    best_checkpoint = None
    best_loss = float('inf')

    for cp in checkpoints:
        state_file = f"{cp}/trainer_state.json"
        if os.path.exists(state_file):
            with open(state_file, "r") as f:
                state = json.load(f)
                if "best_metric" in state and state["best_metric"] < best_loss:
                    best_loss = state["best_metric"]
                    best_checkpoint = cp

    if best_checkpoint:
        print(f"Best checkpoint: {best_checkpoint} (loss: {best_loss:.4f})")
        model = PeftModel.from_pretrained(base_model, best_checkpoint, inference_mode=False)
        return model
    else:
        # Load the first checkpoint as fallback
        print(f"Using first checkpoint: {checkpoints[0]}")
        return PeftModel.from_pretrained(base_model, checkpoints[0], inference_mode=False)

model = load_best_checkpoint(checkpoints)

Available checkpoints:
  - ./tinyllama-risk-extractor/checkpoint-100 (step 100)
  - ./tinyllama-risk-extractor/checkpoint-125 (step 125)
Best checkpoint: ./tinyllama-risk-extractor/checkpoint-100 (loss: 0.6701)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

from sentence_transformers import SentenceTransformer
from scipy import stats

import pandas as pd
import numpy as np
import torch, json, re, time, random
import torch.nn.functional as F

from rouge_score import rouge_scorer

base_model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

adapter_path = "LLM_Model_Test_v8_gretel"

# Set random seeds for reproducibility
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load base model
def load_base():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    return AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )

#  Load finetuned model
def load_trained():
    base = load_base()
    return PeftModel.from_pretrained(
        base,
        adapter_path,
        inference_mode=True,
    )

tokenizer = AutoTokenizer.from_pretrained(base_model_name, model_max_length=1848)
tokenizer.pad_token = tokenizer.eos_token

tokenizer.padding_side = "left"

sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Evaluations: accuracy, precision, recall, F1, semantic simliarity, ROUGE.

# Prepare dataset from CSV file
def prepare_csv_dataset(filename):
  dataset = pd.read_csv(filename)
  instruction = dataset.iloc[:,0]
  input = dataset.iloc[:,1]
  output = dataset.iloc[:,2]
  return instruction, input, output

# Generate model outputs for a batch of inputs
def ask_model(model, instruction, inputs):
  prompts = [f"{instruction} input: {input} output:" for input in inputs]
  input_tokens = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

  input_tokens = {k: v.to(model.device) for k, v in input_tokens.items()}

  with torch.no_grad(), torch.inference_mode():
      outputs = model.generate(
          **input_tokens,
          max_new_tokens=200,
          temperature=0.5,
          do_sample=True,
          top_k=50,
          top_p=0.95,
          repetition_penalty=1.5,
          num_beams=4,
          pad_token_id=tokenizer.pad_token_id,
          eos_token_id=tokenizer.eos_token_id,
      )

  decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
  return decoded

# Batch generate outputs from model
def generate_output(model, instruction, inputs):
    BATCH_SIZE = 4
    all_risks = []
    num_errors = 0

    for i in range(0, len(inputs), BATCH_SIZE):
        batch = inputs[i:i + BATCH_SIZE]
        decoded = ask_model(model, instruction, batch)
        for d in decoded:
          parsed_output, new_errors = parse_output(d)
          all_risks.extend(parsed_output)
          num_errors += new_errors

    return all_risks, num_errors

# Parse model output into structured format
def parse_output(decoded: str):
    risks = []
    num_errors = 0
    try:
        # Find the output part after "output: "
        if "output:" in decoded:
            generated = decoded.split("output:")[-1].strip()
        else:
            raise ValueError("Invalid format: No output listed.")

        generated_first_output = generated
        if re.search("\[(.*?)\[", generated):
          generated_first_output = re.search("\[(.*?)\[", generated)[0][:-1]

        # Split on "|" for categories and summary
        parts = generated_first_output.split("|", 1)
        if len(parts) != 2:
            raise ValueError("Invalid format: No '|' separator found.")

        categories_str = parts[0].strip()
        summary = parts[1].strip()

        # Parse categories as list (safe eval or json.loads)
        categories = json.loads(categories_str.replace("'", '"'))  # Convert single quotes to double for JSON

        risks.append({
            "categories": categories,
            "summary": summary,
            "output": generated_first_output,
        })
    except (json.JSONDecodeError, ValueError) as e:
        print(f"Parsing error: {e}")  # Log for debugging
        risks.append({
            "categories": ["INVALID"],
            "summary": "INVALID",
            "output": decoded,
        })
        num_errors+=1

    return risks, num_errors

def create_outputs(instruction, inputs):
  print(f"Starting base model generation.")
  base_model = load_base()
  start_time = time.time()

  base_model_output, num_errors_base = generate_output(base_model, instruction[0], inputs)
  print(f"Base model output: {base_model_output}, number of errors: {num_errors_base}")

  end_time = time.time()
  total_seconds = end_time - start_time
  total_minutes = int(total_seconds // 60)
  total_seconds_remaining = int(total_seconds % 60)

  print(f"Total execution time for base model: {total_minutes} minutes and {total_seconds_remaining} seconds")

  print(f"Save finetuned model outputs.")
  json_str = json.dumps(base_model_output, indent=4)
  with open("risk_extractor_microservice/evaluation_results/base_evaluation_output_stable.json", "w+") as f:
      f.write(json_str)

  print(f"Starting finetuned model generation.")
  finetuned_model = load_trained()
  start_time = time.time()

  finetuned_model_output, num_errors_finetuned = generate_output(finetuned_model, instruction[0], inputs)
  print(f"Finetuned model output: {finetuned_model_output}, number of errors: {num_errors_finetuned}")

  end_time = time.time()
  total_seconds = end_time - start_time
  total_minutes = int(total_seconds // 60)
  total_seconds_remaining = int(total_seconds % 60)

  print(f"Total execution time for finetuned model: {total_minutes} minutes and {total_seconds_remaining} seconds")

  print(f"Save finetuned model outputs.")
  json_str = json.dumps(finetuned_model_output, indent=4)
  with open("risk_extractor_microservice/evaluation_results/finetuned_evaluation_output_stable.json", "w+") as f:
      f.write(json_str)

  return base_model_output, finetuned_model_output, num_errors_base, num_errors_finetuned

def extract_categories(output):
  return output["categories"]

def extract_summary(output):
  return output["summary"]

# Evaluate semantic similarity using SBERT embeddings
def evaluate_semantic_similarity(gt_output, llm_output):
  combined = list(gt_output) + list(llm_output)
  output_embs = sbert_model.encode(combined, convert_to_tensor=True)
  n_gt = len(gt_output); n_gen = len(llm_output)

  gt_emb_output = output_embs[:n_gt]; gen_emb_output = output_embs[n_gt:]

  # Normalize embeddings
  gt_emb_output_norm = F.normalize(gt_emb_output, p=2, dim=1)
  gen_emb_output_norm = F.normalize(gen_emb_output, p=2, dim=1)

  pairwise_sims = (gt_emb_output_norm * gen_emb_output_norm).sum(dim=1)

  # Convert to numpy if needed
  pairwise_sims = pairwise_sims.cpu().numpy()

  results = []
  # Assuming 1:1 alignment between gt and generated.
  for i in range(n_gt):
    sbert_sim = float(pairwise_sims[i])

    row = {
        "gt_output" : gt_output[i],
        "llm_output" : llm_output[i],
        "n_gt" : n_gt,
        "n_gen" : n_gen,
        "sbert_sim" : sbert_sim,
    }
    results.append(row)
  # Add summary statistics
  if results:
      sim_values = [r["sbert_sim"] for r in results]
      summary = {
          "total_pairs": n_gt,
          "mean_similarity": float(np.mean(sim_values)),
          "std_similarity": float(np.std(sim_values)),
          "var_similarity" : float(np.var(sim_values)),
          "min_similarity": float(np.min(sim_values)),
          "max_similarity": float(np.max(sim_values))
      }

      # Return both detailed results and summary
      return {
          "pairwise_results": results,
          "summary_statistics": summary
      }

  return {"pairwise_results": [], "summary_statistics": {}}

# Clean base model outputs
def clean_base_outputs(base_outputs):
    cleaned = []
    for text in base_outputs:
        if "output: \"" in text:
            parts = text.split("output: \"")
            if len(parts) > 1:
                cleaned.append(parts[1])
            else:
                cleaned.append(text)
        else:
            cleaned.append(text)
    return cleaned

# Compute micro precision, recall, F1 for category extraction
def evaluate_categories(ground_truth, predictions):
    tp_total = 0
    fp_total = 0
    fn_total = 0
    exact_match_count = 0
    jaccard_scores = []

    for gt_labels, pred_labels in zip(ground_truth, predictions):
        gt_set = set(gt_labels)
        pred_set = set(pred_labels)

        tp = len(gt_set & pred_set)
        fp = len(pred_set - gt_set)
        fn = len(gt_set - pred_set)

        tp_total += tp
        fp_total += fp
        fn_total += fn

        # Exact match
        if gt_set == pred_set:
            exact_match_count += 1

        # Jaccard similarity
        union = len(gt_set | pred_set)
        jaccard = tp / union if union > 0 else 1.0
        jaccard_scores.append(jaccard)

    precision_micro = (
        tp_total / (tp_total + fp_total)
        if (tp_total + fp_total) > 0 else 0.0
    )

    recall_micro = (
        tp_total / (tp_total + fn_total)
        if (tp_total + fn_total) > 0 else 0.0
    )

    if precision_micro + recall_micro > 0:
        f1_micro = (
            2 * precision_micro * recall_micro
            / (precision_micro + recall_micro)
        )
    else:
        f1_micro = 0.0

    return {
        "precision_micro": precision_micro,
        "recall_micro": recall_micro,
        "f1_micro": f1_micro,
        "exact_match_rate": exact_match_count / len(ground_truth),
        "jaccard_similarity": float(np.mean(jaccard_scores)),
        "tp_total": tp_total,
        "fp_total": fp_total,
        "fn_total": fn_total,
    }

# Extract categories from outputs
def extract_categories_from_string(output_str):
    try:
        if "|" in output_str:
            categories_part = output_str.split("|")[0].strip()
            return json.loads(categories_part.replace("'", '"'))
        return []
    except:
        return []

# Evaluate summary text quality using ROUGE
def evaluate_text_quality(gt_summaries, model_summaries):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

    for gt, pred in zip(gt_summaries, model_summaries):
        # ROUGE scores
        scores = scorer.score(gt, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)


    return {
        "rouge1": np.mean(rouge1_scores),
        "rouge2": np.mean(rouge2_scores),
        "rougeL": np.mean(rougeL_scores),
    }

# Measure inference efficiency: latency and memory usage
def measure_inference_efficiency(model, inputs, instruction, num_runs=3):
    latencies = []
    memory_usages = []

    # Clear cache before measurement
    torch.cuda.empty_cache()

    for _ in range(num_runs):
        start_time = time.time()
        start_memory = torch.cuda.memory_allocated() / 1024**2  # MB

        # Run inference
        _ = ask_model(model, instruction, inputs[:10])

        end_time = time.time()
        torch.cuda.synchronize()  # Wait for all kernels to finish
        end_memory = torch.cuda.memory_allocated() / 1024**2

        latencies.append(end_time - start_time)
        memory_usages.append(end_memory - start_memory)

    return {
        "num_of_runs": num_runs,
        "num_of_samples": 10,
        "avg_latency_seconds": np.mean(latencies),
        "latency_std": np.std(latencies),
        "avg_gpu_memory_increase_mb": np.mean(memory_usages),
        "throughput_samples_per_second": 10 / np.mean(latencies)
    }

# Convert numpy types to native Python types for JSON serialization
def convert_numpy(obj):
    if isinstance(obj, dict):
        return {key: convert_numpy(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy(item) for item in obj]
    elif isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, np.bool_):
        return bool(obj)
    else:
        return obj

def wilcoxon_effect_size(x, y):
    d = np.array(x) - np.array(y)
    d = d[d != 0]  # remove zero differences
    n = len(d)

    ranks = stats.rankdata(abs(d))
    W_pos = np.sum(ranks[d > 0])
    W_neg = np.sum(ranks[d < 0])

    # signed Z
    mean = n * (n + 1) / 4
    std = np.sqrt(n * (n + 1) * (2*n + 1) / 24)
    z = (W_pos - mean) / std

    r = z / np.sqrt(n)
    return float(r)

if __name__ == "__main__":
  overall_results = {}
  eval_filepath = "risk_extractor_microservice/training_and_evalaution/eval_dataset.csv"
  instruction, input_data, output = prepare_csv_dataset(eval_filepath)
  base_model_output, finetuned_model_output, num_errors_base, num_errors_fine_tuned = create_outputs(instruction, input_data)

  finetuned_model_output = pd.DataFrame(finetuned_model_output)

  results = evaluate_semantic_similarity(input_data, finetuned_model_output["summary"])
  overall_results["summary_input_finetuned"] = results
  print(f"Input vs. finedtuned LLM output summary {results["summary_statistics"]}")

  output_summary = output.str.split('|').str[1]
  results = evaluate_semantic_similarity(input_data, output_summary)
  overall_results["summary_input_gt"] = results
  print(f"Input vs. GT output summary {results["summary_statistics"]}")

  results = evaluate_semantic_similarity(output_summary, finetuned_model_output["output"])
  overall_results["summary_gt_finetuned"] = results
  print(f"GT expected output summary vs. finedtuned LLM output summary {results["summary_statistics"]}")

  results = evaluate_semantic_similarity(output, finetuned_model_output["output"])
  overall_results["output_gt_finetuned"] = results
  print(f"GT expected output vs. finetuned LLM output {results["summary_statistics"]}")

  base_model_output = pd.DataFrame(base_model_output)

  cleaned_base = clean_base_outputs(base_model_output["output"])

  results = evaluate_semantic_similarity(input_data, cleaned_base)
  overall_results["output_input_base"] = results
  print(f"Input vs. base LLM output {results["summary_statistics"]}")

  results = evaluate_semantic_similarity(output, cleaned_base)
  overall_results["output_gt_base"] = results
  print(f"GT expected output vs. base LLM output {results["summary_statistics"]}")

  gt_summaries = output_summary
  finetuned_summaries = finetuned_model_output["summary"]
  base_summaries = cleaned_base

  gt_full_output = output  # Already the full GT output
  finetuned_full_output = finetuned_model_output["output"]  # Already the full finetuned output
  base_full_output = cleaned_base  # Cleaned base output

  # 1. BASELINE COMPARISONS
  print("\n" + "=" * 80)
  print("1. BASELINE SEMANTIC SIMILARITY (Expected Transformation)")
  print("=" * 80)

  # How similar are GT input and GT output summaries? (Summary transformation)
  gt_input_vs_gt_summary = evaluate_semantic_similarity(input_data, gt_summaries)
  gt_stats = gt_input_vs_gt_summary["summary_statistics"]

  # How similar are GT input and GT full output? (Full output transformation)
  gt_input_vs_gt_full = evaluate_semantic_similarity(input_data, output)
  gt_full_stats = gt_input_vs_gt_full["summary_statistics"]

  print(f"\nSUMMARY TRANSFORMATION:")
  print(f"GT Input vs GT Output Summaries:")
  print(f"  Mean Similarity: {gt_stats['mean_similarity']:.4f} ± {gt_stats['std_similarity']:.4f}")
  print(f"  Interpretation: GT-written summary transformation level\n")

  print(f"FULL OUTPUT TRANSFORMATION:")
  print(f"GT Input vs GT Full Output (Categories + Summary):")
  print(f"  Mean Similarity: {gt_full_stats['mean_similarity']:.4f} ± {gt_full_stats['std_similarity']:.4f}")
  print(f"  Interpretation: GT-written full output transformation level\n")

  # 2. MODEL PERFORMANCE RELATIVE TO BASELINE
  print("\n" + "=" * 80)
  print("2. MODEL PERFORMANCE RELATIVE TO BASELINE")
  print("=" * 80)

  # SUMMARY TRANSFORMATIONS
  gt_input_vs_finetuned_summary = evaluate_semantic_similarity(input_data, finetuned_summaries)
  gt_input_vs_base_summary = evaluate_semantic_similarity(input_data, base_summaries)

  ft_summary_stats = gt_input_vs_finetuned_summary["summary_statistics"]
  base_summary_stats = gt_input_vs_base_summary["summary_statistics"]

  # FULL OUTPUT TRANSFORMATIONS
  gt_input_vs_finetuned_full = evaluate_semantic_similarity(input_data, finetuned_full_output)
  gt_input_vs_base_full = evaluate_semantic_similarity(input_data, base_full_output)

  ft_full_stats = gt_input_vs_finetuned_full["summary_statistics"]
  base_full_stats = gt_input_vs_base_full["summary_statistics"]

  print(f"\nSUMMARY TRANSFORMATION - Model vs Baseline:")
  print(f"Finetuned Model Summaries:")
  print(f"  Mean Similarity: {ft_summary_stats['mean_similarity']:.4f} ± {ft_summary_stats['std_similarity']:.4f}")
  print(f"  Difference from baseline: {(ft_summary_stats['mean_similarity'] - gt_stats['mean_similarity']):+.4f}")

  print(f"\nBase Model Summaries:")
  print(f"  Mean Similarity: {base_summary_stats['mean_similarity']:.4f} ± {base_summary_stats['std_similarity']:.4f}")
  print(f"  Difference from baseline: {(base_summary_stats['mean_similarity'] - gt_stats['mean_similarity']):+.4f}")

  print(f"\nFULL OUTPUT TRANSFORMATION - Model vs Baseline:")
  print(f"Finetuned Full Output:")
  print(f"  Mean Similarity: {ft_full_stats['mean_similarity']:.4f} ± {ft_full_stats['std_similarity']:.4f}")
  print(f"  Difference from baseline: {(ft_full_stats['mean_similarity'] - gt_full_stats['mean_similarity']):+.4f}")

  print(f"\nBase Full Output:")
  print(f"  Mean Similarity: {base_full_stats['mean_similarity']:.4f} ± {base_full_stats['std_similarity']:.4f}")
  print(f"  Difference from baseline: {(base_full_stats['mean_similarity'] - gt_full_stats['mean_similarity']):+.4f}")

  # 3. DIRECT MODEL COMPARISON TO GROUND TRUTH
  print("\n" + "=" * 80)
  print("3. DIRECT COMPARISON: How close are model outputs to GT outputs?")
  print("=" * 80)

  # SUMMARY COMPARISONS
  gt_summary_vs_finetuned = evaluate_semantic_similarity(gt_summaries, finetuned_summaries)
  gt_summary_vs_base = evaluate_semantic_similarity(gt_summaries, base_summaries)

  ft_vs_gt_summary_stats = gt_summary_vs_finetuned["summary_statistics"]
  base_vs_gt_summary_stats = gt_summary_vs_base["summary_statistics"]

  # FULL OUTPUT COMPARISONS
  gt_full_vs_finetuned = evaluate_semantic_similarity(output, finetuned_full_output)
  gt_full_vs_base = evaluate_semantic_similarity(output, base_full_output)

  ft_vs_gt_full_stats = gt_full_vs_finetuned["summary_statistics"]
  base_vs_gt_full_stats = gt_full_vs_base["summary_statistics"]

  print(f"\nSUMMARY QUALITY (Closeness to GT Summaries):")
  print(f"Finetuned Model vs GT Summaries:")
  print(f"  Mean Similarity: {ft_vs_gt_summary_stats['mean_similarity']:.4f} ± {ft_vs_gt_summary_stats['std_similarity']:.4f}")

  print(f"\nBase Model vs GT Summaries:")
  print(f"  Mean Similarity: {base_vs_gt_summary_stats['mean_similarity']:.4f} ± {base_vs_gt_summary_stats['std_similarity']:.4f}")
  print(f"  Difference (Finetuned - Base): {(ft_vs_gt_summary_stats['mean_similarity'] - base_vs_gt_summary_stats['mean_similarity']):+.4f}")

  print(f"\nFULL OUTPUT QUALITY (Closeness to GT Full Output):")
  print(f"Finetuned Model vs GT Full Output:")
  print(f"  Mean Similarity: {ft_vs_gt_full_stats['mean_similarity']:.4f} ± {ft_vs_gt_full_stats['std_similarity']:.4f}")

  print(f"\nBase Model vs GT Full Output:")
  print(f"  Mean Similarity: {base_vs_gt_full_stats['mean_similarity']:.4f} ± {base_vs_gt_full_stats['std_similarity']:.4f}")
  print(f"  Difference (Finetuned - Base): {(ft_vs_gt_full_stats['mean_similarity'] - base_vs_gt_full_stats['mean_similarity']):+.4f}")

  # 4. KEY INSIGHTS
  print("\n" + "=" * 80)
  print("4. KEY INSIGHTS")
  print("=" * 80)

  # Insight 1: Transformation levels
  baseline_summary_transform = gt_stats['mean_similarity']
  baseline_full_transform = gt_full_stats['mean_similarity']
  ft_summary_transform = ft_summary_stats['mean_similarity']
  base_summary_transform = base_summary_stats['mean_similarity']
  ft_full_transform = ft_full_stats['mean_similarity']
  base_full_transform = base_full_stats['mean_similarity']

  print(f"\nTRANSFORMATION LEVEL ANALYSIS:")
  print(f"Summary Transformation (Input → Summary):")
  print(f"  GT: {baseline_summary_transform:.4f}")
  print(f"  Finetuned: {ft_summary_transform:.4f}")
  print(f"  Base: {base_summary_transform:.4f}")

  print(f"\nFull Output Transformation (Input → Categories+Summary):")
  print(f"  GT: {baseline_full_transform:.4f}")
  print(f"  Finetuned: {ft_full_transform:.4f}")
  print(f"  Base: {base_full_transform:.4f}")

  # Insight 2: Output quality
  ft_summary_to_gt = ft_vs_gt_summary_stats['mean_similarity']
  base_summary_to_gt = base_vs_gt_summary_stats['mean_similarity']
  ft_full_to_gt = ft_vs_gt_full_stats['mean_similarity']
  base_full_to_gt = base_vs_gt_full_stats['mean_similarity']

  print(f"\nOUTPUT QUALITY ANALYSIS:")
  print(f"Summary Quality (Closeness to GT Summaries):")
  print(f"  Finetuned: {ft_summary_to_gt:.4f}")
  print(f"  Base: {base_summary_to_gt:.4f}")
  print(f"  Improvement: {ft_summary_to_gt - base_summary_to_gt:+.4f}")

  print(f"\nFull Output Quality (Closeness to GT Full Output):")
  print(f"  Finetuned: {ft_full_to_gt:.4f}")
  print(f"  Base: {base_full_to_gt:.4f}")
  print(f"  Improvement: {ft_full_to_gt - base_full_to_gt:+.4f}")

  # 5. STATISTICAL SIGNIFICANCE
  print("\n" + "=" * 80)
  print("5. STATISTICAL SIGNIFICANCE")
  print("=" * 80)

  # Extract similarity scores for t-tests
  ft_summary_scores = [r["sbert_sim"] for r in gt_summary_vs_finetuned["pairwise_results"]]
  base_summary_scores = [r["sbert_sim"] for r in gt_summary_vs_base["pairwise_results"]]

  ft_full_scores = [r["sbert_sim"] for r in gt_full_vs_finetuned["pairwise_results"]]
  base_full_scores = [r["sbert_sim"] for r in gt_full_vs_base["pairwise_results"]]

  # Summary ttest
  ttest_summary = stats.ttest_rel(ft_summary_scores, base_summary_scores)

  print(f"\nSUMMARY COMPARISON T-Test (Finetuned vs Base vs GT Summaries):")
  print(f"  t-statistic: {ttest_summary.statistic:.4f}")
  print(f"  p-value: {ttest_summary.pvalue:.6f}")
  print(f"  Significant at p<0.05: {'YES' if ttest_summary.pvalue < 0.05 else 'NO'}")

  # Full output ttest
  ttest_full = stats.ttest_rel(ft_full_scores, base_full_scores)

  print(f"\nFULL OUTPUT COMPARISON T-Test (Finetuned vs Base vs GT Full Output):")
  print(f"  t-statistic: {ttest_full.statistic:.4f}")
  print(f"  p-value: {ttest_full.pvalue:.6f}")
  print(f"  Significant at p<0.05: {'YES' if ttest_full.pvalue < 0.05 else 'NO'}")

  # Summary wilcoxon
  wilcoxon_summary = stats.wilcoxon(base_summary_scores, ft_summary_scores)

  print(f"\nSUMMARY COMPARISON WILCOXON (Finetuned vs Base vs GT Summaries):")
  print(f"  wilcoxon-statistic: {wilcoxon_summary.statistic:.4f}")
  print(f"  p-value: {wilcoxon_summary.pvalue:.6f}")
  print(f"  Significant at p<0.05: {'YES' if wilcoxon_summary.pvalue < 0.05 else 'NO'}")

  # Full output wilcoxon
  wilcoxon_full = stats.wilcoxon(base_full_scores, ft_full_scores)

  print(f"\nFULL OUTPUT COMPARISON WILCOXON (Finetuned vs Base vs GT Full Output):")
  print(f"  wilcoxon-statistic: {wilcoxon_full.statistic:.4f}")
  print(f"  p-value: {wilcoxon_full.pvalue:.6f}")
  print(f"  Significant at p<0.05: {'YES' if wilcoxon_full.pvalue < 0.05 else 'NO'}")

  # Effect sizes
  mean_diff_summary = np.mean(ft_summary_scores) - np.mean(base_summary_scores)
  pooled_std_summary = np.sqrt((np.std(ft_summary_scores)**2 + np.std(base_summary_scores)**2) / 2)
  cohens_d_summary = mean_diff_summary / pooled_std_summary if pooled_std_summary != 0 else 0

  mean_diff_full = np.mean(ft_full_scores) - np.mean(base_full_scores)
  pooled_std_full = np.sqrt((np.std(ft_full_scores)**2 + np.std(base_full_scores)**2) / 2)
  cohens_d_full = mean_diff_full / pooled_std_full if pooled_std_full != 0 else 0

  wilcoxon_summary_r = wilcoxon_effect_size(ft_summary_scores, base_summary_scores)
  wilcoxon_full_r = wilcoxon_effect_size(ft_full_scores, base_full_scores)

  print(f"\nEFFECT SIZE ANALYSIS:")
  print(f"  Summary Comparison (Cohen's d): {cohens_d_summary:.4f}")
  print(f"  Full Output Comparison (Cohen's d): {cohens_d_full:.4f}")

  # 6. CATEGORY EXTRACTION
  print("\n" + "=" * 80)
  print("6. RISK CATEGORY EXTRACTION PERFORMANCE")
  print("=" * 80)

  # Extract categories
  def extract_categories_from_string(text):
      try:
          if "|" in text:
              cat_part = text.split("|")[0].strip()
              return json.loads(cat_part.replace("'", '"'))
          return []
      except:
          return []

  gt_categories = output.apply(extract_categories_from_string)
  finetuned_categories = finetuned_model_output["categories"]
  base_categories = base_model_output["output"].apply(extract_categories_from_string)

  ft_cat_results = evaluate_categories(gt_categories, finetuned_categories)
  base_cat_results = evaluate_categories(gt_categories, base_categories)

  print(f"\nFinetuned Model Category Extraction:")
  print(f"  F1 Score: {ft_cat_results['f1_micro']:.4f}")
  print(f"  Exact Match Rate: {ft_cat_results['exact_match_rate']:.4f} ({ft_cat_results['exact_match_rate']*100:.1f}%)")

  print(f"\nBase Model Category Extraction:")
  print(f"  F1 Score: {base_cat_results['f1_micro']:.4f}")
  print(f"  Exact Match Rate: {base_cat_results['exact_match_rate']:.4f} ({base_cat_results['exact_match_rate']*100:.1f}%)")

  cat_improvement = ft_cat_results['f1_micro'] - base_cat_results['f1_micro']
  print(f"\nCategory Extraction Improvement: {cat_improvement:+.4f}")

  base_rouge = evaluate_text_quality(gt_summaries, base_summaries)
  ft_rouge = evaluate_text_quality(gt_summaries, finetuned_summaries)

  print(f"\nBase ROUGE: {base_rouge}")
  print(f"Finetuned ROUGE: {ft_rouge}")

  # 7. FINAL RECOMMENDATION
  # Decision criteria
  is_statistically_significant_summary_ttest = ttest_summary.pvalue < 0.05
  is_statistically_significant_full_ttest = ttest_full.pvalue < 0.05

  is_statistically_significant_summary_wilcoxon = wilcoxon_summary.pvalue < 0.05
  is_statistically_significant_full_wilcoxon = wilcoxon_full.pvalue < 0.05

  has_meaningful_summary_improvement = (ft_summary_to_gt - base_summary_to_gt) > 0.05
  has_meaningful_full_improvement = (ft_full_to_gt - base_full_to_gt) > 0.05

  has_good_summary_transform_match = abs(ft_summary_transform - baseline_summary_transform) < 0.1
  has_good_full_transform_match = abs(ft_full_transform - baseline_full_transform) < 0.1

  has_category_improvement = cat_improvement > 0.05



  # 8. INFERENCE
  base_inference_efficiency = measure_inference_efficiency(load_base(), tokenizer, input_data, instruction[0], num_runs=3)
  print(f"Base_Inference_Efficiency: {base_inference_efficiency}")
  finetuned_inference_efficiency = measure_inference_efficiency(load_trained(), tokenizer, input_data, instruction[0], num_runs=3)
  print(f"Finetuned_Inference_Efficiency: {finetuned_inference_efficiency}")

  # 9. SAVE KEY RESULTS
  print("\n" + "=" * 80)
  print("8. SAVING KEY RESULTS")
  print("=" * 80)

  # Compile key results
  key_results = {
      "hardware" : {
          "GPU":"A100 GPU"
      },
      "prompt_compliance_errors" : {
          "base_model": num_errors_base,
          "finetuned_model": num_errors_fine_tuned
      },
      "baseline": {
          "gt_input_vs_gt_summary": gt_stats,
          "gt_input_vs_gt_full": gt_full_stats,
          "interpretation": "Upper bound - GT transformation level"
      },
      "transformation_analysis": {
          "summary": {
              "gt_input_vs_finetuned": ft_summary_stats,
              "gt_input_vs_base": base_summary_stats,
              "baseline": float(baseline_summary_transform),
              "finetuned": float(ft_summary_transform),
              "base": float(base_summary_transform)
          },
          "full_output": {
              "gt_input_vs_finetuned": ft_full_stats,
              "gt_input_vs_base": base_full_stats,
              "baseline": float(baseline_full_transform),
              "finetuned": float(ft_full_transform),
              "base": float(base_full_transform)
          }
      },
      "output_quality": {
          "summary": {
              "gt_vs_finetuned": ft_vs_gt_summary_stats,
              "gt_vs_base": base_vs_gt_summary_stats,
              "improvement": float(ft_summary_to_gt - base_summary_to_gt)
          },
          "full_output": {
              "gt_vs_finetuned": ft_vs_gt_full_stats,
              "gt_vs_base": base_vs_gt_full_stats,
              "improvement": float(ft_full_to_gt - base_full_to_gt)
          }
      },
      "category_extraction": {
          "finetuned": ft_cat_results,
          "base": base_cat_results,
          "improvement": float(cat_improvement)
      },
      "statistical_tests": {
          "summary_comparison_ttest": {
              "t_statistic": float(ttest_summary.statistic),
              "p_value": float(ttest_summary.pvalue),
              "significant": bool(ttest_summary.pvalue < 0.05),
              "effect_size": float(cohens_d_summary)
          },
          "full_output_comparison_ttest": {
              "t_statistic": float(ttest_full.statistic),
              "p_value": float(ttest_full.pvalue),
              "significant": bool(ttest_full.pvalue < 0.05),
              "effect_size": float(cohens_d_full)
          },
          "summary_comparison_wilcoxon": {
              "wilcoxon_statistic": float(wilcoxon_summary.statistic),
              "p_value": float(wilcoxon_summary.pvalue),
              "significant": bool(wilcoxon_summary.pvalue < 0.05),
              "effect_size": float(wilcoxon_summary_r)
          },
          "full_output_comparison_wilcoxon": {
              "wilcoxon_statistic": float(wilcoxon_full.statistic),
              "p_value": float(wilcoxon_full.pvalue),
              "significant": bool(wilcoxon_full.pvalue < 0.05),
              "effect_size": float(wilcoxon_full_r)
          },
          "rouge_summary_comparison": {
              "base": base_rouge,
              "finetuned": ft_rouge
          }
      },
      "inference_efficiency": {
          "base": base_inference_efficiency,
          "finetuned": finetuned_inference_efficiency
      },
      "assessment": {
          "summary_significant": bool(is_statistically_significant_summary_ttest),
          "full_output_significant": bool(is_statistically_significant_full_ttest),
          "summary_improvement": bool(has_meaningful_summary_improvement),
          "full_output_improvement": bool(has_meaningful_full_improvement),
          "summary_transform_match": bool(has_good_summary_transform_match),
          "full_transform_match": bool(has_good_full_transform_match),
          "category_improvement": bool(has_category_improvement)
      }
  }

  key_results = convert_numpy(key_results)

  results_file = "risk_extractor_microservice/comprehensive_evaluation_results_stable.json"
  with open(results_file, 'w') as f:
      json.dump(key_results, f, indent=4)

  print(f"\nComprehensive results saved to: {results_file}")
  print("\n" + "=" * 80)
  print("EVALUATION COMPLETE")
  print("=" * 80)

  if re.search("\[(.*?)\[", generated):
  generated_first_output = re.search("\[(.*?)\[", generated)[0][:-1]
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Starting base model generation.
Parsing error: Invalid control character at: line 1 column 21 (char 20)
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Expecting value: line 1 column 1 (char 0)
Parsing error: Invalid format: No '|' separator found.
Parsing error: Expecting value: line 1 column 1 (char 0)
Parsing error: Invalid format: No output listed.
Parsing error: Invalid format: No output listed.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: No output listed.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: No '|' separator found.
Parsing error: Invalid format: