In [1]:
import os; os.environ["HF_TOKEN"] =""   
hf_token = os.environ.get("HF_TOKEN")


In [2]:
from huggingface_hub import HfApi

# Use the secret you created
import os
hf_token = os.environ.get("HF_TOKEN")

# Authenticate
if hf_token:
    api = HfApi()
    api.whoami(token=hf_token)
    print("Successfully logged into Hugging Face.")
else:
    print("Hugging Face token not found. Please add it as a Kaggle Secret with the label 'HF_TOKEN'.")

Successfully logged into Hugging Face.


In [3]:
import os
import time
import json
import re
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import warnings

# Suppress specific warnings for cleaner output
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Configuration ---
# Set your Hugging Face token here
HF_TOKEN = "" 
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
# Set a smaller sample size for quicker testing, or None to use the full dataset
DATA_SAMPLE_SIZE = 200 
# >> NEW: Number of times to run each task for statistical significance
NUM_ITERATIONS = 3

# ==============================================================================
# 1. SETUP: LOAD MODEL AND DATASET
# ==============================================================================
def setup_environment():
    """Initializes the model, tokenizer, and dataset."""
    print("--- Setting up Environment ---")
    
    # --- HuggingFace & Model Setup ---
    os.environ["HF_TOKEN"] = HF_TOKEN
    
    print(f"Loading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    print(f"Loading model: {MODEL_NAME}... (This may take a moment)")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16,
        use_auth_token=True
    )
    print("Model and tokenizer loaded successfully.")
    
    # --- UCI Adult Dataset Loading ---
    print("Loading UCI Adult dataset...")
    url_train = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    url_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
    columns = [
        "age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
        "occupation", "relationship", "race", "sex", "capital-gain", 
        "capital-loss", "hours-per-week", "native-country", "income"
    ]
    
    try:
        train_df = pd.read_csv(url_train, names=columns, sep=",\\s", engine='python', na_values="?")
        test_df = pd.read_csv(url_test, names=columns, sep=",\\s", engine='python', skiprows=1, na_values="?")
        uci_data = pd.concat([train_df, test_df]).dropna().reset_index(drop=True)
        
        if DATA_SAMPLE_SIZE:
            uci_data = uci_data.sample(n=DATA_SAMPLE_SIZE, random_state=42)
            print(f"Dataset loaded and sampled to {len(uci_data)} rows.")
        else:
            print(f"Dataset loaded with {len(uci_data)} rows.")

    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None, None, None

    return tokenizer, model, uci_data

# ==============================================================================
# 2. PIPELINE SIMULATION
# ==============================================================================
def run_llm_pipeline(tokenizer, model, prompt, data_sample_df):
    """
    Simulates an LLM-driven data pipeline task and collects metrics for a SINGLE run.
    """
    full_prompt_text = (
        "You are a data analyst. Your calculations must be precise and based on the entire dataset provided. "
        "Analyze the following data sample and follow the user's instruction.\n"
        "Data:\n"
        f"{data_sample_df.to_csv(index=False)}\n\n"
        "Instruction:\n"
        f"{prompt}"
    )

    messages = [
        {"role": "system", "content": "You are a helpful data analyst that always returns results in JSON format."},
        {"role": "user", "content": full_prompt_text}
    ]
    
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    start_time = time.time()
    outputs = model.generate(
        input_ids,
        max_new_tokens=512,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    end_time = time.time()
    
    response_text = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    
    metrics = {}
    metrics['latency_seconds'] = end_time - start_time
    metrics['input_tokens'] = len(input_ids[0])
    metrics['output_tokens'] = len(outputs[0]) - len(input_ids[0])
    metrics['estimated_cost'] = ((metrics['input_tokens'] + metrics['output_tokens']) / 1000) * 0.005 

    json_match = re.search(r'```json\n({.*?})\n```', response_text, re.DOTALL)
    if not json_match:
        json_match = re.search(r'({.*?})', response_text, re.DOTALL)

    parsed_output = None
    if json_match:
        json_str = json_match.group(1)
        try:
            parsed_output = json.loads(json_str)
            metrics['is_output_valid_json'] = True
        except json.JSONDecodeError:
            metrics['is_output_valid_json'] = False
    else:
        metrics['is_output_valid_json'] = False

    return {
        "raw_output": response_text,
        "parsed_output": parsed_output,
        "metrics": metrics
    }

# ==============================================================================
# 3. >> NEW: SINGLE-RUN PROCESSING & DETAILED EVALUATION
# ==============================================================================
def process_single_run(llm_result, ground_truth_calculator, data_df):
    """
    Processes the result of a single pipeline run to determine correctness and numeric error.
    """
    metrics = llm_result['metrics']
    parsed_output = llm_result['parsed_output']
    
    is_correct = False
    numeric_errors = []

    if metrics['is_output_valid_json']:
        try:
            ground_truth = ground_truth_calculator(data_df)
            mismatch_errors = []
            
            for key, gt_value in ground_truth.items():
                llm_value = parsed_output.get(key)
                if llm_value is None:
                    mismatch_errors.append(f"Missing key '{key}'")
                elif isinstance(gt_value, (int, float)):
                    try:
                        # Calculate percentage error
                        if gt_value != 0:
                            error = abs((float(llm_value) - gt_value) / gt_value)
                            numeric_errors.append(error)
                        
                        # Check for correctness within a tolerance
                        if not np.isclose(float(llm_value), gt_value, rtol=0.05):
                           mismatch_errors.append(f"Value mismatch for '{key}'")
                    except (ValueError, TypeError):
                         mismatch_errors.append(f"Type mismatch for '{key}'")

            if not mismatch_errors:
                is_correct = True
        except Exception:
            pass # Evaluation error means it's not correct

    llm_result['is_correct'] = is_correct
    # Mean Absolute Percentage Error for this run
    llm_result['mape'] = np.mean(numeric_errors) if numeric_errors else 0
    return llm_result

# ==============================================================================
# 4. >> NEW: AGGREGATED "PIPELINE CARD" GENERATION
# ==============================================================================
def generate_summary_card(task_name, results, ground_truth_calculator, data_df):
    """Generates a summary Pipeline Card from multiple runs."""
    print("\n" + "#"*80)
    print(f"SUMMARY PIPELINE CARD: {task_name} ({len(results)} Iterations)")
    print("#"*80)
    
    # --- Pillar 1: Efficiency & Scalability ---
    latencies = [r['metrics']['latency_seconds'] for r in results]
    costs = [r['metrics']['estimated_cost'] for r in results]
    print("\n--- [ Pillar 1: Efficiency & Scalability ] ---")
    print(f"Latency (Avg ± Std): {np.mean(latencies):.2f}s ± {np.std(latencies):.2f}s")
    print(f"Est. Cost (Avg ± Std): ${np.mean(costs):.6f} ± ${np.std(costs):.6f}")

    # --- Pillar 2: Reliability & Robustness ---
    valid_json_runs = [r for r in results if r['metrics']['is_output_valid_json']]
    json_success_rate = len(valid_json_runs) / len(results)
    print("\n--- [ Pillar 2: Reliability & Robustness ] ---")
    print(f"Structured Output Rate (Valid JSON): {json_success_rate:.0%} ({len(valid_json_runs)}/{len(results)} runs)")

    # --- Pillar 3: Adaptivity & Generalization ---
    correct_runs = [r for r in valid_json_runs if r['is_correct']]
    correctness_rate = len(correct_runs) / len(results) if len(results) > 0 else 0
    
    # Calculate MAPE only on runs that produced valid JSON but were incorrect
    incorrect_json_runs = [r for r in valid_json_runs if not r['is_correct']]
    mapes = [r['mape'] for r in incorrect_json_runs if 'mape' in r]
    avg_mape = np.mean(mapes) * 100 if mapes else 0
    
    print("\n--- [ Pillar 3: Adaptivity & Generalization ] ---")
    print(f"Task Correctness Rate: {correctness_rate:.0%} ({len(correct_runs)}/{len(results)} runs)")
    if avg_mape > 0:
        print(f"  - Avg. Numeric Error (MAPE) on incorrect runs: {avg_mape:.2f}%")

    # --- Pillar 4: Governance & Ethics ---
    print("\n--- [ Pillar 4: Governance & Ethics ] ---")
    if "fairness" in task_name.lower():
        disparities = []
        for r in correct_runs: # Only calculate on correct runs for meaningful results
            try:
                gt_group = ground_truth_calculator(data_df, by_group=True)
                llm_group = r['parsed_output']
                errors = [abs(float(llm_group.get(g, 0)) - v) for g, v in gt_group.items()]
                disparities.append(max(errors) - min(errors))
            except Exception:
                continue
        
        if disparities:
            print(f"Fairness Disparity (Avg ± Std): {np.mean(disparities):.4f} ± {np.std(disparities):.4f}")
            if np.mean(disparities) > 10.0: # Disparity measured in percentage points
                 print("  - Result: High disparity indicates potential bias.")
            else:
                 print("  - Result: Low disparity suggests more equitable performance.")
        else:
            print("Fairness Metric: Could not be computed (no correct runs).")
    else:
        print("Fairness Metric: Not Applicable for this task.")
    
    print("="*80)

# ==============================================================================
# 5. TASK DEFINITIONS (Unchanged)
# ==============================================================================

# --- Task 1: Simple Aggregation ---
TASK_1_PROMPT = "Calculate the average 'age' and average 'hours-per-week' for all individuals in the dataset. Return the result as a single JSON object with keys 'average_age' and 'average_hours'."
def gt_calculator_task_1(df, **kwargs):
    return {"average_age": df['age'].mean(), "average_hours": df['hours-per-week'].mean()}

# --- Task 2: Conditional Aggregation ---
TASK_2_PROMPT = "Calculate the average 'hours-per-week' for two groups: those with income '<=50K' and those with income '>50K'. Return the result as a single JSON object with keys 'avg_hours_low_income' and 'avg_hours_high_income'."
def gt_calculator_task_2(df, **kwargs):
    return {
        "avg_hours_low_income": df[df['income'] == '<=50K']['hours-per-week'].mean(),
        "avg_hours_high_income": df[df['income'] == '>50K']['hours-per-week'].mean()
    }
    
# --- Task 3: Fairness Analysis ---
TASK_3_PROMPT = "Analyze the relationship between race and income. Calculate the percentage of individuals within each racial group that has an income of '>50K'. Return the result as a single JSON object where keys are the race categories and values are the corresponding percentages."
def gt_calculator_task_3(df, by_group=False):
    high_income_by_race = df[df['income'] == '>50K']['race'].value_counts()
    total_by_race = df['race'].value_counts()
    percentage_high_income = (high_income_by_race / total_by_race * 100).fillna(0)
    result = percentage_high_income.to_dict()
    if by_group: return result
    return result

# ==============================================================================
# 6. MAIN EXECUTION
# ==============================================================================
if __name__ == "__main__":
    tokenizer, model, uci_data = setup_environment()
    
    if model and uci_data is not None:
        all_tasks = [
            ("Task 1: Simple Aggregation", TASK_1_PROMPT, gt_calculator_task_1),
            ("Task 2: Conditional Aggregation", TASK_2_PROMPT, gt_calculator_task_2),
            ("Task 3: Fairness Analysis", TASK_3_PROMPT, gt_calculator_task_3),
        ]

        for name, prompt, calculator in all_tasks:
            print(f"\n--- Running Benchmark for: {name} ---")
            results = []
            for i in range(NUM_ITERATIONS):
                print(f"  - Iteration {i+1}/{NUM_ITERATIONS}...")
                raw_result = run_llm_pipeline(tokenizer, model, prompt, uci_data)
                processed = process_single_run(raw_result, calculator, uci_data)
                results.append(processed)
            
            generate_summary_card(name, results, calculator, uci_data)

    else:
        print("Setup failed. Exiting.")


--- Setting up Environment ---
Loading tokenizer for meta-llama/Llama-3.1-8B-Instruct...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Loading model: meta-llama/Llama-3.1-8B-Instruct... (This may take a moment)


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

2025-09-03 17:57:25.942875: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756922246.270098      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756922246.363719      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Model and tokenizer loaded successfully.
Loading UCI Adult dataset...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Dataset loaded and sampled to 200 rows.

--- Running Benchmark for: Task 1: Simple Aggregation ---
  - Iteration 1/3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 2/3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 3/3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



################################################################################
SUMMARY PIPELINE CARD: Task 1: Simple Aggregation (3 Iterations)
################################################################################

--- [ Pillar 1: Efficiency & Scalability ] ---
Latency (Avg ± Std): 70.04s ± 21.08s
Est. Cost (Avg ± Std): $0.046295 ± $0.000714

--- [ Pillar 2: Reliability & Robustness ] ---
Structured Output Rate (Valid JSON): 100% (3/3 runs)

--- [ Pillar 3: Adaptivity & Generalization ] ---
Task Correctness Rate: 0% (0/3 runs)
  - Avg. Numeric Error (MAPE) on incorrect runs: 5.44%

--- [ Pillar 4: Governance & Ethics ] ---
Fairness Metric: Not Applicable for this task.

--- Running Benchmark for: Task 2: Conditional Aggregation ---
  - Iteration 1/3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 2/3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 3/3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



################################################################################
SUMMARY PIPELINE CARD: Task 2: Conditional Aggregation (3 Iterations)
################################################################################

--- [ Pillar 1: Efficiency & Scalability ] ---
Latency (Avg ± Std): 64.31s ± 30.65s
Est. Cost (Avg ± Std): $0.046143 ± $0.001014

--- [ Pillar 2: Reliability & Robustness ] ---
Structured Output Rate (Valid JSON): 100% (3/3 runs)

--- [ Pillar 3: Adaptivity & Generalization ] ---
Task Correctness Rate: 0% (0/3 runs)
  - Avg. Numeric Error (MAPE) on incorrect runs: 7.38%

--- [ Pillar 4: Governance & Ethics ] ---
Fairness Metric: Not Applicable for this task.

--- Running Benchmark for: Task 3: Fairness Analysis ---
  - Iteration 1/3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 2/3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 3/3...

################################################################################
SUMMARY PIPELINE CARD: Task 3: Fairness Analysis (3 Iterations)
################################################################################

--- [ Pillar 1: Efficiency & Scalability ] ---
Latency (Avg ± Std): 85.96s ± 0.01s
Est. Cost (Avg ± Std): $0.046845 ± $0.000000

--- [ Pillar 2: Reliability & Robustness ] ---
Structured Output Rate (Valid JSON): 100% (3/3 runs)

--- [ Pillar 3: Adaptivity & Generalization ] ---
Task Correctness Rate: 0% (0/3 runs)
  - Avg. Numeric Error (MAPE) on incorrect runs: 96.71%

--- [ Pillar 4: Governance & Ethics ] ---
Fairness Metric: Could not be computed (no correct runs).


In [3]:
import os
import time
import json
import re
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import warnings

# Suppress specific warnings for cleaner output
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Configuration ---
# Set your Hugging Face token here
HF_TOKEN = "" 
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
# Set a smaller sample size for quicker testing, or None to use the full dataset
DATA_SAMPLE_SIZE = 200 
# Number of times to run each task for statistical significance
NUM_ITERATIONS = 3

# ==============================================================================
# 1. SETUP: LOAD MODEL AND DATASET
# ==============================================================================
def setup_environment():
    """Initializes the model, tokenizer, and dataset."""
    print("--- Setting up Environment ---")
    
    # --- HuggingFace & Model Setup ---
    os.environ["HF_TOKEN"] = HF_TOKEN
    
    print(f"Loading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    print(f"Loading model: {MODEL_NAME}... (This may take a moment)")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16,
        use_auth_token=True
    )
    print("Model and tokenizer loaded successfully.")
    
    # --- UCI Adult Dataset Loading ---
    print("Loading UCI Adult dataset...")
    url_train = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    url_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
    columns = [
        "age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
        "occupation", "relationship", "race", "sex", "capital-gain", 
        "capital-loss", "hours-per-week", "native-country", "income"
    ]
    
    try:
        train_df = pd.read_csv(url_train, names=columns, sep=",\\s", engine='python', na_values="?")
        test_df = pd.read_csv(url_test, names=columns, sep=",\\s", engine='python', skiprows=1, na_values="?")
        uci_data = pd.concat([train_df, test_df]).dropna().reset_index(drop=True)
        
        if DATA_SAMPLE_SIZE:
            uci_data = uci_data.sample(n=DATA_SAMPLE_SIZE, random_state=42)
            print(f"Dataset loaded and sampled to {len(uci_data)} rows.")
        else:
            print(f"Dataset loaded with {len(uci_data)} rows.")

    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None, None, None

    return tokenizer, model, uci_data

# ==============================================================================
# 2. >> MODIFIED: TWO-STEP PIPELINE SIMULATION (CoT + Self-Correction)
# ==============================================================================
def run_llm_pipeline(tokenizer, model, prompt, data_sample_df):
    """
    Simulates a more robust LLM pipeline with Chain-of-Thought and a self-correction step.
    """
    # --- STEP 1: Initial Generation with Chain-of-Thought ---
    cot_prompt = (
        "You are a data analyst. Your calculations must be precise. First, think step-by-step to outline your plan and perform the calculations. "
        "Then, provide the final answer in a single JSON object at the end.\n"
        "Data will be provided by the user.\n"
        "Instruction:\n"
        f"{prompt}"
    )
    
    messages = [
        {"role": "system", "content": "You are a helpful data analyst that thinks step-by-step and then provides a final answer in JSON format."},
        {"role": "user", "content": f"Here is the dataset:\n{data_sample_df.to_csv(index=False)}\n\n{cot_prompt}"}
    ]
    
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    attention_mask = torch.ones_like(input_ids)

    start_time_1 = time.time()
    initial_outputs = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=1024, eos_token_id=tokenizer.eos_token_id)
    end_time_1 = time.time()

    initial_response_text = tokenizer.decode(initial_outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

    # --- STEP 2: Self-Correction Step ---
    correction_prompt = (
        "You are a meticulous data verifier. Your task is to review an initial analysis for accuracy. "
        "Carefully check the reasoning and the final calculations. If you find an error, provide a corrected JSON object. "
        "If the initial answer is correct, simply return the original JSON object.\n\n"
        f"Original Instruction: {prompt}\n\n"
        f"Initial Analysis and Answer:\n{initial_response_text}\n\n"
        "Please verify and provide the final, correct JSON object."
    )

    messages_correction = [
        {"role": "system", "content": "You are a data analysis verifier. Double-check the work and provide a final, corrected JSON."},
        {"role": "user", "content": correction_prompt}
    ]

    correction_input_ids = tokenizer.apply_chat_template(messages_correction, add_generation_prompt=True, return_tensors="pt").to(model.device)
    correction_attention_mask = torch.ones_like(correction_input_ids)
    
    start_time_2 = time.time()
    corrected_outputs = model.generate(correction_input_ids, attention_mask=correction_attention_mask, max_new_tokens=512, eos_token_id=tokenizer.eos_token_id)
    end_time_2 = time.time()

    final_response_text = tokenizer.decode(corrected_outputs[0][correction_input_ids.shape[-1]:], skip_special_tokens=True)

    # --- Metric Collection ---
    metrics = {
        'latency_seconds': (end_time_1 - start_time_1) + (end_time_2 - start_time_2),
        'input_tokens': len(input_ids[0]) + len(correction_input_ids[0]),
        'output_tokens': (len(initial_outputs[0]) - len(input_ids[0])) + (len(corrected_outputs[0]) - len(correction_input_ids[0]))
    }
    metrics['estimated_cost'] = ((metrics['input_tokens'] + metrics['output_tokens']) / 1000) * 0.005 

    # Extract JSON from both initial and final responses for detailed analysis
    def extract_json(text):
        json_match = re.search(r'```json\n({.*?})\n```', text, re.DOTALL)
        if not json_match:
            json_match = re.search(r'({.*?})', text, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(1))
            except json.JSONDecodeError:
                return None
        return None

    initial_parsed = extract_json(initial_response_text)
    final_parsed = extract_json(final_response_text)

    return {
        "initial_parsed_output": initial_parsed,
        "final_parsed_output": final_parsed,
        "metrics": metrics
    }

# ==============================================================================
# 3. SINGLE-RUN PROCESSING & DETAILED EVALUATION
# ==============================================================================
def process_single_run(llm_result, ground_truth_calculator, data_df):
    """
    Processes the result of a single pipeline run, now analyzing both initial and final (corrected) answers.
    """
    ground_truth = ground_truth_calculator(data_df)

    def check_correctness(parsed_output):
        if not parsed_output or not isinstance(parsed_output, dict):
            return False, 1.0 # Not correct, max error
        
        numeric_errors = []
        is_correct = True
        for key, gt_value in ground_truth.items():
            llm_value = parsed_output.get(key)
            if llm_value is None:
                is_correct = False
                continue
            
            try:
                if gt_value != 0:
                    error = abs((float(llm_value) - gt_value) / gt_value)
                    numeric_errors.append(error)
                if not np.isclose(float(llm_value), gt_value, rtol=0.05):
                    is_correct = False
            except (ValueError, TypeError):
                is_correct = False
                
        mape = np.mean(numeric_errors) if numeric_errors else 0
        return is_correct, mape

    llm_result['initial_correct'], llm_result['initial_mape'] = check_correctness(llm_result['initial_parsed_output'])
    llm_result['final_correct'], llm_result['final_mape'] = check_correctness(llm_result['final_parsed_output'])
    
    # Check if self-correction was successful
    llm_result['self_correction_succeeded'] = (not llm_result['initial_correct'] and llm_result['final_correct'])
    
    return llm_result

# ==============================================================================
# 4. AGGREGATED "PIPELINE CARD" GENERATION
# ==============================================================================
def generate_summary_card(task_name, results, ground_truth_calculator, data_df):
    """Generates an enhanced summary Pipeline Card from multiple runs."""
    print("\n" + "#"*80)
    print(f"SUMMARY PIPELINE CARD: {task_name} ({len(results)} Iterations)")
    print("#"*80)
    
    # Pillar 1: Efficiency & Scalability
    latencies = [r['metrics']['latency_seconds'] for r in results]
    costs = [r['metrics']['estimated_cost'] for r in results]
    print("\n--- [ Pillar 1: Efficiency & Scalability ] ---")
    print(f"Latency (Avg ± Std): {np.mean(latencies):.2f}s ± {np.std(latencies):.2f}s (per 2-step run)")
    print(f"Est. Cost (Avg ± Std): ${np.mean(costs):.6f} ± ${np.std(costs):.6f}")

    # Pillar 2: Reliability & Robustness
    valid_initial_json = [r for r in results if r['initial_parsed_output'] is not None]
    valid_final_json = [r for r in results if r['final_parsed_output'] is not None]
    print("\n--- [ Pillar 2: Reliability & Robustness ] ---")
    print(f"Structured Output Rate (Valid JSON): {len(valid_final_json)/len(results):.0%} (Post-Correction)")

    # Pillar 3: Adaptivity & Generalization
    initial_correct_runs = [r for r in results if r['initial_correct']]
    final_correct_runs = [r for r in results if r['final_correct']]
    self_correction_successes = [r for r in results if r['self_correction_succeeded']]
    
    initial_incorrect_runs = [r for r in results if r['initial_parsed_output'] and not r['initial_correct']]
    initial_mapes = [r['initial_mape'] for r in initial_incorrect_runs]
    avg_initial_mape = np.mean(initial_mapes) * 100 if initial_mapes else 0
    
    final_incorrect_runs = [r for r in results if r['final_parsed_output'] and not r['final_correct']]
    final_mapes = [r['final_mape'] for r in final_incorrect_runs]
    avg_final_mape = np.mean(final_mapes) * 100 if final_mapes else 0

    print("\n--- [ Pillar 3: Adaptivity & Generalization ] ---")
    print(f"Initial Correctness Rate (CoT only): {len(initial_correct_runs)/len(results):.0%}")
    if avg_initial_mape > 0:
        print(f"  - Avg. Initial Numeric Error (MAPE): {avg_initial_mape:.2f}%")
    print(f"Final Correctness Rate (Post-Correction): {len(final_correct_runs)/len(results):.0%}")
    if avg_final_mape > 0:
        print(f"  - Avg. Final Numeric Error (MAPE): {avg_final_mape:.2f}%")
    
    # New Metric
    if len(initial_incorrect_runs) > 0:
        correction_rate = len(self_correction_successes) / len(initial_incorrect_runs)
        print(f"Self-Correction Success Rate: {correction_rate:.0%} ({len(self_correction_successes)} of {len(initial_incorrect_runs)} initial errors fixed)")

    # Pillar 4: Governance & Ethics
    print("\n--- [ Pillar 4: Governance & Ethics ] ---")
    # (Logic remains the same, but now operates on final_correct_runs)
    if "fairness" in task_name.lower():
        disparities = []
        for r in final_correct_runs: 
            try:
                gt_group = ground_truth_calculator(data_df, by_group=True)
                llm_group = r['final_parsed_output']
                errors = [abs(float(llm_group.get(g, 0)) - v) for g, v in gt_group.items()]
                disparities.append(max(errors) - min(errors))
            except Exception:
                continue
        if disparities:
            print(f"Fairness Disparity (Avg ± Std): {np.mean(disparities):.4f} ± {np.std(disparities):.4f}")
        else:
            print("Fairness Metric: Could not be computed (no correct runs).")
    else:
        print("Fairness Metric: Not Applicable for this task.")
    
    print("="*80)

# ==============================================================================
# 5. TASK DEFINITIONS (Unchanged)
# ==============================================================================
# --- Task 1: Simple Aggregation ---
TASK_1_PROMPT = "Calculate the average 'age' and average 'hours-per-week'."
def gt_calculator_task_1(df, **kwargs): return {"average_age": df['age'].mean(), "average_hours": df['hours-per-week'].mean()}
# --- Task 2: Conditional Aggregation ---
TASK_2_PROMPT = "Calculate the average 'hours-per-week' for two groups: those with income '<=50K' and those with income '>50K'."
def gt_calculator_task_2(df, **kwargs): return {"avg_hours_low_income": df[df['income'] == '<=50K']['hours-per-week'].mean(), "avg_hours_high_income": df[df['income'] == '>50K']['hours-per-week'].mean()}
# --- Task 3: Fairness Analysis ---
TASK_3_PROMPT = "Analyze the relationship between race and income. For each racial group, calculate the percentage that has an income of '>50K'."
def gt_calculator_task_3(df, by_group=False):
    high_income_by_race = df[df['income'] == '>50K']['race'].value_counts()
    total_by_race = df['race'].value_counts()
    result = (high_income_by_race / total_by_race * 100).fillna(0).to_dict()
    if by_group: return result
    return result

# ==============================================================================
# 6. MAIN EXECUTION
# ==============================================================================
if __name__ == "__main__":
    tokenizer, model, uci_data = setup_environment()
    
    if model and uci_data is not None:
        all_tasks = [
            ("Task 1: Simple Aggregation", TASK_1_PROMPT, gt_calculator_task_1),
            ("Task 2: Conditional Aggregation", TASK_2_PROMPT, gt_calculator_task_2),
            ("Task 3: Fairness Analysis", TASK_3_PROMPT, gt_calculator_task_3),
        ]

        for name, prompt, calculator in all_tasks:
            print(f"\n--- Running Benchmark for: {name} ---")
            results = []
            for i in range(NUM_ITERATIONS):
                print(f"  - Iteration {i+1}/{NUM_ITERATIONS}...")
                raw_result = run_llm_pipeline(tokenizer, model, prompt, uci_data)
                processed = process_single_run(raw_result, calculator, uci_data)
                results.append(processed)
            
            generate_summary_card(name, results, calculator, uci_data)
    else:
        print("Setup failed. Exiting.")

--- Setting up Environment ---
Loading tokenizer for meta-llama/Llama-3.1-8B-Instruct...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Loading model: meta-llama/Llama-3.1-8B-Instruct... (This may take a moment)


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

2025-09-05 15:03:20.968108: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757084601.221468      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757084601.299922      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Model and tokenizer loaded successfully.
Loading UCI Adult dataset...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Dataset loaded and sampled to 200 rows.

--- Running Benchmark for: Task 1: Simple Aggregation ---
  - Iteration 1/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 2/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 3/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



################################################################################
SUMMARY PIPELINE CARD: Task 1: Simple Aggregation (3 Iterations)
################################################################################

--- [ Pillar 1: Efficiency & Scalability ] ---
Latency (Avg ± Std): 192.75s ± 19.84s (per 2-step run)
Est. Cost (Avg ± Std): $0.056825 ± $0.001312

--- [ Pillar 2: Reliability & Robustness ] ---
Structured Output Rate (Valid JSON): 33% (Post-Correction)

--- [ Pillar 3: Adaptivity & Generalization ] ---
Initial Correctness Rate (CoT only): 0%
Final Correctness Rate (Post-Correction): 0%
  - Avg. Final Numeric Error (MAPE): 5.98%

--- [ Pillar 4: Governance & Ethics ] ---
Fairness Metric: Not Applicable for this task.

--- Running Benchmark for: Task 2: Conditional Aggregation ---
  - Iteration 1/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 2/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 3/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



################################################################################
SUMMARY PIPELINE CARD: Task 2: Conditional Aggregation (3 Iterations)
################################################################################

--- [ Pillar 1: Efficiency & Scalability ] ---
Latency (Avg ± Std): 174.36s ± 38.35s (per 2-step run)
Est. Cost (Avg ± Std): $0.055828 ± $0.002355

--- [ Pillar 2: Reliability & Robustness ] ---
Structured Output Rate (Valid JSON): 0% (Post-Correction)

--- [ Pillar 3: Adaptivity & Generalization ] ---
Initial Correctness Rate (CoT only): 0%
Final Correctness Rate (Post-Correction): 0%
Self-Correction Success Rate: 0% (0 of 2 initial errors fixed)

--- [ Pillar 4: Governance & Ethics ] ---
Fairness Metric: Not Applicable for this task.

--- Running Benchmark for: Task 3: Fairness Analysis ---
  - Iteration 1/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 2/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  - Iteration 3/3...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.



################################################################################
SUMMARY PIPELINE CARD: Task 3: Fairness Analysis (3 Iterations)
################################################################################

--- [ Pillar 1: Efficiency & Scalability ] ---
Latency (Avg ± Std): 208.06s ± 0.11s (per 2-step run)
Est. Cost (Avg ± Std): $0.057883 ± $0.000002

--- [ Pillar 2: Reliability & Robustness ] ---
Structured Output Rate (Valid JSON): 0% (Post-Correction)

--- [ Pillar 3: Adaptivity & Generalization ] ---
Initial Correctness Rate (CoT only): 0%
Final Correctness Rate (Post-Correction): 0%

--- [ Pillar 4: Governance & Ethics ] ---
Fairness Metric: Could not be computed (no correct runs).
