<a href="https://colab.research.google.com/github/Motunrayo244/greatExpectationLLM/blob/main/finetuning/GE_Llama_3_2_1B%2B3B_Conversational_baseline_result.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
#!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install optuna
!pip install optuna-dashboard
!pip install datasets
!optuna-dashboard sqlite:///db.sqlite3
!pip install unsloth vllm
!pip install --upgrade pillow

!pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b


In [2]:
# Access files using GitHub API
import requests
import os
import pandas as pd
import requests
import os
import torch
import textwrap
import csv
import time
import json
import optuna
import shutil
import re

from datasets import load_dataset
from datetime import datetime
from sklearn.model_selection import train_test_split
from unsloth import FastLanguageModel, to_sharegpt, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template, train_on_responses_only
from trl import SFTTrainer,SFTConfig
from transformers import TrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
from unsloth.chat_templates import standardize_sharegpt
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-20 21:06:09 [__init__.py:256] Automatically detected platform cuda.


# GELM - Great Expectation Language Model

This notebook is an adaptation of the Unsloth tutorial for fine-tuning Llama 3.2. The original notebook is available at https://colab.research.google.com/drive/1T5-zKWM_5OD21QHwXHiV9ixTRR7k3iB9?usp=sharing, and other Unsloth tutorials are listed at https://docs.unsloth.ai/get-started/unsloth-notebooks.

## Function Definition




In [3]:
def download_folder(url, local_path):
    """
    Recursively download a remote folder listing into a local directory.

    The listing at `url` is expected to be a JSON array whose items carry
    'name', 'type' ('file' or 'dir'), 'download_url' (files) and 'url' (dirs),
    i.e. the shape used by the GitHub contents API this notebook targets.

    Parameters:
    - url (str): URL returning the JSON listing of the folder.
    - local_path (str): Local directory to mirror the folder into (created if missing).

    Notes:
    - Uses the module-level `headers` object (defined elsewhere in the notebook)
      for every request.
    - Failures are reported with `print` and skipped, so one bad item does not
      abort the rest of the download.
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to access {url}: {response.status_code}, {response.text}")
        return

    os.makedirs(local_path, exist_ok=True)
    for item in response.json():
        item_name = item['name']
        item_path = os.path.join(local_path, item_name)
        if item['type'] == 'file':
            print(f"Downloading file: {item_name}")
            # Fetch first and only then open the target, so a failed request
            # never leaves an empty or error-page file on disk.
            file_response = requests.get(item['download_url'], headers=headers)
            if file_response.status_code != 200:
                print(f"Failed to download {item_name}: {file_response.status_code}")
                continue
            with open(item_path, 'wb') as f:
                f.write(file_response.content)
        elif item['type'] == 'dir':
            print(f"Entering subfolder: {item_name}")
            download_folder(item['url'], item_path)

In [None]:
def combine_csv_files(input_folder, output_file):
    """
    Combines all CSV files in a folder (including subfolders) into one CSV file.

    Each row of the combined file gets a 'source_file' column holding the name
    of the CSV it came from. Files that cannot be parsed are reported with
    `print` and skipped.

    Parameters:
    - input_folder (str): Path to the folder containing CSV files.
    - output_file (str): Path to save the combined CSV file.
    """
    all_data = []  # One DataFrame per successfully read CSV

    for root, dirs, files in os.walk(input_folder):
        # Sort in place so the traversal (and therefore the row order of the
        # combined file) does not depend on the filesystem's listing order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.csv'):
                continue
            file_path = os.path.join(root, file)
            print(f"Reading {file_path}")
            try:
                df = pd.read_csv(file_path)
                df['source_file'] = file  # Track which file each row came from
                all_data.append(df)
            except Exception as e:
                print(f"Failed to read {file_path}: {e}")

    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        # Write to the caller-supplied path (the previous code wrote to a file
        # literally named 'output_file' in the working directory).
        combined_df.to_csv(output_file, index=False)
        print(f"Combined CSV saved as {output_file}")
    else:
        print("No CSV files found in the specified folder.")


In [5]:
def get_accepted_expectations(expectations_csv='data/finetuning_dataset/listExpectations.csv'):
    """
    Reads the list of accepted expectations from a CSV file and returns a dictionary.

    Parameters:
    - expectations_csv (str): Path to a CSV with 'Category' and 'Expectations'
      columns. Empty 'Category' cells inherit the value from the row above.
      Defaults to the notebook's bundled list.

    Returns:
    - dict: Maps each category to the list of its 'Expectations' values.

    Raises:
    - Exception: Any error raised while reading or grouping is printed and re-raised.
    """
    try:
        expectation_list = pd.read_csv(expectations_csv, usecols=['Category', 'Expectations'])
        # The category is only written on the first row of each group, so carry it down.
        expectation_list['Category'] = expectation_list['Category'].ffill()
        expectation_category_dict = (
            expectation_list.groupby('Category')['Expectations']
            .apply(list)
            .to_dict()
        )
        print("Accepted expectations successfully loaded.")
        return expectation_category_dict
    except Exception as e:
        print(f"Error loading accepted expectations: {e}")
        raise

In [6]:
def formatting_prompts_func(examples, tokenizer):
    """
    Render every conversation in a batch to plain text with the tokenizer's chat template.

    Parameters:
    - examples: Batch mapping with a "conversations" entry (an iterable of conversations).
    - tokenizer: Object exposing `apply_chat_template`.

    Returns:
    - dict: {"text": [...]} with one rendered string per conversation, in input order.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False keeps the output as text; no generation prompt is appended.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}


def convert_data_to_sharegpt(dataset_file):
  """
  Load a JSON dataset file and pass it through Unsloth's `to_sharegpt`.

  Parameters:
  - dataset_file: Path(s) handed to `load_dataset("json", data_files=...)`.

  Returns:
  - The dataset returned by `to_sharegpt`, built from the "train" split with
    `generated_expectations` as the output column.
  """
  # Template merging the `instruction` and `user_prompt` columns into one prompt.
  prompt_template = "{instruction} [[\n User prompt is: \n{user_prompt}]]"
  raw_dataset = load_dataset("json", data_files=dataset_file, split="train")
  return to_sharegpt(
      raw_dataset,
      merged_prompt=prompt_template,
      output_column_name="generated_expectations",
  )
def to_standardize_sharegpt(dataset, tokenizer):
  """
  Standardize a ShareGPT dataset and add the chat-template text for each row.

  Parameters:
  - dataset: Dataset accepted by `standardize_sharegpt`.
  - tokenizer: Tokenizer forwarded to `formatting_prompts_func`.

  Returns:
  - The standardized dataset after a batched `map` with `formatting_prompts_func`.
  """
  def _render_batch(batch):
    return formatting_prompts_func(batch, tokenizer)

  standardized = standardize_sharegpt(dataset)
  return standardized.map(_render_batch, batched=True)

In [7]:

def shuffle_and_split(df, test_size=0.1, random_state=42):
    """
    Split a DataFrame into train and test frames with `train_test_split`.

    The last column is treated as the label and every other column as a
    feature; both parts are split together and then joined back column-wise.

    Parameters:
    - df: DataFrame whose last column is the label.
    - test_size: Forwarded to `train_test_split`.
    - random_state: Forwarded to `train_test_split`.

    Returns:
    - tuple: (train_df, test_df), each with the features followed by the label.
    """
    feature_part = df.iloc[:, :-1]
    label_part = df.iloc[:, -1]

    split_parts = train_test_split(
        feature_part, label_part, test_size=test_size, random_state=random_state
    )
    features_train, features_test, labels_train, labels_test = split_parts

    # Re-attach each label slice to its matching feature slice.
    train_df = pd.concat([features_train, labels_train], axis=1)
    test_df = pd.concat([features_test, labels_test], axis=1)
    return train_df, test_df


In [8]:
def initialize_model_tokinizer(model_name='unsloth/Llama-3.2-3B-bnb-4bit', max_seq_length=512, dtype=None, load_in_4bit=True):
  """
  Load a model and its tokenizer through `FastLanguageModel.from_pretrained`.

  Parameters:
  - model_name: Checkpoint to load, e.g. "unsloth/Llama-3.2-3B-Instruct" or
    "unsloth/Llama-3.2-1B-Instruct".
  - max_seq_length: Maximum sequence length handed to the loader.
  - dtype: None for auto detection; Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
  - load_in_4bit: Use 4-bit quantization to reduce memory usage; can be False.

  Returns:
  - tuple: (model, tokenizer) as returned by the loader.
  """
  load_kwargs = dict(
      model_name=model_name,
      max_seq_length=max_seq_length,
      dtype=dtype,
      load_in_4bit=load_in_4bit,
      # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )
  model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)
  return model, tokenizer

In [9]:
def define_model(
    model, r=64, target_modules=None, lora_alpha=16, lora_dropout=0, bias="none",use_gradient_checkpointing="unsloth", random_state=20, use_rslora=False, loftq_config=None,
    adapter_name=None, gradient_checkpointing_ratio=0.5, device_map="auto", torch_dtype="auto", use_flash_attention=True, compile_model=False):
    """
    Wrap a base model with LoRA adapters via `FastLanguageModel.get_peft_model`.

    Args:
        model: Pretrained base model to wrap.
        r (int): LoRA rank.
        target_modules (list/None): Module names to adapt; None selects the
            attention projections (q/k/v/o) and MLP projections (gate/up/down).
        lora_alpha (int): LoRA scaling factor.
        lora_dropout (float): Dropout rate for the LoRA layers.
        bias (str): Bias mode ("none", "all", "lora_only").
        use_gradient_checkpointing (bool/str): Forwarded to `get_peft_model`.
        random_state (int): Seed forwarded to `get_peft_model`.
        use_rslora (bool): Whether to use rank-stabilized LoRA.
        loftq_config (dict/None): LoftQ configuration, if any.
        adapter_name (str/None): When truthy, `model.set_adapter(adapter_name)` is called.
        gradient_checkpointing_ratio (float/None): When not None, stored on the
            returned model as the `gradient_checkpointing_ratio` attribute.
        device_map, torch_dtype, use_flash_attention: Accepted for interface
            compatibility; this function does not read them.
        compile_model (bool): When True, the result is passed through `torch.compile`.

    Returns:
        model: The PEFT model with LoRA applied.
    """
    default_targets = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    lora_settings = {
        "r": r,
        "target_modules": default_targets if target_modules is None else target_modules,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "bias": bias,
        "use_gradient_checkpointing": use_gradient_checkpointing,
        "random_state": random_state,
        "use_rslora": use_rslora,
        "loftq_config": loftq_config,
    }
    peft_model = FastLanguageModel.get_peft_model(model, **lora_settings)

    # Optional post-configuration of the wrapped model.
    if adapter_name:
        peft_model.set_adapter(adapter_name)

    if gradient_checkpointing_ratio is not None:
        peft_model.gradient_checkpointing_ratio = gradient_checkpointing_ratio

    return torch.compile(peft_model) if compile_model else peft_model


In [10]:
def train_model(
    model,
    tokenizer,
    train_dataset,
    eval_dataset,
    dataset_text_field = "text",
    max_seq_length = 512,
    dataset_num_proc = 2,
    packing = True,
    learning_rate=1e-4,
    optimizer="adamw_8bit",
    gradient_accumulation_steps=2,
    max_steps=50,
    per_device_train_batch_size=2,
    warmup_steps=5,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy="steps",
    output_dir="outputs",
    seed=20,
):

    """
    Configure and return an SFTTrainer instance (training is not started here).

    Args:
        model: The model to train.
        tokenizer: The tokenizer for the model.
        train_dataset: Dataset for training.
        eval_dataset: Dataset for evaluation.
        dataset_text_field: Name of the dataset column holding the training text.
        max_seq_length: Maximum sequence length passed to the trainer.
        dataset_num_proc: Number of processes used for dataset preparation.
        packing: Whether the trainer packs short sequences together.
        learning_rate: Learning rate for training.
        optimizer: Optimizer to use (e.g., "adamw_8bit").
        gradient_accumulation_steps: Number of gradient accumulation steps.
        max_steps: Maximum number of training steps.
        per_device_train_batch_size: Batch size per device during training.
        warmup_steps: Number of warmup steps.
        weight_decay: Weight decay for regularization.
        logging_steps: Frequency of logging; also used as the evaluation
            interval when `eval_strategy` is "steps".
        eval_strategy: Evaluation strategy ("steps" or "epoch"); the save
            strategy is set to the same value.
        output_dir: Directory to save outputs.
        seed: Random seed for reproducibility.

    Returns:
        SFTTrainer: Configured trainer with early stopping (patience 3) and
        best-model reloading enabled.
    """
    # `load_best_model_at_end=True` requires the save strategy to match the eval
    # strategy and, for "steps", save_steps to be a whole multiple of eval_steps.
    # Evaluate every `logging_steps` (the value the trainer falls back to when
    # eval_steps is unset) and round the roughly-halfway checkpoint interval up
    # to the next multiple of that.
    eval_steps = max(1, int(logging_steps))
    half_run = max(1, int(max_steps / 2))
    save_steps = ((half_run + eval_steps - 1) // eval_steps) * eval_steps

    use_bf16 = is_bfloat16_supported()

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,
        max_steps=max_steps,
        learning_rate=learning_rate,
        fp16=not use_bf16,
        bf16=use_bf16,
        weight_decay=weight_decay,
        logging_steps=logging_steps,
        eval_strategy=eval_strategy,
        eval_steps=eval_steps,
        save_strategy=eval_strategy,
        save_steps=save_steps,
        load_best_model_at_end=True,
        optim=optimizer,
        seed=seed,
        report_to="none",  # Change this to "wandb" or "tensorboard" if needed
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # These three were previously hard-coded, silently ignoring the arguments.
        dataset_text_field=dataset_text_field,
        max_seq_length=max_seq_length,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        dataset_num_proc=dataset_num_proc,
        packing=packing, # Can make training 5x faster for short sequences.
        args=training_args,
    )
    return trainer


In [11]:
def start_training( model, tokenizer, train_dataset, eval_dataset, max_seq_length, learning_rate=1e-4,
                   optimizer="adamw_8bit", max_steps=50,random_state=20, lora_alpha=16,bias = "none" ):
  """
  Attach LoRA adapters, build the trainer, restrict the loss to assistant
  responses, and run training.

  Parameters:
  - model, tokenizer: Base model and tokenizer to fine-tune.
  - train_dataset, eval_dataset: Datasets forwarded to `train_model`.
  - max_seq_length, learning_rate, optimizer, max_steps: Forwarded to `train_model`.
  - random_state, lora_alpha, bias: Forwarded to `define_model`.

  Returns:
  - tuple: (trainer, model, tokenizer), where `model` is the LoRA-wrapped model.
  """
  peft_model = define_model(
    model=model,
    lora_alpha=lora_alpha,
    bias=bias,
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=random_state,
    use_rslora=False,   # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
  )

  sft_trainer = train_model(
    model=peft_model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    max_seq_length=max_seq_length,
    learning_rate=learning_rate,
    optimizer=optimizer,
    max_steps=max_steps,
  )

  # Markers of the Llama 3 chat template that delimit user and assistant turns.
  user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
  assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"
  sft_trainer = train_on_responses_only(
    sft_trainer,
    instruction_part=user_header,
    response_part=assistant_header,
  )

  print("Training...")
  sft_trainer.train()

  return sft_trainer, peft_model, tokenizer



## Prepare Dataset

In [None]:
# Forward slashes work on every platform; the previous backslashes were invalid
# escape sequences ("\c", "\d") and only resolved as path separators on Windows.
aug_dataset = pd.read_csv('../data/finetuning_dataset/combined_processed_dataset.csv',usecols=['user_prompt', 'generated_expectations'])
initial_dataset = pd.read_csv('../data/finetuning_dataset/dataset _from_Industry_data_contract.csv',usecols=['user_prompt', 'generated_expectations'])

# Augmented rows first, then the original industry data-contract rows.
dataset = pd.concat([aug_dataset, initial_dataset], ignore_index=True)
# NOTE(review): this path is relative to the working directory ("data/..."),
# unlike the "../data/..." paths above; confirm which root is intended.
expectation_list = pd.read_excel('data/expectation_and_prompt_sample/listExpectations.xlsx')

In [None]:
# Preview the first rows of the combined fine-tuning dataset.
dataset.head()

Unnamed: 0,user_prompt,generated_expectations
0,For field 'booking_id': Ensure the field is re...,"expect_column_to_exist(column=""booking_id""),\n..."
1,For field 'customer_email': Ensure the field m...,"expect_column_to_exist(column=""customer_email""..."
2,For field 'room_type': Ensure the field is req...,"expect_column_to_exist(column=""room_type""),\n ..."
3,For field 'check_in_date': Ensure the field is...,"expect_column_to_exist(column=""check_in_date"")..."
4,For field 'payment_status': Ensure the field m...,"expect_column_to_exist(column=""payment_status""..."


## Inference.

This section uses the baseline model to generate Great Expectations (GE) output for each prompt in the test dataset.

#### Import Model from Hugging Face

In [None]:
%%capture
! pip install huggingface_hub
from huggingface_hub import login
login(token=userdata.get('hugging_face'))


In [None]:
model_name = "unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit"

# Load the checkpoint with float16 as the torch dtype, then move it to the GPU.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to("cuda")

# Tokenizer from the same checkpoint, so the chat template matches the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

#### Evaluate with test dataset

In [15]:
# Load the test file
test_file_path = "data/test.jsonl"

# Read the test dataset: one JSON object per line. The encoding is explicit so
# the result does not depend on the platform default, and blank lines (e.g. a
# trailing newline at the end of the file) are skipped instead of crashing json.loads.
with open(test_file_path, "r", encoding="utf-8") as file:
    test_data = [json.loads(line) for line in file if line.strip()]


In [16]:
# Ensure faster inference (if supported by your framework)
# Hands the loaded model to Unsloth's `for_inference` before generation starts.
FastLanguageModel.for_inference(model)



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
      (1): LlamaDecoderLay

In [17]:
results = []

# Generation below passes sampling parameters (temperature / min_p), so output
# can vary from run to run. Seed the RNG once, with the same value the training
# helpers default to, so the saved baseline can be regenerated.
INFERENCE_SEED = 20
torch.manual_seed(INFERENCE_SEED)

# Iterate through each test example
for data in tqdm(test_data):
    user_prompt = data["user_prompt"]

    # Prepare the input message
    messages = [{"role": "user", "content": user_prompt}]

    # Tokenize the input
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Required for generation
        return_tensors="pt",
    ).to("cuda")

    # Generate output
    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=256,
        use_cache=True,
        temperature=1.5,
        min_p=0.1
    )

    # Decode the full sequence (prompt + completion); the post-processing cell
    # relies on the "assistant" marker in this text to cut out the reply.
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Append results
    results.append({
        "user_prompt": user_prompt,
        "generated_expectations": generated_text.strip()
    })


100%|██████████| 1420/1420 [6:09:32<00:00, 15.61s/it]


In [18]:
# Post-process to clean generated_expectations.
# The decoded text still contains the prompt, so keep only what follows the
# first 'assistant\n\n' marker, up to a '?>' if one exists (else end of text).
assistant_reply_pattern = re.compile(r"assistant\n\n(.*?)(\?>|$)", re.DOTALL)

for result in results:
    raw_text = result['generated_expectations']
    reply_match = assistant_reply_pattern.search(raw_text)
    # Always set the key: with no marker (e.g. an empty completion whose trailing
    # newlines were stripped) store "" so later cells reading
    # 'great_expectations' do not fail with KeyError.
    result['great_expectations'] = reply_match.group(1).strip() if reply_match else ""



In [19]:
# Inspect the first inference record as a sanity check.
results[0]

{'user_prompt': "Check that the `threat_level` column has values that are either 'Low', 'Medium', or 'High', ensuring uniqueness.",
 'generated_expectations': 'system\n\nCutting Knowledge Date: December 2023\nToday Date: 20 Mar 2025\n\nuser\n\nCheck that the `threat_level` column has values that are either \'Low\', \'Medium\', or \'High\', ensuring uniqueness.assistant\n\nYou can use the following Python code snippet to check for unique values in the \'threat_level\' column.\n\n```python\nimport pandas as pd\n\ndef check_threat_level(df, column, allowed_values):\n    # Check if the column exists\n    if column not in df.columns:\n        raise ValueError(f"Column \'{column}\' does not exist in the DataFrame.")\n\n    # Check if the column values are unique and within the allowed values\n    if not (df[column].unique().tolist() == allowed_values):\n        duplicates = set(df[column].unique()) - set(allowed_values)\n        if duplicates:\n            print(f"Non-unique values found in 

In [20]:
output_file_path = "data/inference_results_baseline.jsonl"

# Persist the inference records as JSON Lines: one serialized record per line.
with open(output_file_path, "w") as file:
    file.writelines(json.dumps(result) + "\n" for result in results)

print(f"Inference completed and saved to {output_file_path}")


Inference completed and saved to data/inference_results_baseline.jsonl


## Evaluation

This section uses ROUGE, BLEU, METEOR, and BERTScore to evaluate the model's inference output against the ground truth.

In [21]:
%%capture
# Evaluation-only dependencies: ROUGE (rouge-score), BLEU (sacrebleu), the
# `evaluate` metric loader plus nltk, and bert_score.
!pip install rouge-score sacrebleu
!pip install evaluate nltk
!pip install bert_score

In [22]:
from rouge_score import rouge_scorer
from evaluate import load

import sacrebleu
import json
import pandas as pd
import os

In [23]:
# Paths to the ground-truth test file, the saved baseline generations,
# and the CSV that will receive the per-example scores
test_file_path = "data/test.jsonl"
inference_file_path = "data/inference_results_baseline.jsonl"
output_csv_path = "data/eval_results_baseline_dataset.csv"

# Ensure the data directory exists (no error if it is already there)
os.makedirs("data", exist_ok=True)

# Loader for the test and inference JSON Lines files
def load_jsonl(file_path):
    """
    Read a JSON Lines file into a list of parsed records.

    Parameters:
    - file_path (str): Path to a UTF-8 text file with one JSON value per line.

    Returns:
    - list: The parsed records in file order; blank lines are skipped.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # Skip whitespace-only lines (e.g. a trailing empty line), which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in file if line.strip()]

# Ground-truth test records and the saved baseline generations.
test_data = load_jsonl(test_file_path)
inference_data = load_jsonl(inference_file_path)


In [24]:
# Inspect the first reloaded inference record.
inference_data[0]

{'user_prompt': "Check that the `threat_level` column has values that are either 'Low', 'Medium', or 'High', ensuring uniqueness.",
 'generated_expectations': 'system\n\nCutting Knowledge Date: December 2023\nToday Date: 20 Mar 2025\n\nuser\n\nCheck that the `threat_level` column has values that are either \'Low\', \'Medium\', or \'High\', ensuring uniqueness.assistant\n\nYou can use the following Python code snippet to check for unique values in the \'threat_level\' column.\n\n```python\nimport pandas as pd\n\ndef check_threat_level(df, column, allowed_values):\n    # Check if the column exists\n    if column not in df.columns:\n        raise ValueError(f"Column \'{column}\' does not exist in the DataFrame.")\n\n    # Check if the column values are unique and within the allowed values\n    if not (df[column].unique().tolist() == allowed_values):\n        duplicates = set(df[column].unique()) - set(allowed_values)\n        if duplicates:\n            print(f"Non-unique values found in 

In [25]:
# Initialize scorers
rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# zip() stops at the shorter input, so a length mismatch would silently drop
# examples from the evaluation. Fail loudly instead.
if len(test_data) != len(inference_data):
    raise ValueError(
        f"Test set has {len(test_data)} rows but inference results have {len(inference_data)}"
    )

# Collect results
results = []

for row_number, (test, inference) in enumerate(zip(test_data, inference_data)):
    # Rows are paired by position; make sure both describe the same prompt.
    if test["user_prompt"] != inference["user_prompt"]:
        raise ValueError(f"Row {row_number}: test and inference prompts do not match")

    # Get the ground-truth and generated expectations
    reference = test["generated_expectations"]
    hypothesis = inference["great_expectations"]

    # Compute ROUGE scores
    rouge_scores = rouge.score(reference, hypothesis)

    # Compute granular (sentence-level) BLEU score
    gran_bleu_score = sacrebleu.sentence_bleu(hypothesis, [reference])

    # Append result
    results.append({
        "user_prompt": test["user_prompt"],
        "reference": reference,
        "hypothesis": hypothesis,
        "rouge1_fmeasure": rouge_scores["rouge1"].fmeasure,
        "rouge2_fmeasure": rouge_scores["rouge2"].fmeasure,
        "rougeL_fmeasure": rouge_scores["rougeL"].fmeasure,
        "granular_bleu_score": gran_bleu_score.score
    })


In [26]:
# Inspect the first scored example (reference, hypothesis and per-example metrics).
results[0]

{'user_prompt': "Check that the `threat_level` column has values that are either 'Low', 'Medium', or 'High', ensuring uniqueness.",
 'reference': 'expect_column_values_to_be_in_set(column="threat_level", value_set=["Low", "Medium", "High"]),expect_column_values_to_be_unique(column="threat_level")',
 'hypothesis': 'You can use the following Python code snippet to check for unique values in the \'threat_level\' column.\n\n```python\nimport pandas as pd\n\ndef check_threat_level(df, column, allowed_values):\n    # Check if the column exists\n    if column not in df.columns:\n        raise ValueError(f"Column \'{column}\' does not exist in the DataFrame.")\n\n    # Check if the column values are unique and within the allowed values\n    if not (df[column].unique().tolist() == allowed_values):\n        duplicates = set(df[column].unique()) - set(allowed_values)\n        if duplicates:\n            print(f"Non-unique values found in the \'{column}\' column:")\n            print(duplicates)\n

In [27]:
# Parallel lists of model outputs and ground-truth texts for the corpus-level metrics.
predictions = []
references = []
for scored_row in results:
    predictions.append(scored_row['hypothesis'])
    references.append(scored_row['reference'])

In [None]:

# Corpus-level BLEU: all predictions against a single reference stream.
corpus_bleu_score = sacrebleu.corpus_bleu(hypotheses=predictions, references=[references])

# METEOR and BERTScore are both scored on the same prediction/reference pairs.
scoring_inputs = {"predictions": predictions, "references": references}

meteor = load('meteor')
meteor_result = meteor.compute(**scoring_inputs)

bertscore = load("bertscore")
bert_results = bertscore.compute(model_type="distilbert-base-uncased", **scoring_inputs)


In [29]:
# Display the corpus-level BLEU result.
corpus_bleu_score

BLEU = 3.08 14.4/3.5/1.8/1.0 (BP = 1.000 ratio = 4.281 hyp_len = 359757 ref_len = 84033)

In [31]:
# Per-example scores as a DataFrame, extended with the BERTScore columns
result_df = pd.DataFrame(results).assign(
    f1=bert_results['f1'],
    precision=bert_results['precision'],
    recall=bert_results['recall'],
)

# Persist the per-example table
result_df.to_csv(output_csv_path, index=False)

# Headline numbers: corpus-level metrics plus the mean of each BERTScore column
bert_means = result_df[['f1', 'precision', 'recall']].mean()
print(f"Corpus BLEU Score: {corpus_bleu_score.score:.2f}")
print(f"Meteor Score: {meteor_result['meteor']:.2f}")
print(f"BERTScore F1: {bert_means['f1']:.2f}")
print(f"BERTScore Precision: {bert_means['precision']:.2f}")
print(f"BERTScore recall: {bert_means['recall']:.2f}")
print(f"Results saved to {output_csv_path}")

# Distribution of every numeric score column
print("\nSummary Statistics:")
print(result_df.describe())


Corpus BLEU Score: 3.08
Meteor Score: 0.17
BERTScore F1: 0.76
BERTScore Precision: 0.71
BERTScore recall: 0.82
Results saved to data/eval_results_baseline_dataset.csv

Summary Statistics:
       rouge1_fmeasure  rouge2_fmeasure  rougeL_fmeasure  granular_bleu_score  \
count      1420.000000      1420.000000      1420.000000          1420.000000   
mean          0.152489         0.043267         0.118691             2.761992   
std           0.050403         0.034175         0.038979             3.345079   
min           0.042553         0.000000         0.034483             0.245813   
25%           0.118679         0.022695         0.094340             1.215553   
50%           0.148936         0.037915         0.115063             1.773684   
75%           0.181818         0.055885         0.138683             3.450078   
max           0.585106         0.526882         0.585106            58.950359   

                f1    precision       recall  
count  1420.000000  1420.000000  14

In [32]:
import pandas as pd
import json

# Path to the inference results file and to the spreadsheet export
inference_file_path = "data/inference_results_baseline.jsonl"
output_excel_path = "data/inference_results_baseline.xlsx"

# Load the inference data
def load_jsonl(file_path):
    """
    Read a JSON Lines file into a list of parsed records.

    Parameters:
    - file_path (str): Path to a UTF-8 text file with one JSON value per line.

    Returns:
    - list: The parsed records in file order; blank lines are skipped.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # Skip whitespace-only lines (e.g. a trailing empty line), which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in file if line.strip()]

inference_data = load_jsonl(inference_file_path)

# Extract the required columns: the prompt and the cleaned model answer
processed_data = [
    {
        "user_prompt": item["user_prompt"],
        "great_expectations": item["great_expectations"]
    }
    for item in inference_data
]

# Convert to DataFrame
df = pd.DataFrame(processed_data)

# Save to Excel
df.to_excel(output_excel_path, index=False)

print(f"Inference results saved to {output_excel_path}")


Inference results saved to data/inference_results_baseline.xlsx
