In [1]:
# !pip install --upgrade accelerate "transformers[torch]"

In [2]:
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
)
import torch




In [3]:
# --- 1. Load Local Dataset Files ---
print("Step 1: Loading local JSON files...")
try:
    with open('./dataset/train.json', 'r') as f:
        train_data = json.load(f)
    with open('./dataset/dev.json', 'r') as f:
        dev_data = json.load(f)
    with open('./dataset/test.json', 'r') as f:
        test_data = json.load(f)
except FileNotFoundError:
    print("Error: Make sure 'train.json', 'dev.json', and 'test.json' are in a 'dataset' folder.")
    exit()

Step 1: Loading local JSON files...


In [8]:
# [This code should go in Cell 5, where the old data cleaning code was]

# --- Correct Data Cleaning for Mixed Types ---
# The 'qa.exe_ans' field contains both numbers (e.g., 94.0) and strings (e.g., 'yes').
# The Hugging Face `datasets` library (using Arrow) tries to guess the column type
# and fails when it sees this mix.
# To fix this, we convert all 'exe_ans' values to strings *before* creating the dataset.
# The official 'evaluate.py' script is designed to handle this, as it converts
# strings back to numbers where appropriate during evaluation.

print("Applying fix: Converting all 'qa.exe_ans' values to string for dataset compatibility.")
for dataset in [train_data, dev_data, test_data]:
    for item in dataset:
        # Check if 'qa' and 'exe_ans' exist to avoid errors
        if 'qa' in item and 'exe_ans' in item['qa']:
            # This ensures all values in this "column" have the same type (string)
            item['qa']['exe_ans'] = str(item['qa']['exe_ans'])

print("Data cleaning complete.")

Applying fix: Converting all 'qa.exe_ans' values to string for dataset compatibility.
Data cleaning complete.


In [9]:
# For a quick demonstration, we'll use a smaller subset.
# For a real run, use the full datasets: train_data, dev_data, test_data
train_subset = train_data[:800]
dev_subset = dev_data[:100]
test_subset = test_data[:100]

In [10]:
# Convert lists of dictionaries to Hugging Face Dataset objects
train_dataset = Dataset.from_list(train_subset)
dev_dataset = Dataset.from_list(dev_subset)
test_dataset = Dataset.from_list(test_subset)

# Combine them into a single DatasetDict
finqa_dataset = DatasetDict({
    'train': train_dataset,
    'validation': dev_dataset,
    'test': test_dataset
})

In [11]:
print(f"Loaded {len(train_dataset)} training samples, {len(dev_dataset)} validation samples, and {len(test_dataset)} test samples.")

Loaded 800 training samples, 100 validation samples, and 100 test samples.


In [12]:
print("Below is the structure of the first example from the training set. \nThis shows all the keys and the format of their values that our `preprocess_function` will receive.")
# Access the first example directly from the Hugging Face Dataset object
first_example = finqa_dataset['train'][0]
# Use json.dumps for pretty-printing the dictionary
print(json.dumps(first_example, indent=2))

Below is the structure of the first example from the training set. 
This shows all the keys and the format of their values that our `preprocess_function` will receive.
{
  "pre_text": [
    "interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .",
    "if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .",
    "foreign currency exposure as more fully described in note 2i .",
    "in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .",
    "dollar-based exposures by entering into forward foreign currency exchange contracts .",
    "the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .",
    "currently , our largest foreign currency exposure is the euro , primarily because our eur

This code snippet **prepares the FinQA dataset for a T5 (Text-to-Text Transfer Transformer) model.**

Here's a brief explanation:

1.  **Loads Tokenizer:** It initializes a `T5Tokenizer` for the "t5-small" model, which is essential for converting text into numerical inputs that a T5 model can understand.
2.  **`preprocess_function`:** This function takes raw FinQA examples (containing questions, pre/post text, tables, and reasoning programs) and transforms them into:
    *   **Input Strings:** It linearizes the question, pre-text, post-text, and tabular data into a single, structured input string, prepending "finqa: " as a task-specific prefix.
    *   **Target Strings:** It extracts the reasoning program as the desired output string.
3.  **Tokenization:** Both the input and target strings are then tokenized using the T5 tokenizer, padded, and truncated to specific maximum lengths (1024 for inputs, 128 for targets). The tokenized targets are stored as `labels` in the output dictionary.
4.  **Applies to Dataset:** Finally, the `preprocess_function` is applied to the entire `finqa_dataset` using the `map` method, processing data in batches and removing original columns to prepare the dataset for direct use in training a T5 model.

In [13]:
# --- 2. Data Preparation ---
print("\nStep 2: Preparing data for T5 model...")

MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """Prepares the FinQA dataset for a T5 model."""
    
    # The 'examples' parameter is a dictionary with keys like 'pre_text', 'post_text', etc.
    # and values that are lists.
    inputs = []
    targets = []

    for i in range(len(examples['id'])):
        # Construct the input string
        question = examples['qa'][i]['question']
        pre_text = " ".join(examples['pre_text'][i])
        post_text = " ".join(examples['post_text'][i])
        
        # Linearize the table using pandas for a clean string representation
        table_data = examples['table'][i]
        if table_data:
            df = pd.DataFrame(table_data[1:], columns=table_data[0])
            table_str = df.to_string(index=False)
        else:
            table_str = ""

        # Combine all parts for the model's input
        # We prepend "finqa: " as a task-specific prefix
        input_text = f"finqa: question: {question} pre_text: {pre_text} table: {table_str} post_text: {post_text}"
        inputs.append(input_text)

        # The target for the model is the reasoning program
        program = examples['qa'][i]['program']
        targets.append(program)
        
    # Tokenize the processed inputs and targets
    model_inputs = tokenizer(inputs, max_length=1024, padding="max_length", truncation=True)
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing function to the entire dataset
tokenized_datasets = finqa_dataset.map(preprocess_function, batched=True, remove_columns=finqa_dataset['train'].column_names)
print("Data preparation complete.")


Step 2: Preparing data for T5 model...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/800 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Data preparation complete.




In [14]:
train = False

In [15]:
if train:
    # --- 3. Model Training ---
    print("\nStep 3: Training the T5 model...")

    model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./finqa_t5_local_model",
        num_train_epochs=5,                # Increase for better performance
        per_device_train_batch_size=4,     # Adjust based on your GPU memory
        per_device_eval_batch_size=4,
        warmup_steps=200,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_total_limit=2,
        load_best_model_at_end=True,
        report_to="none" # Disable wandb/tensorboard logging for this simple example
    )

    # Create the Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
    )

    # Start training
    trainer.train()
    print("Training complete.")

    # Save the final best model
    trainer.save_model("./finqa_t5_final_model")
    print("Best model saved to ./finqa_t5_final_model")

In [None]:
import os
import subprocess
# --- Step 4: Generate Predictions in the Correct Format ---
print("\nStep 4: Generating predictions ready for 'evaluate.py'...")

# IMPORTANT: This is the official tokenization logic from evaluate.py
def correct_program_tokenization(original_program):
    """
    This tokenizer is copied from the official evaluate.py script to ensure
    100% format compatibility.
    """
    if not isinstance(original_program, str):
        return ['EOF']
    original_program = original_program.split(', ')
    program = []
    for tok in original_program:
        cur_tok = ''
        for c in tok:
            if c == ')':
                if cur_tok != '': program.append(cur_tok)
                cur_tok = ''
            cur_tok += c
            if c in ['(', ')']:
                program.append(cur_tok)
                cur_tok = ''
        if cur_tok != '': program.append(cur_tok)
    program.append('EOF')
    return program

device = "cuda" if torch.cuda.is_available() else "cpu"

base_project_dir = r"E:\1.apps\obsidian_folder\Research\Research_code\FinQA" # <--- ADJUST THIS LINE

# Define the name of the folder containing your saved model
model_folder_name = "finqa_t5_final_model"

# Construct the full, absolute path to the model directory using os.path.join
# This function intelligently handles OS-specific path separators.
model_load_path = os.path.join(base_project_dir, model_folder_name)

print(f"Attempting to load model from: {model_load_path}")

try:
    model = T5ForConditionalGeneration.from_pretrained(
        model_load_path,           # Pass the correctly constructed local path
        local_files_only=True,     # Ensure it only looks for local files
        force_download=False       # Explicitly prevent download attempts
    ).to(device)
    print("Model loaded successfully!")
except Exception as e:
    print(f"\nERROR: Failed to load model. Please check the path and folder contents.")
    print(f"Detailed error: {e}")
    print(f"The path attempted was: {model_load_path}")
    print("Make sure that directory contains files like 'pytorch_model.bin', 'config.json', 'tokenizer.json', etc.")
    exit() # Exit if model loading fails, as subsequent steps will also fail

predictions_for_eval = []

# We use the original test_dataset to access all necessary fields
for item in test_dataset:
    # Prepare input for the model
    question = item['qa']['question']
    pre_text = " ".join(item['pre_text'])
    post_text = " ".join(item['post_text'])
    table_data = item['table']
    table_str = pd.DataFrame(table_data[1:], columns=table_data[0]).to_string(index=False) if table_data else ""
    input_text = f"finqa: question: {question} pre_text: {pre_text} table: {table_str} post_text: {post_text}"
    
    # Generate the program string from the model
    input_ids = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True).input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128)
    predicted_program_string = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Tokenize the generated string into the required list format
    tokenized_program_list = correct_program_tokenization(predicted_program_string)
    
    # Create the prediction entry with the correct key ("predicted") and format
    prediction_entry = {
        "id": item["id"],
        "predicted": tokenized_program_list # This is the crucial part
    }
    predictions_for_eval.append(prediction_entry)

# Save the correctly formatted predictions to a file
output_filename = 'predictions_final.json'
with open(output_filename, 'w') as f:
    json.dump(predictions_for_eval, f, indent=4)
print(f"'{output_filename}' created successfully in the correct format.")



Step 4: Generating predictions ready for 'evaluate.py'...
Attempting to load model from: E:\1.apps\obsidian_folder\Research\Research_code\FinQA\finqa_t5_final_model
Model loaded successfully!
'predictions_final.json' created successfully in the correct format.

--- DETAILED DEBUGGING OF FIRST 3 SAMPLES ---

--- Comparing Sample ID: ETR/2016/page_23.pdf-2 ---
Question: what is the net change in net revenue during 2015 for entergy corporation?
  GOLD Program:      ['subtract(', '5829', '5735', ')', 'EOF']
  PREDICTED Program: ['subtract(', '5735', '5829', ')', 'EOF']
  GOLD Answer:       94.0
  => Program Structure: MISMATCH.

--- Comparing Sample ID: INTC/2015/page_41.pdf-4 ---
Question: what percentage of total facilities as measured in square feet are leased?
  GOLD Program:      ['divide(', '8.1', '56.0', ')', 'EOF']
  PREDICTED Program: ['divide(', '2.1', '6.0', ')', 'EOF']
  GOLD Answer:       0.14464
  => Program Structure: MISMATCH.

--- Comparing Sample ID: ADI/2011/page_61.pdf

In [17]:

# --- Step 4.5: Export Detailed Comparison File for All Test Samples ---
print("\n--- Exporting detailed comparison for all test samples ---")

# Define the output file name
comparison_filename = "model_predictions_comparison.txt"

# For easy lookup, create a dictionary of the gold standard test data by ID
gold_data_map = {item['id']: item for item in test_dataset}

# Use a list to build the file content for efficiency
output_lines = []
output_lines.append("="*80)
output_lines.append("         MODEL PREDICTION vs. GOLD STANDARD COMPARISON")
output_lines.append("="*80 + "\n")


# Loop through ALL of your model's predictions
for i, pred_item in enumerate(predictions_for_eval):
    pred_id = pred_item['id']
    predicted_program_list = pred_item['predicted']

    # Find the corresponding gold standard data point
    if pred_id in gold_data_map:
        gold_item = gold_data_map[pred_id]
        gold_program_string = gold_item['qa']['program']
        
        # IMPORTANT: Tokenize the gold program in the exact same way
        # as the prediction for a fair comparison.
        gold_program_list = correct_program_tokenization(gold_program_string)
        gold_answer = gold_item['qa']['exe_ans']

        # Determine if the programs match
        is_match = (predicted_program_list == gold_program_list)
        match_status = "MATCH!" if is_match else "MISMATCH"

        # Append the formatted comparison to our list of lines
        output_lines.append(f"--- SAMPLE {i+1}/{len(predictions_for_eval)} | ID: {pred_id} ---")
        output_lines.append(f"Question: {gold_item['qa']['question']}")
        output_lines.append(f"  GOLD Program:      {gold_program_list}")
        output_lines.append(f"  PREDICTED Program: {predicted_program_list}")
        output_lines.append(f"  GOLD Answer:       {gold_answer}")
        output_lines.append(f"  => Program Structure: {match_status}")
        output_lines.append("-" * 50 + "\n") # Add a separator for readability

    else:
        output_lines.append(f"[Warning] Could not find gold data for predicted ID: {pred_id}\n")

# Write all the collected lines to the file at once
with open(comparison_filename, 'w', encoding='utf-8') as f:
    f.write("\n".join(output_lines))

print(f"Comparison file '{comparison_filename}' has been created successfully.")
print("You can now open this file to inspect all predictions side-by-side.")



--- Exporting detailed comparison for all test samples ---
Comparison file 'model_predictions_comparison.txt' has been created successfully.
You can now open this file to inspect all predictions side-by-side.


In [None]:

# --- Step 5: Run the Unchanged `evaluate.py` Script ---
print(f"\nStep 5: Running the official evaluation script...")
gold_standard_file = './dataset/test.json'
evaluation_script = './code/evaluate/evaluate.py' # This is evaluate.py in this environment

if os.path.exists(gold_standard_file):
    command = ["python", evaluation_script, output_filename, gold_standard_file]
    result = subprocess.run(command, capture_output=True, text=True)

    print("\n--- Evaluation Results ---")
    if result.stdout:
        print("Output from script:")
        print(result.stdout)
    if result.stderr:
        print("Errors from script (if any):")
        print(result.stderr)
    if result.returncode == 0:
        print("Evaluation script completed successfully.")
    else:
        print(f"Evaluation script failed with exit code: {result.returncode}")
else:
    print(f"\nError: Gold standard file not found at '{gold_standard_file}'.")

In [22]:
# --- Super Simple FINQA-Style Sanity Check ---
# This should be run AFTER the model has been fine-tuned.

print("\n" + "="*60)
print("     RUNNING A SUPER SIMPLE FINQA-STYLE SANITY CHECK")
print("="*60)
print("This test checks if the fine-tuned model has learned the most basic")
print("structure of the FinQA task: creating a simple program.")

# 1. Define an extremely simple FinQA-style input
# The context is minimal and the numbers are obvious.
simple_question = "What is the total of the first and second values?"
simple_pre_text = "The first value is 50. The second value is 25."
simple_table_str = "" # No table
simple_post_text = "" # No post_text

# 2. Define the expected, perfect program output
expected_program = "add(50, 25)"

# 3. Construct the input string in the exact FinQA format
input_text = f"finqa: question: {simple_question} pre_text: {simple_pre_text} table: {simple_table_str} post_text: {simple_post_text}"

print(f"\n[INFO] Feeding this input to the model:\n'{input_text}'")

# 4. Use the fine-tuned model to generate the program
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
outputs = model.generate(input_ids, max_length=128)
predicted_program_string = tokenizer.decode(outputs[0], skip_special_tokens=True)

# 5. Show the results
print("\n--- RESULTS ---")
print(f"  [EXPECTED]: {expected_program}")
print(f"  [MODEL'S ACTUAL OUTPUT]: {predicted_program_string}")
print("-" * 30)

# 6. Analyze the result
# We use .strip() to ignore any accidental leading/trailing whitespace
if predicted_program_string.strip() == expected_program:
    print("\n[ANALYSIS]: SUCCESS! The fine-tuned model passed the simplest FinQA test.")
    print("This is a strong positive signal that the training process was effective and")
    print("the model is learning to generate programs in the correct format.")
else:
    print("\n[ANALYSIS]: FAILED. The model did not produce the correct program for this trivial case.")
    print("If this fails after training, it suggests a problem with the training process itself, such as:")
    print("  - Not enough training data or epochs (the model is still severely undertrained).")
    print("  - The learning rate is too high or too low, preventing effective learning.")
    print("  - A subtle bug in the `preprocess_function` is creating bad training examples.")

print("\n" + "="*60)
print("      FINQA SANITY CHECK COMPLETE")
print("="*60)


     RUNNING A SUPER SIMPLE FINQA-STYLE SANITY CHECK
This test checks if the fine-tuned model has learned the most basic
structure of the FinQA task: creating a simple program.

[INFO] Feeding this input to the model:
'finqa: question: What is the total of the first and second values? pre_text: The first value is 50. The second value is 25. table:  post_text: '

--- RESULTS ---
  [EXPECTED]: add(50, 25)
  [MODEL'S ACTUAL OUTPUT]: duplicate
------------------------------

[ANALYSIS]: FAILED. The model did not produce the correct program for this trivial case.
If this fails after training, it suggests a problem with the training process itself, such as:
  - Not enough training data or epochs (the model is still severely undertrained).
  - The learning rate is too high or too low, preventing effective learning.
  - A subtle bug in the `preprocess_function` is creating bad training examples.

      FINQA SANITY CHECK COMPLETE


In [21]:
# --- Super Simple "No-Training-Required" Sanity Check ---

print("\n" + "="*60)
print("  RUNNING A SUPER SIMPLE SANITY CHECK (NO TRAINING NEEDED)")
print("="*60)
print("This test checks if the base T5 model is loaded and working correctly.")
print("It uses a simple translation task that T5 can do without fine-tuning.")

# 1. Define a standard T5 translation prompt
# T5 was pre-trained on tasks like this.
translation_prompt = "translate English to German: Hello, how are you?"

# 2. Define the expected, reasonable output
# The exact output can vary slightly, but it should be a correct translation.
expected_german_translation = "Hallo, wie geht es Ihnen?"

print(f"\n[INFO] Feeding this input to the model:\n'{translation_prompt}'")

# 3. Use the model to generate the text
# Ensure the model and tokenizer are the ones loaded in your script.
input_ids = tokenizer(translation_prompt, return_tensors="pt").input_ids.to(device)
outputs = model.generate(input_ids, max_length=128)
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# 4. Show the results
print("\n--- RESULTS ---")
print(f"  [A REASONABLE EXPECTED OUTPUT]: {expected_german_translation}")
print(f"  [MODEL'S ACTUAL OUTPUT]:      {predicted_text}")
print("-" * 30)

# 5. Analyze the result
if "Hallo" in predicted_text and "geht" in predicted_text:
    print("\n[ANALYSIS]: SUCCESS! The model produced a correct German translation.")
    print("This confirms that:")
    print("  - The T5 model from Hugging Face was loaded correctly.")
    print("  - The tokenizer is working.")
    print("  - The `model.generate()` function is executing properly.")
    print("\nThis strongly implies that the previous failures are due to the model NOT being fine-tuned on the FinQA task.")
else:
    print("\n[ANALYSIS]: FAILED. The model did not produce the expected translation.")
    print("If this fails, there is a fundamental problem with how the model is being loaded from the local path, or the files themselves are corrupt.")
    print("Please double-check the `model_load_path` and the contents of the model directory.")


print("\n" + "="*60)
print("      SUPER SIMPLE SANITY CHECK COMPLETE")
print("="*60)


  RUNNING A SUPER SIMPLE SANITY CHECK (NO TRAINING NEEDED)
This test checks if the base T5 model is loaded and working correctly.
It uses a simple translation task that T5 can do without fine-tuning.

[INFO] Feeding this input to the model:
'translate English to German: Hello, how are you?'

--- RESULTS ---
  [A REASONABLE EXPECTED OUTPUT]: Hallo, wie geht es Ihnen?
  [MODEL'S ACTUAL OUTPUT]:      Hallo, wie sind Sie?
------------------------------

[ANALYSIS]: FAILED. The model did not produce the expected translation.
If this fails, there is a fundamental problem with how the model is being loaded from the local path, or the files themselves are corrupt.
Please double-check the `model_load_path` and the contents of the model directory.

      SUPER SIMPLE SANITY CHECK COMPLETE
