In [1]:
import pandas as pd
from datasets import load_dataset, DatasetDict

# Initialize the tokenizer for our chosen model.
# `model_name` is the checkpoint identifier handed to `from_pretrained`; the
# cell output below shows tokenizer_config.json, spiece.model and
# tokenizer.json being fetched when this runs.
from transformers import T5Tokenizer

# Checkpoint name; the module-level `tokenizer` built from it is what
# `preprocess_finqa` uses to encode inputs and targets.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:

def _join_passage(passage):
    """Collapse a passage that may be a list of sentences into one string.

    Lists/tuples are joined with single spaces; any other value is returned
    unchanged so plain-string columns keep working exactly as before.
    """
    if isinstance(passage, (list, tuple)):
        return " ".join(str(sentence) for sentence in passage)
    return passage


def preprocess_finqa(examples, max_input_length=1024, max_target_length=128):
    """
    Prepares a batch of FinQA examples for a T5 model.

    The input to the model is a single string containing the question and
    context (pre-text, linearized table, post-text). The output is the
    reasoning program.

    Args:
        examples: A batch in column format (as passed by
            ``Dataset.map(..., batched=True)``) with the keys ``pre_text``,
            ``post_text``, ``table`` and ``qa``; each ``qa`` entry must provide
            ``question`` and ``program``.
        max_input_length: Token length the inputs are padded/truncated to.
        max_target_length: Token length the targets are padded/truncated to.

    Returns:
        The tokenizer output for the inputs, with an added ``labels`` entry
        holding the target token ids. Padding positions in ``labels`` are set
        to -100 so they are ignored by the loss.
    """
    inputs = []
    targets = []

    for i in range(len(examples['pre_text'])):
        # Construct the input string
        question = examples['qa'][i]['question']

        # Linearize the table using pandas for a clean string representation.
        # The first row is treated as the header, the rest as data rows.
        table_data = examples['table'][i]
        if table_data:
            df = pd.DataFrame(table_data[1:], columns=table_data[0])
            table_str = df.to_string()
        else:
            table_str = ""

        # pre_text/post_text may arrive as lists of sentences; join them so the
        # model sees prose rather than a Python list repr ("['...', '...']").
        pre_text = _join_passage(examples['pre_text'][i])
        post_text = _join_passage(examples['post_text'][i])

        # Combine all parts for the model's input
        input_text = f"question: {question} context: {pre_text} {table_str} {post_text}"
        inputs.append(input_text)

        # The target for the model is the reasoning program
        program = examples['qa'][i]['program']
        targets.append(program)

    # Tokenize the processed inputs
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, padding="max_length", truncation=True
    )

    # Tokenize the targets. `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Mask padding in the labels: -100 is the index ignored by the
    # cross-entropy loss, so padded positions do not contribute to training.
    label_ids = labels["input_ids"]
    pad_id = tokenizer.pad_token_id
    if pad_id is not None:
        label_ids = [
            [(token if token != pad_id else -100) for token in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

In [None]:

# Load the dataset from Hugging Face
# Using a pre-split version for convenience
try:
    finqa_dataset = load_dataset("dreamerdeo/finqa")
except Exception as e:
    # Only the download/load step is guarded, so unrelated errors (missing
    # split, bad index) are not misreported as connectivity problems.
    # Raising (instead of exit()) stops this cell with a traceback and keeps
    # the notebook kernel alive.
    raise RuntimeError(
        f"Failed to load dataset. Make sure you are connected to the internet. Error: {e}"
    ) from e


def _first_rows(split, limit):
    """Return the first `limit` rows of `split`, or all rows if it is shorter."""
    return split.select(range(min(limit, len(split))))


# Using smaller splits for a quick demonstration
# For a real run, use the full dataset
train_dataset = _first_rows(finqa_dataset['train'], 1000)  # Use more data for a real run
validation_dataset = _first_rows(finqa_dataset['validation'], 200)
test_dataset = _first_rows(finqa_dataset['test'], 200)

small_finqa_dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

In [None]:


# Tokenize every split of the reduced dataset in batches.
tokenized_datasets = small_finqa_dataset.map(function=preprocess_finqa, batched=True)

# Show which fields a tokenized training example carries.
print(
    "Data preparation complete. Example of tokenized input:",
    tokenized_datasets["train"][0].keys(),
    sep="\n",
)