# Import Libraries

In [None]:
import os

# transformers picks its backends (PyTorch / TensorFlow) when it is first
# imported, so USE_TF must already be in the environment at that point.
# Setting it after the import leaves the TensorFlow code path enabled, which
# is what breaks `transformers.Trainer` on machines with Keras 3 installed.
os.environ["USE_TF"] = "0"

from datasets import load_dataset
import pandas as pd
from transformers import TapasTokenizer, TapasForQuestionAnswering
import torch


# Set Environment Variables

In [None]:
# Ask transformers not to use its TensorFlow backend.
# NOTE(review): transformers has already been imported by the time this line
# runs. If it reads USE_TF at import time, this assignment comes too late to
# have any effect — confirm, and set the variable before the import if so.
os.environ["USE_TF"] = "0"


# Load DataBench QA Dataset

In [None]:
# Load all QA pairs: the "qa" configuration, "train" split, of cardiffnlp/databench
all_qa = load_dataset("cardiffnlp/databench", name="qa", split="train")

# Convert to a pandas DataFrame for easier manipulation
qa_df = pd.DataFrame(all_qa)

# View the first few rows
print(qa_df.head())

# Check the dataset names: the 'dataset' column holds the ID of the table each
# question refers to (it is passed to load_dataset_by_id further down)
dataset_names = qa_df['dataset'].unique()
print(f"Datasets referenced in QA pairs: {dataset_names}")


                                            question answer     type  \
0  Is the person with the highest net worth self-...   True  boolean   
1    Does the youngest billionaire identify as male?   True  boolean   
2  Is the city with the most billionaires in the ...   True  boolean   
3  Is there a non-self-made billionaire in the to...   True  boolean   
4  Does the oldest billionaire have a philanthrop...  False  boolean   

                   columns_used                        column_types  \
0    ['finalWorth', 'selfMade']       ['number[uint32]', 'boolean']   
1             ['age', 'gender']       ['number[UInt8]', 'category']   
2           ['city', 'country']            ['category', 'category']   
3          ['rank', 'selfMade']       ['number[uint16]', 'boolean']   
4  ['age', 'philanthropyScore']  ['number[UInt8]', 'number[UInt8]']   

  sample_answer     dataset  
0         False  001_Forbes  
1          True  001_Forbes  
2          True  001_Forbes  
3         False  001

# Function to Load Dataset by ID

In [None]:
def load_dataset_by_id(ds_id, version='all'):
    """
    Read one DataBench table from the Hugging Face Hub into a DataFrame.

    Args:
        ds_id (str): The dataset ID (e.g., '001_Forbes').
        version (str): 'all' for full dataset, 'sample' for sampled version.

    Returns:
        pd.DataFrame: The loaded table, or None when reading fails for any
        reason (the error is printed rather than raised).
    """
    try:
        parquet_url = f"hf://datasets/cardiffnlp/databench/data/{ds_id}/{version}.parquet"
        frame = pd.read_parquet(parquet_url)
        print(f"Loaded dataset '{ds_id}' ({version} version).")
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None
        print(f"Error loading dataset {ds_id}: {err}")
        frame = None
    return frame


# Load and Preprocess a Sample Dataset

In [None]:
# The first QA pair's 'dataset' value selects the table to load, e.g. '001_Forbes'
ds_id = qa_df.loc[0, 'dataset']

# Fetch the full (not sampled) version of that table
df_full = load_dataset_by_id(ds_id, version='all')

# load_dataset_by_id returns None on failure, so handle that case first
if df_full is None:
    print("Dataset could not be loaded.")
else:
    # Display the first few rows
    print(df_full.head())


Loaded dataset '001_Forbes' (all version).
   rank                personName   age  finalWorth               category  \
0     1                 Elon Musk  50.0      219000             Automotive   
1     2                Jeff Bezos  58.0      171000             Technology   
2     3  Bernard Arnault & family  73.0      158000       Fashion & Retail   
3     4                Bill Gates  66.0      129000             Technology   
4     5            Warren Buffett  91.0      118000  Finance & Investments   

               source        country       state     city  \
0       Tesla, SpaceX  United States       Texas   Austin   
1              Amazon  United States  Washington  Seattle   
2                LVMH         France         NaN    Paris   
3           Microsoft  United States  Washington   Medina   
4  Berkshire Hathaway  United States    Nebraska    Omaha   

                       organization  selfMade gender  \
0                             Tesla      True      M   
1        

# Preprocess Table Function

In [None]:
def preprocess_table(table):
    """
    Preprocesses the table by handling missing values and converting
    categorical columns to plain strings.

    The input DataFrame is left untouched; a cleaned copy is returned.

    Args:
        table (pd.DataFrame): The input table.

    Returns:
        pd.DataFrame: A new table in which categorical columns hold strings
        and every missing value has been replaced by "Unknown".
    """
    # Work on a copy so the caller's DataFrame is not modified as a side effect
    table = table.copy()

    # Convert categorical columns to strings. Missing entries are filled
    # *before* the string cast: casting first would turn them into the literal
    # text 'nan', which fillna() below no longer recognises as missing.
    categorical_cols = table.select_dtypes(include=['category']).columns
    for col in categorical_cols:
        table[col] = table[col].astype(object).fillna("Unknown").astype(str)

    # Fill missing values in the remaining columns
    table = table.fillna("Unknown")

    return table


# Preprocess the Loaded Dataset

In [None]:
# Nothing to clean if the earlier load step failed and left df_full as None
if df_full is None:
    print("Cannot preprocess because the dataset was not loaded.")
else:
    # Preprocess the loaded dataset
    df_preprocessed = preprocess_table(df_full)

    # Display the first few rows
    print(df_preprocessed.head())


   rank                personName   age  finalWorth               category  \
0     1                 Elon Musk  50.0      219000             Automotive   
1     2                Jeff Bezos  58.0      171000             Technology   
2     3  Bernard Arnault & family  73.0      158000       Fashion & Retail   
3     4                Bill Gates  66.0      129000             Technology   
4     5            Warren Buffett  91.0      118000  Finance & Investments   

               source        country       state     city  \
0       Tesla, SpaceX  United States       Texas   Austin   
1              Amazon  United States  Washington  Seattle   
2                LVMH         France         nan    Paris   
3           Microsoft  United States  Washington   Medina   
4  Berkshire Hathaway  United States    Nebraska    Omaha   

                       organization  selfMade gender  \
0                             Tesla      True      M   
1                            Amazon      True      M

# Function to Get Answer Coordinates

In [None]:
def get_answer_coordinates(table, answer_text):
    """
    Generates the coordinates of the answer in the table.

    A cell matches when its string form equals the answer's string form,
    ignoring case. Coordinates are 0-based *positions* (first data row is
    row 0, first column is column 0), independent of the DataFrame's index
    labels.

    Args:
        table (pd.DataFrame): The table where the answer is located.
        answer_text (str): The answer text.

    Returns:
        List[Tuple[int, int]]: List of (row_index, column_index) tuples in
        row-major order; empty if no cell matches.
    """
    # Normalise the answer once instead of once per cell
    target = str(answer_text).lower()

    coordinates = []
    # itertuples keeps each column's own dtype (iterrows would coerce a whole
    # row to one dtype, e.g. turning the int 2 into 2.0), and enumerate gives
    # positional row numbers even when the index is not 0..n-1.
    for row_pos, row in enumerate(table.itertuples(index=False, name=None)):
        for col_pos, value in enumerate(row):
            if str(value).lower() == target:
                coordinates.append((row_pos, col_pos))
    return coordinates


# Dataset Class for Training

In [None]:
from torch.utils.data import Dataset

class DataBenchQADataset(Dataset):
    """
    Torch dataset that turns DataBench QA rows into tokenizer encodings.

    Each item pairs a question with the table named in the row's 'dataset'
    column. When a row's table cannot be loaded, or its answer text does not
    match any table cell, the next usable row (wrapping around) is returned
    in its place.
    """

    def __init__(self, qa_df, tokenizer):
        """
        Args:
            qa_df (pd.DataFrame): QA pairs; the 'dataset', 'question' and
                'answer' columns are read.
            tokenizer: Callable accepting table / queries / answer_coordinates /
                answer_text keyword arguments (a TapasTokenizer in this notebook).
        """
        self.qa_df = qa_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        # Prepared tables keyed by dataset ID. Many questions share one table,
        # so each table is downloaded and cleaned once instead of on every
        # __getitem__ call. Trade-off: loaded tables stay in memory.
        self._table_cache = {}

    def __len__(self):
        return len(self.qa_df)

    def _get_table(self, dataset_id):
        """Return the cleaned, all-text table for dataset_id, or None if loading fails."""
        if dataset_id not in self._table_cache:
            table = load_dataset_by_id(dataset_id, version='all')
            if table is None:
                # Failures are not cached, so a later call will retry the load
                return None
            # The TAPAS tokenizer requires every cell to be text
            self._table_cache[dataset_id] = preprocess_table(table).astype(str)
        return self._table_cache[dataset_id]

    def __getitem__(self, idx):
        """
        Build the encoding for the example at idx, or for the next usable one.

        Returns:
            The tokenizer's encoding with the leading batch dimension removed
            from every tensor.

        Raises:
            ValueError: If no row in the whole dataset yields a usable example.
        """
        num_examples = len(self.qa_df)

        # Visit each row at most once, starting at idx and wrapping around
        for _ in range(num_examples):
            row = self.qa_df.iloc[idx]
            question = row['question']
            answer_text = row['answer']

            # Load the corresponding table
            table = self._get_table(row['dataset'])
            if table is None:
                # Skip to next example
                idx = (idx + 1) % num_examples
                continue

            # Generate answer coordinates
            answer_coordinates = get_answer_coordinates(table, answer_text)
            if not answer_coordinates:
                # Skip to next example
                idx = (idx + 1) % num_examples
                continue

            # Prepare inputs. `queries` is a single string (not a batch), so
            # answer_coordinates must be a flat list of (row, column) tuples
            # and answer_text a flat list of strings.
            inputs = self.tokenizer(
                table=table,
                queries=question,
                answer_coordinates=answer_coordinates,
                answer_text=[answer_text],
                padding='max_length',
                truncation=True,
                return_tensors="pt"
            )

            # Remove batch dimension
            for k in inputs:
                inputs[k] = inputs[k].squeeze(0)

            return inputs

        # If no valid example is found after trying every row
        raise ValueError("No valid examples found in the dataset.")


# Initialize Tokenizer and Dataset

In [None]:
# Load the tokenizer that belongs to the 'google/tapas-base' checkpoint
tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')

# Initialize the dataset: every QA pair in qa_df becomes a candidate training example
train_dataset = DataBenchQADataset(qa_df, tokenizer)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


# Fine-Tune the TAPAS Model

In [None]:
from transformers import TapasForQuestionAnswering, TrainingArguments, Trainer

# Initialize the model from the same checkpoint the tokenizer was loaded from
model = TapasForQuestionAnswering.from_pretrained('google/tapas-base')

# Define training arguments
# NOTE(review): recent transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    learning_rate=1e-5,
    logging_steps=10,
    save_steps=100,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=50,
    save_total_limit=2,
)

# Since we don't have a validation dataset, we'll split the training data:
# the first 90% of examples (in order, no shuffling) train, the last 10% evaluate
split = int(0.9 * len(train_dataset))
train_subset = torch.utils.data.Subset(train_dataset, range(0, split))
eval_subset = torch.utils.data.Subset(train_dataset, range(split, len(train_dataset)))

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

# Start Training


In [None]:
# Start training; requires the `trainer` object created in the fine-tuning cell
trainer.train()


# Save the Fine-Tuned Model

In [None]:
# Save the model and tokenizer side by side in the 'fine-tuned-tapas' directory
# so both can later be reloaded from that one path
model.save_pretrained('fine-tuned-tapas')
tokenizer.save_pretrained('fine-tuned-tapas')


# Test the Fine-Tuned Model

In [None]:
# # Load the fine-tuned model and tokenizer
# tokenizer = TapasTokenizer.from_pretrained('fine-tuned-tapas')
# model = TapasForQuestionAnswering.from_pretrained('fine-tuned-tapas')

# # Example question and dataset
# test_idx = 0  # Index of the test example
# test_row = qa_df.iloc[test_idx]
# test_dataset_id = test_row['dataset']
# test_question = test_row['question']
# test_answer = test_row['answer']

# # Load and preprocess the dataset
# test_table = load_dataset_by_id(test_dataset_id, version='all')
# test_table = preprocess_table(test_table)

# # Prepare inputs
# inputs = tokenizer(
#     table=test_table,
#     queries=test_question,
#     padding='max_length',
#     truncation=True,
#     return_tensors="pt"
# )

# # Get model outputs
# with torch.no_grad():
#     outputs = model(**inputs)

# # Get the predicted answer coordinates
# predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
#     inputs,
#     outputs.logits,
#     outputs.logits_aggregation
# )

# # Extract the answers from the table
# answers = []
# for coordinates in predicted_answer_coordinates:
#     cell_values = []
#     for coord in coordinates:
#         try:
#             cell_value = test_table.iat[coord]
#             cell_values.append(str(cell_value))
#         except Exception as e:
#             print(f"Error accessing cell at {coord}: {e}")
#             continue
#     answers.append(", ".join(cell_values))

# # Handle aggregation (if any)
# aggregation_ops = ['NONE', 'SUM', 'AVERAGE', 'COUNT']
# agg_op_idx = predicted_aggregation_indices[0]
# agg_op = aggregation_ops[agg_op_idx]

# if agg_op == 'NONE':
#     answer = ", ".join(answers)
# elif agg_op == 'SUM':
#     nums = [float(a) for a in answers if a.replace('.', '', 1).isdigit()]
#     answer = str(sum(nums)) if nums else "0"
# elif agg_op == 'AVERAGE':
#     nums = [float(a) for a in answers if a.replace('.', '', 1).isdigit()]
#     answer = str(sum(nums) / len(nums)) if nums else "0"
# elif agg_op == 'COUNT':
#     answer = str(len(answers))
# else:
#     answer = ", ".join(answers)

# print(f"Question: {test_question}")
# print(f"Predicted Answer: {answer}")
# print(f"Actual Answer: {test_answer}")
