In [None]:
pip install -U transformers accelerate


In [1]:
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertForMultipleChoice, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load the dataset from CSV files
def load_figqa_dataset(train_path, dev_path, test_path):
    train_df = pd.read_csv(train_path)
    dev_df = pd.read_csv(dev_path)
    test_df = pd.read_csv(test_path)

    # Ensure label columns are correctly named
    train_df.rename(columns={'labels': 'label'}, inplace=True)
    dev_df.rename(columns={'labels': 'label'}, inplace=True)
    test_df.rename(columns={'labels': 'label'}, inplace=True)

    # Assign a default value to 'label' in the test dataset if it doesn't exist
    if 'label' not in test_df.columns:
        test_df['label'] = -1

    # Convert DataFrames to Dataset objects
    train_dataset = Dataset.from_pandas(train_df)
    dev_dataset = Dataset.from_pandas(dev_df)
    test_dataset = Dataset.from_pandas(test_df)

    return DatasetDict({
        'train': train_dataset,
        'dev': dev_dataset,
        'test': test_dataset
    })

# Preprocess function for multiple-choice questions
def preprocess_function(examples):
    first_sentences = [[context] * 2 for context in examples['startphrase']]
    question_headers = [examples['ending1'], examples['ending2']]
    choices = list(map(list, zip(*question_headers)))

    first_sentences = sum(first_sentences, [])
    choices = sum(choices, [])

    tokenized_examples = tokenizer(
        first_sentences,
        choices,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    return {
        'input_ids': [tokenized_examples['input_ids'][i:i + 2] for i in range(0, len(tokenized_examples['input_ids']), 2)],
        'attention_mask': [tokenized_examples['attention_mask'][i:i + 2] for i in range(0, len(tokenized_examples['attention_mask']), 2)],
        'labels': examples['label']
    }

# Define paths for dataset
train_path = '/kaggle/input/nlpproject/train_xl.csv'
dev_path = '/kaggle/input/nlpproject/dev.csv'
test_path = '/kaggle/input/nlpproject/train_s.csv'

# Load the Fig-QA dataset
dataset = load_figqa_dataset(train_path, dev_path, test_path)

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')

# Apply the preprocessing function to the dataset
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Check and remove unnecessary columns
columns_to_remove = ['startphrase', 'ending1', 'ending2', 'valid']
for col in columns_to_remove:
    if col in encoded_dataset['test'].column_names:
        encoded_dataset = encoded_dataset.remove_columns([col])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_strategy='epoch',
    save_total_limit=2,
    report_to=[]  # Disable W&B
)

# Define a function to compute evaluation metrics
def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(axis=1)
    return {'accuracy': (preds == labels).mean()}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['dev'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# Evaluate the model on the evaluation set
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Make predictions on the test dataset
test_predictions = trainer.predict(encoded_dataset['test'])

# Extract the predicted labels
predictions = test_predictions.predictions.argmax(axis=1)

# Load original test data for context
test_df = pd.read_csv(test_path)

# Add predicted labels to the DataFrame
test_df['predicted'] = predictions

# Save to a new CSV file
output_csv_path = './results/test_predictions.csv'
test_df.to_csv(output_csv_path, index=False)
print(f"Predictions saved to {output_csv_path}")


2024-05-14 19:48:11.373557: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-14 19:48:11.373668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-14 19:48:11.555438: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8016 [00:00<?, ? examples/s]

Map:   0%|          | 0/1094 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.492,0.440337,0.790676
2,0.448,0.371158,0.83638
3,0.263,0.402555,0.840951




Evaluation Results: {'eval_loss': 0.4025554060935974, 'eval_accuracy': 0.8409506398537477, 'eval_runtime': 10.8392, 'eval_samples_per_second': 100.93, 'eval_steps_per_second': 6.366, 'epoch': 3.0}
Predictions saved to ./results/test_predictions.csv
