In [1]:
pip install -U transformers accelerate


Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
import torch
from transformers import BertTokenizer, BertForMultipleChoice, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

def load_figqa_dataset(train_path, dev_path, test_path):
    """Read the train/dev/test CSV files and wrap them in a ``DatasetDict``.

    Every file is loaded with ``pd.read_csv``. A column called ``labels`` is
    renamed to ``label`` in each split. If the test split has no ``label``
    column, one is added and filled with a placeholder value.

    Args:
        train_path: Path of the training CSV file.
        dev_path: Path of the development (validation) CSV file.
        test_path: Path of the test CSV file.

    Returns:
        A ``DatasetDict`` with the keys ``'train'``, ``'dev'`` and ``'test'``,
        each built with ``Dataset.from_pandas``.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a path does
            not exist.
    """
    # -100 is the default ``ignore_index`` of torch's cross-entropy loss. The
    # previous placeholder (-1) is an out-of-range class index and makes the
    # loss computation fail when the unlabeled split is passed to a model
    # together with its labels. With every target ignored the reported loss
    # is NaN, and any metric computed against these labels is meaningless.
    placeholder_label = -100

    frames = {
        'train': pd.read_csv(train_path),
        'dev': pd.read_csv(dev_path),
        'test': pd.read_csv(test_path),
    }
    # ``rename`` returns new frames, so nothing is mutated in place.
    frames = {
        split: frame.rename(columns={'labels': 'label'})
        for split, frame in frames.items()
    }

    if 'label' not in frames['test'].columns:
        frames['test'] = frames['test'].assign(label=placeholder_label)

    return DatasetDict({
        split: Dataset.from_pandas(frame) for split, frame in frames.items()
    })

def preprocess_function(examples):
    """Tokenize a batch of two-choice examples for a multiple-choice model.

    Each example contributes two (context, ending) pairs: its ``startphrase``
    paired with ``ending1`` and with ``ending2``. All pairs are tokenized in
    one call to the module-level ``tokenizer`` (padded/truncated to 128
    tokens) and then regrouped so every example owns a list of two encodings.

    Args:
        examples: A batch mapping with the list-valued columns
            ``'startphrase'``, ``'ending1'``, ``'ending2'`` and ``'label'``.

    Returns:
        A dict with ``'input_ids'`` and ``'attention_mask'`` (and
        ``'token_type_ids'`` when the tokenizer produces them), each a list
        with one two-element entry per example, plus ``'labels'`` copied from
        ``examples['label']``.
    """
    # Flatten with comprehensions; ``sum(list_of_lists, [])`` re-copies the
    # accumulator on every step and is quadratic in the batch size.
    # Resulting order: ctx0, ctx0, ctx1, ctx1, ...
    first_sentences = [
        context for context in examples['startphrase'] for _ in range(2)
    ]
    # Matching order: ending1_0, ending2_0, ending1_1, ending2_1, ...
    choices = [
        ending
        for pair in zip(examples['ending1'], examples['ending2'])
        for ending in pair
    ]

    tokenized_examples = tokenizer(
        first_sentences,
        choices,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    features = {}
    # Keep token_type_ids as well, so the segment ids of each
    # (context, ending) pair reach the model instead of being dropped.
    for key in ('input_ids', 'token_type_ids', 'attention_mask'):
        if key in tokenized_examples:
            rows = tokenized_examples[key]
            # Regroup the flat list into chunks of two: one chunk per example.
            features[key] = [rows[i:i + 2] for i in range(0, len(rows), 2)]
    features['labels'] = examples['label']
    return features

# Locations of the CSV splits. The recorded run of this cell stopped with a
# FileNotFoundError on train_path, so these files must be available under
# /kaggle/input/nlpproject before the cell is run.
train_path = '/kaggle/input/nlpproject/train_xl.csv'
dev_path = '/kaggle/input/nlpproject/dev.csv'
# NOTE(review): the "test" split is read from train_s.csv - confirm this is intended.
test_path = '/kaggle/input/nlpproject/train_s.csv'

dataset = load_figqa_dataset(train_path, dev_path, test_path)

# The tokenizer must exist before dataset.map runs: preprocess_function reads
# it as a module-level name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMultipleChoice.from_pretrained('bert-base-uncased')

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Drop the raw text columns once they are tokenized. The columns are filtered
# per split: DatasetDict.remove_columns is applied to every split, so checking
# only the test split fails when another split lacks the column.
columns_to_remove = ['startphrase', 'ending1', 'ending2', 'valid']
encoded_dataset = DatasetDict({
    split_name: split.remove_columns(
        [col for col in columns_to_remove if col in split.column_names]
    )
    for split_name, split in encoded_dataset.items()
})

# Trainer configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    report_to=[],  # no reporting integrations
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical because load_best_model_at_end is enabled.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',  # key returned by compute_metrics
)

def compute_metrics(p):
    """Return the accuracy of the highest-scoring choice.

    Args:
        p: An iterable that unpacks into ``(predictions, labels)``, where
            ``predictions`` holds one score per choice along axis 1 and
            ``labels`` holds the index of the correct choice.

    Returns:
        A dict with the single key ``'accuracy'``: the fraction of rows whose
        arg-max prediction equals the label.
    """
    scores, gold = p
    hits = scores.argmax(axis=1) == gold
    return {'accuracy': hits.mean()}

# Fine-tune on the train split, evaluating on the dev split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['dev'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

# Report the metrics on the dev split after training.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Score the test split and pick the highest-scoring ending for each row.
test_predictions = trainer.predict(encoded_dataset['test'])
predictions = test_predictions.predictions.argmax(axis=1)

# Re-read the raw test CSV so the saved file keeps the original columns and
# gains a 'predicted' column.
test_df = pd.read_csv(test_path).assign(predicted=predictions)

output_csv_path = './results/test_predictions.csv'
test_df.to_csv(output_csv_path, index=False)
print(f"Predictions saved to {output_csv_path}")


2024-05-12 17:58:28.414359: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-12 17:58:28.414457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-12 17:58:28.514535: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/nlpproject/train_xl.csv'