In [1]:
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
#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
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
model_id = "roberta-base"

### Dataset Load

In [5]:
dataset_dir = '/home/infres/iivanova-23/DADA/Data/finalData.csv'
data = pd.read_csv(dataset_dir)
data = data.rename(columns={'AuthorID': 'label', 'Quote': 'text', 'Label': 'type'})
data = data[data['type'] != 'spoof']
data['label'] = data['label'].astype(int)
label_counts = data['label'].value_counts()
labels_to_keep = label_counts[label_counts >= 250].index
data = data[data['label'].isin(labels_to_keep)]
print(f"Number of authors that have more then 100 quotes: {len(data['label'].unique())}")
authors = data['Author'].unique()
for i, author in enumerate(authors):
    data.loc[data['Author'] == author, 'label'] = i

Number of authors that have more then 100 quotes: 7


In [6]:
train_data, temp_data = train_test_split(data, test_size=0.3, stratify=data['label'], random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.6, stratify=temp_data['label'], random_state=42)

train_dataset = Dataset.from_pandas(train_data[['text', 'label']])
val_dataset = Dataset.from_pandas(val_data[['text', 'label']])
test_dataset = Dataset.from_pandas(test_data[['text', 'label']])

In [7]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Extract the number of classess and their names

num_labels = data['label'].nunique()
class_names = data['Author'].unique()
print(f"Number of labels: {num_labels}")
print(f"Labels: {class_names}")

# Create an id2label mapping
# We will need this to directly output the class names when using the pipeline without needing to map the labels later.
id2label = {i: label for i, label in enumerate(class_names)}
print(id2label)
# 3. Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})



Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/341 [00:00<?, ? examples/s]

Map:   0%|          | 0/513 [00:00<?, ? examples/s]

Number of labels: 7
Labels: ['Donald Trump' 'Barack Obama' 'Ayn Rand' 'Christopher Hitchens'
 'Ronald Reagan' 'JFK' 'Winston Churchill']
{0: 'Donald Trump', 1: 'Barack Obama', 2: 'Ayn Rand', 3: 'Christopher Hitchens', 4: 'Ronald Reagan', 5: 'JFK', 6: 'Winston Churchill'}


In [8]:
# Model
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)
repository_id = "~/DADA/authorship_attribution/output"
# TrainingArguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.874,1.88968
2,1.4199,1.250829


TrainOutput(global_step=250, training_loss=1.7789829406738282, metrics={'train_runtime': 43.2328, 'train_samples_per_second': 92.152, 'train_steps_per_second': 5.783, 'total_flos': 262070375731200.0, 'train_loss': 1.7789829406738282, 'epoch': 2.0})

In [10]:
trainer.evaluate()

{'eval_loss': 1.2508294582366943,
 'eval_runtime': 0.6773,
 'eval_samples_per_second': 503.484,
 'eval_steps_per_second': 16.241,
 'epoch': 2.0}