In [1]:
import os
import requests

# URLs and target paths
file_urls = {
    "data/Gungor_2018_VictorianAuthorAttribution_data-train.csv":
        "https://dataworks.indianapolis.iu.edu/bitstream/handle/11243/23/Gungor_2018_VictorianAuthorAttribution_data-train.csv?sequence=2&isAllowed=y",
    "data/Gungor_2018_VictorianAuthorAttribution_data.csv":
        "https://dataworks.indianapolis.iu.edu/bitstream/handle/11243/23/Gungor_2018_VictorianAuthorAttribution_data.csv?sequence=3&isAllowed=y"
}

# Ensure data directory exists
os.makedirs("data", exist_ok=True)

# Download missing files
for filename, url in file_urls.items():
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"Saved to {filename}")
    else:
        print(f"File already exists: {filename}")


File already exists: data/Gungor_2018_VictorianAuthorAttribution_data-train.csv
File already exists: data/Gungor_2018_VictorianAuthorAttribution_data.csv


In [2]:
import pandas as pd

# Load the CSV files
train_df = pd.read_csv("data/Gungor_2018_VictorianAuthorAttribution_data-train.csv", encoding="ISO-8859-1")
full_df = pd.read_csv("data/Gungor_2018_VictorianAuthorAttribution_data.csv", encoding="ISO-8859-1")


In [3]:
from sklearn.model_selection import train_test_split

# Encode author names as categorical labels
train_df['author_label'] = train_df['author'].astype('category').cat.codes
label_map = dict(enumerate(train_df['author'].astype('category').cat.categories))

# Optional: Save label_map for later use
import json
with open("data/label_map.json", "w") as f:
    json.dump(label_map, f)

# Train/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['text'].tolist(),
    train_df['author_label'].tolist(),
    test_size=0.1,
    stratify=train_df['author_label'],
    random_state=42
)


In [4]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import torch

class AuthorDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }
    def __len__(self):
        return len(self.labels)

train_dataset = AuthorDataset(train_encodings, train_labels)
val_dataset = AuthorDataset(val_encodings, val_labels)


In [6]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import os
from sklearn.metrics import accuracy_score
import numpy as np

os.makedirs("./results", exist_ok=True)
os.makedirs("./logs", exist_ok=True)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_map))
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}



training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,2.042
1000,0.7962
1500,0.4357
2000,0.2148
2500,0.1342
3000,0.0804
3500,0.0372


TrainOutput(global_step=3775, training_loss=0.4976195132021872, metrics={'train_runtime': 2332.7292, 'train_samples_per_second': 103.548, 'train_steps_per_second': 1.618, 'total_flos': 3.2022037234944e+16, 'train_loss': 0.4976195132021872, 'epoch': 5.0})

In [7]:
output_dir = "saved_model"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/vocab.txt',
 'saved_model/added_tokens.json',
 'saved_model/tokenizer.json')