In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm
import torch

In [2]:
# Load the dataset; later cells read its "text" and "index" columns.
df = pd.read_csv(filepath_or_buffer='authors.csv')

In [3]:
# Hold out a fifth of the rows for testing; the fixed seed keeps the split repeatable.
test_fraction = 0.2
split_seed = 42
train_df, test_df = train_test_split(
    df,
    random_state=split_seed,
    test_size=test_fraction,
)

In [4]:
# Seed PyTorch before the model is built: the classification head is randomly
# initialised, and DataLoader shuffling and dropout draw from the same global
# RNG. 42 matches the seed already used for the train/test split.
torch.manual_seed(42)

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Size the classification head from the labels instead of relying on the
# library default of two classes. max(2, ...) never shrinks the head below
# that default, so data that trained before still trains identically.
# Assumes the "index" column holds integer class ids starting at 0 — TODO confirm.
num_labels = max(2, int(df["index"].max()) + 1)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def preprocess_data(data, max_length=128):
    """Tokenize the "text" column and gather the "index" column as labels.

    Uses the module-level ``tokenizer``.

    Args:
        data: Object indexable by column name that provides "text" and "index".
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A ``(tokenized, labels)`` pair: the tokenizer output requested as
        PyTorch tensors, and a tensor built from the "index" values.
    """
    texts = list(data["text"])
    encoding_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    tokenized = tokenizer(texts, **encoding_options)

    label_values = list(data["index"])
    labels = torch.tensor(label_values)

    return tokenized, labels

In [8]:
# Encode both splits with the same settings; each result is an (encodings, labels) pair.
(train_data, train_labels), (test_data, test_labels) = (
    preprocess_data(split) for split in (train_df, test_df)
)

In [9]:
# Bundle token ids, attention masks and labels so each dataset item is a 3-tuple.
train_dataset, test_dataset = (
    TensorDataset(encoded["input_ids"], encoded["attention_mask"], targets)
    for encoded, targets in ((train_data, train_labels), (test_data, test_labels))
)

In [10]:
batch_size = 16

# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=batch_size,
)
test_loader = DataLoader(
    dataset=test_dataset,
    shuffle=False,
    batch_size=batch_size,
)

In [14]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no
# longer shipped by recent Transformers releases. eps and weight_decay are
# set explicitly to the old optimizer's defaults (PyTorch's own defaults are
# 1e-8 and 0.01), so the hyperparameters stay the same as before.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.00005,
    eps=1e-6,
    weight_decay=0.0,
)
criterion = torch.nn.CrossEntropyLoss()

In [15]:
epochs = 3
for epoch in range(epochs):
    # Put the model in training mode and restart the running loss for this epoch.
    model.train()
    total_loss = 0

    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in progress:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, then score the logits against the true labels.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss over the epoch.
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {average_loss:.4f}")

Epoch 1/3: 100%|██████████████████████████████████████████████████████████████████████| 478/478 [42:45<00:00,  5.37s/it]


Epoch 1/3, Average Loss: 0.3113


Epoch 2/3:  16%|███████████▍                                                           | 77/478 [05:30<28:41,  4.29s/it]


KeyboardInterrupt: 