In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [7]:
# Run configuration: every knob a reader might want to tune lives in this cell.
MODEL_NAME = "distilbert-base-uncased"  # checkpoint name passed to from_pretrained
MAX_LENGTH = 128  # shorter input
BATCH_SIZE = 8    # smaller batch
EPOCHS = 1        # just 1 to test speed
SEED = 42         # seed for torch's global RNG (set below)

# Pick the GPU automatically when one is visible; otherwise stay on the CPU.
# On a CPU-only machine this resolves to exactly torch.device("cpu").
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch once, early, so stochastic steps that draw from torch's global
# RNG are repeatable after Restart & Run All.
torch.manual_seed(SEED)

In [8]:
# Load the labelled corpus and turn the string labels into integer class ids.
LABEL_TO_ID = {"formal": 0, "informal": 1}

df = pd.read_csv("../data/labeled_texts.csv")

# Series.map returns NaN for every value that is not a key of the mapping
# (typos, different casing, blanks). Fail loudly here instead of letting a
# float/NaN label column reach the model.
label_ids = df["label"].map(LABEL_TO_ID)
unmapped = label_ids.isna()
if unmapped.any():
    bad_values = df.loc[unmapped, "label"].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'label' column (expected {sorted(LABEL_TO_ID)}): {bad_values!r}"
    )
df["label"] = label_ids.astype("int64")

# Stratified 80/20 split with a fixed random_state. The held-out part gets a
# real name rather than `_`, which IPython uses for the last cell output.
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=42
)

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)



In [4]:
def tokenize_texts(texts):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to ``MAX_LENGTH`` tokens, and the
    result is requested as PyTorch tensors (``return_tensors="pt"``).

    Args:
        texts: The text(s) to encode; the caller in this notebook passes a
            list of strings.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )
    encoded = tokenizer(texts, **tokenizer_options)
    return encoded

class FormalityDataset(Dataset):
    """Map-style dataset built from a dataframe with ``text`` and ``label`` columns.

    All texts are tokenized once, up front, in the constructor; indexing then
    only slices the stored encodings and labels.
    """

    def __init__(self, dataframe):
        """Tokenize the ``text`` column and store ``label`` as a tensor."""
        raw_texts = dataframe["text"].tolist()
        self.encodings = tokenize_texts(raw_texts)
        raw_labels = dataframe["label"].tolist()
        self.labels = torch.tensor(raw_labels)

    def __len__(self):
        """Return the number of examples (one per stored label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: each encoding field at ``idx`` plus ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

In [5]:
# Build the training data pipeline: tokenized dataset + shuffled mini-batches.
train_dataset = FormalityDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Pretrained DistilBERT with a 2-way sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)

# Use PyTorch's own AdamW: the `AdamW` exported by transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are pinned to the old
# transformers defaults (1e-6 and 0.0) so the optimisation itself is unchanged;
# torch's defaults would otherwise be 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0
)

model.train()
for epoch in range(EPOCHS):
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        # Clear gradients before the forward pass so leftovers from an
        # interrupted/re-run cell can never leak into this step.
        optimizer.zero_grad()
        outputs = model(**batch)  # batch includes "labels", so outputs.loss is set
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Avg loss: {total_loss/len(train_loader):.2f}")

# Save model and tokenizer side by side so the directory can be reloaded
# with from_pretrained.
model.save_pretrained("../models/distilbert-formality-fast")
tokenizer.save_pretrained("../models/distilbert-formality-fast")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|                             | 57/21539 [00:21<2:12:53,  2.69it/s]


KeyboardInterrupt: 