In [28]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [29]:
MODEL_NAME = "distilbert-base-uncased" # "small" bert
MAX_LENGTH = 128
BATCH_SIZE = 8 
EPOCHS = 3
DEVICE = torch.device("cpu")  # or cuda (I don't have cuda :( )

In [30]:
df = pd.read_csv("../data/labeled_texts.csv")
df["label"] = df["label"].map({"formal": 0, "informal": 1})
train_df, _ = train_test_split(df, test_size = 0.15, stratify = df["label"], random_state = 123321)
print(train_df.shape)
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

(1099, 2)




In [31]:
# tokenizer testing
sample_text = "Jetbrains is a cool company!"
tokens = tokenizer(sample_text, padding = "max_length", truncation = True, max_length = 128)

print(tokens)

{'input_ids': [101, 6892, 10024, 7076, 2003, 1037, 4658, 2194, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [32]:
def tokenize_texts(texts):
    return tokenizer(texts, truncation = True, padding = "max_length", max_length = MAX_LENGTH, return_tensors = "pt")

class FormalityDataset(Dataset):
    def __init__(self, dataframe):
        self.encodings = tokenize_texts(dataframe["text"].tolist())
        self.labels = torch.tensor(dataframe["label"].tolist())

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [33]:
train_dataset = FormalityDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True)

model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 2)
model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr = 3e-5)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    for batch in tqdm(train_loader, desc = f"Epoch {epoch+1}"):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Avg loss: {total_loss/len(train_loader):.2f}")

model.save_pretrained("../models/distilbert")
tokenizer.save_pretrained("../models/distilbert")

Epoch 1: 100%|████████████████████████████████| 138/138 [03:30<00:00,  1.52s/it]


Epoch 1, Avg loss: 0.20


Epoch 2: 100%|████████████████████████████████| 138/138 [03:35<00:00,  1.56s/it]


Epoch 2, Avg loss: 0.04


Epoch 3: 100%|████████████████████████████████| 138/138 [03:36<00:00,  1.57s/it]


Epoch 3, Avg loss: 0.02


('../models/distilbert/tokenizer_config.json',
 '../models/distilbert/special_tokens_map.json',
 '../models/distilbert/vocab.txt',
 '../models/distilbert/added_tokens.json',
 '../models/distilbert/tokenizer.json')