In [1]:
#!pip install -r requirements.txt

In [2]:
import os, math, random
import numpy as np, pandas as pd, matplotlib as mpl, matplotlib.pyplot as plt, seaborn as sns
from dataclasses import dataclass, asdict
from typing import Tuple, List
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support

In [3]:
# plot style from IKT215, dpi set to 200
def set_mpl_params(dpi: int = 200, figsize: Tuple[int, int] = (9, 6), grid: bool = True, font_size: int = 12, font_family: str = 'serif') -> None:
    mpl.rcParams['figure.dpi'] = dpi
    mpl.rcParams['figure.figsize'] = figsize
    mpl.rcParams['axes.grid'] = grid
    mpl.rcParams.update({'font.size': font_size})
    mpl.rcParams['font.family'] = font_family

In [4]:
# reproducilbity
def seed_everything(seed: int = 42) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [5]:
# hyperparams given in the assignment text, dataclass decorator used for cleaner setup
@dataclass
class Config:
    model_name: str = "distilbert-base-uncased"
    batch_size: int = 16
    grad_accum_steps: int = 2
    epochs: int = 10
    lr: float = 0.00002
    weight_decay: float = 0.01
    max_len: int = 512
    val_split: float = 0.2
    seed: int = 42
    warmup_ratio: float = 0.1

In [6]:
cfg = Config()
set_mpl_params()
seed_everything(cfg.seed)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
print("Using device:", DEVICE)

Using device: cuda


In [8]:
df = pd.read_parquet("dataset.parquet")
assert {"title", "content", "label"}.issubset(df.columns)

In [9]:
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["label"])
# converting np.int64 in dicts to plain python types due to issues with data type handling
# str(lbl) makes it so json can serialize labels, and int(i) makes it so the ids are int instead
label2id = {str(lbl): int(i) for lbl, i in zip(le.classes_, range(len(le.classes_)))}
id2label = {int(i): str(lbl) for lbl, i in enumerate(le.classes_)}

In [10]:
# split into train, validation, and test (0.6 / 0.2 / 0.2)
train_df, temp_df = train_test_split(df, test_size = 0.4, stratify = df["label_id"], random_state = cfg.seed)
val_df, test_df = train_test_split(temp_df, test_size = 0.5, stratify = temp_df["label_id"], random_state = cfg.seed)
print(f"train size: {len(train_df)}, validation size: {len(val_df)}, test size: {len(test_df)}")

train size: 84000, validation size: 28000, test size: 28000


In [11]:
# verify label distribution consistency
print("\nlabel distribution in full dataset:")
print(df['label'].value_counts(normalize = True).round(3))

print("\nlabel distribution in splits:")
print("train:", train_df['label'].value_counts(normalize = True).round(3).head(14))
print("val:  ", val_df['label'].value_counts(normalize = True).round(3).head(14))
print("test: ", test_df['label'].value_counts(normalize = True).round(3).head(14))


label distribution in full dataset:
label
4     0.071
13    0.071
5     0.071
9     0.071
2     0.071
7     0.071
0     0.071
12    0.071
3     0.071
6     0.071
8     0.071
10    0.071
11    0.071
1     0.071
Name: proportion, dtype: float64

label distribution in splits:
train: label
1     0.071
6     0.071
3     0.071
12    0.071
11    0.071
13    0.071
5     0.071
8     0.071
2     0.071
9     0.071
4     0.071
7     0.071
10    0.071
0     0.071
Name: proportion, dtype: float64
val:   label
12    0.071
9     0.071
8     0.071
10    0.071
0     0.071
4     0.071
6     0.071
11    0.071
5     0.071
3     0.071
13    0.071
1     0.071
2     0.071
7     0.071
Name: proportion, dtype: float64
test:  label
8     0.071
4     0.071
7     0.071
6     0.071
2     0.071
1     0.071
0     0.071
5     0.071
12    0.071
13    0.071
10    0.071
9     0.071
3     0.071
11    0.071
Name: proportion, dtype: float64


In [12]:
class DBpediaDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: DistilBertTokenizer, max_length: int = 512):
        # combine title and content into a single text input
        # title provides context and content has the detail, joining both improves the coverage
        # "[SEP]" added to help the model distinguish sections
        self.texts = [f"title: {t} [SEP] content: {c}" for t, c in zip(df["title"], df["content"])]
        self.labels = df["label_id"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict:
        text = str(self.texts[idx]) if self.texts[idx] is not None else ""
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding = "max_length", truncation = True, max_length = self.max_length, return_tensors = "pt")
        return {"input_ids": encoding["input_ids"].squeeze(0), "attention_mask": encoding["attention_mask"].squeeze(0), "labels": torch.tensor(label, dtype = torch.long)}

In [13]:
tokenizer = DistilBertTokenizer.from_pretrained(cfg.model_name)

In [14]:
train_ds = DBpediaDataset(train_df, tokenizer, cfg.max_len)
val_ds = DBpediaDataset(val_df, tokenizer, cfg.max_len)
test_ds = DBpediaDataset(test_df, tokenizer, cfg.max_len)
train_loader = DataLoader(train_ds, batch_size = cfg.batch_size, shuffle = True, num_workers = 2, pin_memory = torch.cuda.is_available())
val_loader = DataLoader(val_ds, batch_size = cfg.batch_size, shuffle = False, num_workers = 2, pin_memory = torch.cuda.is_available())
test_loader = DataLoader(test_ds, batch_size = cfg.batch_size, shuffle = False, num_workers = 2, pin_memory = torch.cuda.is_available())

In [15]:
# verify tokenized tensor shapes
sample = next(iter(train_loader))
print(f"input_ids shape: {sample['input_ids'].shape}")
print(f"attention_mask shape: {sample['attention_mask'].shape}")
print(f"labels shape: {sample['labels'].shape}")

input_ids shape: torch.Size([16, 512])
attention_mask shape: torch.Size([16, 512])
labels shape: torch.Size([16])


In [16]:
model = DistilBertForSequenceClassification.from_pretrained(cfg.model_name, num_labels = 14, id2label = id2label, label2id = label2id).to(DEVICE)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model_config = model.config

In [18]:
print({"n_layers": model_config.n_layers, "hidden_size": model_config.dim, "ffn_dim": model_config.hidden_dim, "activation": model_config.activation, "n_heads": model_config.n_heads})

{'n_layers': 6, 'hidden_size': 768, 'ffn_dim': 3072, 'activation': 'gelu', 'n_heads': 12}


In [19]:
def get_attention_weights(model: DistilBertForSequenceClassification, tokenizer: DistilBertTokenizer, text: str, layer_idx: int = 5) -> Tuple[np.ndarray, List[str]]:
    inputs = tokenizer(text, return_tensors = 'pt', truncation = True, padding = True, max_length = 512).to(DEVICE)
    model.eval()
    with torch.no_grad():
        outputs = model.base_model(**inputs, output_attentions=True)
    attentions = outputs.attentions
    layer_attention = attentions[layer_idx][0].detach().cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())
    return layer_attention, tokens

In [20]:
def plot_attention_heatmap(attn: np.ndarray, tokens: List[str], title: str, path: str) -> None:
    set_mpl_params()
    plt.figure(dpi=200)
    plt.imshow(attn, interpolation="nearest", aspect="auto")
    plt.xticks(range(len(tokens)), tokens, rotation=90)
    plt.yticks(range(len(tokens)), tokens)
    plt.title(title)
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(path, dpi=200)
    plt.close()

In [21]:
sentence = "A robot may not injure a human being or, through inaction, allow a human being to come to harm."
# get middle transformer block and one-third into the head set (to avoid extremes)
LAYER, HEAD = model.config.n_layers // 2, model.config.n_heads // 3

In [22]:
pre_attn, pre_tokens = get_attention_weights(model, tokenizer, sentence, LAYER)
plot_attention_heatmap(pre_attn[HEAD], pre_tokens, f"Pre-training layer {LAYER}, head {HEAD}", "graphs/attn_pre_single.png")
plot_attention_heatmap(pre_attn.mean(axis=0), pre_tokens, f"Pre-training layer {LAYER} (Average heads)", "graphs/attn_pre_avg.png")



In [23]:
raw_lengths = [len(tokenizer.encode(f"title: {t} [SEP] content: {c}", truncation=False)) for t, c in zip(df["title"].head(2000), df["content"].head(2000))]

plt.figure(dpi=200)
plt.hist(raw_lengths, bins=40, alpha=0.8)
plt.axvline(512, linestyle='--', linewidth=1.5, label='max length (512)')
plt.title("Distribution of raw tokenized sequence lengths (first 2000 samples)")
plt.xlabel("Sequence length (tokens)"); plt.ylabel("count"); plt.legend(); plt.tight_layout()
plt.savefig("graphs/token_length_distribution.png", dpi=200); plt.close()

In [24]:
optimizer = AdamW(model.parameters(), lr = cfg.lr, weight_decay = cfg.weight_decay)

In [25]:
# getting hyperparams from config to calculate total steps and warmup steps
total_train_batches = math.ceil(len(train_ds) / cfg.batch_size)
total_steps = total_train_batches * cfg.epochs
warmup_steps = int(cfg.warmup_ratio * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps = total_steps)

In [26]:
print(f"Total steps: {total_steps}, Warmup steps: {warmup_steps}")

Total steps: 52500, Warmup steps: 5250


In [27]:
def train_epoch(model: DistilBertForSequenceClassification, dataloader: DataLoader, optimizer: torch.optim.Optimizer, scheduler: torch.optim.lr_scheduler.LambdaLR, device: torch.device, grad_accum_steps: int = 1) -> float:
    model.train()
    total_loss = 0.0
    optimizer.zero_grad(set_to_none=True)
    progress_bar = tqdm(total = len(dataloader), desc = "Training", leave = False)
    for step, batch in enumerate(dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
        loss = outputs.loss / grad_accum_steps
        loss.backward()
        total_loss += loss.item() * grad_accum_steps
        if (step + 1) % grad_accum_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad(set_to_none = True)
        if (step + 1) % 500 == 0 or (step + 1) == len(dataloader):
            progress_bar.update(500 if (step + 1) % 500 == 0 else len(dataloader) % 500)
            progress_bar.set_postfix({"loss": f"{loss.item() * grad_accum_steps:.4f}"})
    progress_bar.close()
    return total_loss / len(dataloader)

In [28]:
def evaluate(model: DistilBertForSequenceClassification, dataloader: DataLoader, device: torch.device) -> Tuple[float, float]:
    model.eval()
    preds, labels_list = [], []
    total_loss = 0.0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask, labels = labels)
            total_loss += outputs.loss.item()
            preds.extend(outputs.logits.argmax(dim = 1).cpu().numpy())
            labels_list.extend(labels.cpu().numpy())
    acc = accuracy_score(labels_list, preds)
    return acc, total_loss / len(dataloader)

In [29]:
best_val_acc = 0.0
train_losses, val_losses, val_accs = [], [], []

epoch_bar = tqdm(range(1, cfg.epochs + 1), desc="Training epochs")
for epoch in epoch_bar:
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, DEVICE, cfg.grad_accum_steps)
    val_acc, val_loss = evaluate(model, val_loader, DEVICE)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accs.append(val_acc)
    epoch_bar.set_postfix({"train_loss": f"{train_loss:.4f}", "val_loss": f"{val_loss:.4f}", "val_acc": f"{val_acc:.4f}"})
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")
        tqdm.write("Saved best_model.pt")
epoch_bar.close()

Training epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Saved best_model.pt


Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Saved best_model.pt


Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Saved best_model.pt


Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Saved best_model.pt


Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Saved best_model.pt


Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Saved best_model.pt


Training:   0%|          | 0/5250 [00:00<?, ?it/s]

Training:   0%|          | 0/5250 [00:00<?, ?it/s]

In [30]:
best_model = DistilBertForSequenceClassification.from_pretrained(cfg.model_name, num_labels =14, id2label = id2label, label2id = label2id).to(DEVICE)
best_model.load_state_dict(torch.load("best_model.pt", map_location = DEVICE))
post_attn, post_tokens = get_attention_weights(best_model, tokenizer, sentence, LAYER)
plot_attention_heatmap(post_attn[HEAD], post_tokens, f"Post-training layer {LAYER}, head {HEAD}", "graphs/attn_post_single.png")
plot_attention_heatmap(post_attn.mean(axis=0), post_tokens, f"Post-training layer {LAYER} (Average heads)", "graphs/attn_post_avg.png")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
plt.figure(dpi = 200)
plt.plot(train_losses, label = "Training loss")
plt.plot(val_losses, label = "Validation loss")
plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.legend(); plt.tight_layout()
plt.savefig("graphs/loss_curve.png", dpi = 200); plt.close()

In [32]:
plt.figure(dpi = 200)
plt.plot(val_accs, label = "Validation accuracy")
plt.xlabel("Epoch"); plt.ylabel("Accuracy"); plt.legend(); plt.tight_layout()
plt.savefig("graphs/accuracy_curve.png", dpi = 200); plt.close()

In [33]:
# evaluate on test set
best_model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [34]:
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc = "testing", leave = False):
        ids, mask, labels = batch["input_ids"].to(DEVICE), batch["attention_mask"].to(DEVICE), batch["labels"].to(DEVICE)
        outputs = best_model(input_ids = ids, attention_mask = mask)
        preds = outputs.logits.argmax(dim = 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

testing:   0%|          | 0/1750 [00:00<?, ?it/s]

In [35]:
acc = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average = None, zero_division = 0)

In [36]:
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize = (9, 7), dpi = 200)
sns.heatmap(cm, annot = True, fmt = "d", cmap = "Blues_r", xticklabels = id2label.values(), yticklabels = id2label.values(), cbar = False, linewidths = 0.4, linecolor = "gray")
plt.title("Confusion matrix - test set", pad = 12)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(rotation = 45, ha = "right", fontsize = 8)
plt.yticks(rotation = 0, fontsize = 8)
plt.tight_layout()
plt.savefig("graphs/confusion_matrix_test.png", dpi = 200, bbox_inches = "tight")
plt.close()

In [38]:
classes = ["Company", "EducationalInstitution", "Artist", "Athlete", "OfficeHolder", "MeanOfTransportation", "Building", "NaturalPlace", "Village", "Animal", "Plant", "Album", "Film", "WrittenWork"]

In [39]:
avg_precision = np.mean(precision)
avg_recall = np.mean(recall)
avg_f1 = np.mean(f1)

In [41]:
print("\nPer-class performance metrics")
print("-" * 75)
print(f"{'Class':<25}{'Precision':>12}{'Recall':>12}{'F1 Score':>12}")
print("-" * 75)

for c, p, r, f in zip(classes, precision, recall, f1):
    print(f"{c:<25}{p:>12.4f}{r:>12.4f}{f:>12.4f}")

print("-" * 75)
print(f"{'Macro average':<25}{avg_precision:>12.4f}{avg_recall:>12.4f}{avg_f1:>12.4f}")
print(f"{'Overall cccuracy:':<25}{acc:>12.4f}")


Per-class performance metrics
---------------------------------------------------------------------------
Class                       Precision      Recall    F1 Score
---------------------------------------------------------------------------
Company                        0.9783      0.9715      0.9749
EducationalInstitution         0.9866      0.9920      0.9893
Artist                         0.9830      0.9840      0.9835
Athlete                        0.9970      0.9960      0.9965
OfficeHolder                   0.9835      0.9835      0.9835
MeanOfTransportation           0.9920      0.9940      0.9930
Building                       0.9824      0.9770      0.9797
NaturalPlace                   0.9925      0.9980      0.9953
Village                        0.9980      0.9990      0.9985
Animal                         0.9965      0.9970      0.9968
Plant                          0.9965      0.9935      0.9950
Album                          0.9950      0.9975      0.9963
Film       