In [None]:
# ======================= SETUP =======================
!git clone https://github.com/yixuantt/FinEntity.git
!pip install transformers sequence-aligner evaluate
!pip install seqeval

import json
import warnings
import numpy as np
import torch
from torch.utils.data import DataLoader, random_split
from torch import cuda
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from FinEntity.sequence_aligner.labelset import LabelSet
from FinEntity.sequence_aligner.dataset import TrainingDataset
from FinEntity.sequence_aligner.containers import TraingingBatch

fatal: destination path 'FinEntity' already exists and is not an empty directory.


In [None]:
# ======================= LOAD DATA =======================
# Read the annotated corpus from the cloned repo. The context manager closes
# the file handle as soon as parsing finishes, and the explicit encoding keeps
# the read independent of the platform's default locale.
with open('FinEntity/data/FinEntity.json', encoding='utf-8') as data_file:
    raw = json.load(data_file)

In [None]:
# ======================= CHOOSE MODEL =======================
# `model_name` is the Hugging Face checkpoint id. It is used here to load the
# tokenizer and again in the MODEL INIT cell to load the model weights, so the
# two always match. Choose ONE of the following model names:
# model_name = "bert-base-cased"
# model_name = "roberta-base"
# model_name = "microsoft/deberta-base"

model_name = "microsoft/deberta-base"  # <<<<<<<<<<<< switch here

# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# ======================= SETUP LABELS =======================
# Three entity-level sentiment classes. As the printed mapping shows, LabelSet
# expands them into 13 tag ids: 'O' plus B-/I-/L-/U- variants of each class.
label_set = LabelSet(labels=["Neutral", "Positive", "Negative"])
print("Label mapping:", label_set.ids_to_label)
# Build the training dataset from the raw records using the tokenizer and
# label set defined above.
# NOTE(review): tokens_per_batch=128 presumably caps the token window per
# example — confirm against FinEntity.sequence_aligner.dataset.TrainingDataset.
dataset = TrainingDataset(
    data=raw,
    tokenizer=tokenizer,
    label_set=label_set,
    tokens_per_batch=128
)

Label mapping: {0: 'O', 1: 'B-Neutral', 2: 'I-Neutral', 3: 'L-Neutral', 4: 'U-Neutral', 5: 'B-Positive', 6: 'I-Positive', 7: 'L-Positive', 8: 'U-Positive', 9: 'B-Negative', 10: 'I-Negative', 11: 'L-Negative', 12: 'U-Negative'}


In [None]:
# ======================= TRAIN/VAL SPLIT =======================
# Fixed seed so the 80/20 split is identical on every run. Without it, each run
# (and each choice of model_name) would be validated on a different subset,
# which makes the validation reports incomparable.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
validate_size = len(dataset) - train_size  # remainder, so the sizes always sum to len(dataset)
train_dataset, validate_dataset = random_split(
    dataset,
    [train_size, validate_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# `TraingingBatch` is spelled exactly as imported from
# FinEntity.sequence_aligner.containers; it is used as the collate function.
# Only the training loader is shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=TraingingBatch, shuffle=True)
val_loader = DataLoader(validate_dataset, batch_size=16, collate_fn=TraingingBatch, shuffle=False)

In [None]:
# ======================= MODEL INIT =======================
# One output class per tag id in the label set (13 for the mapping printed in
# the SETUP LABELS cell). The classification head is newly initialized and is
# trained below.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_set.ids_to_label)
)

# Use the first GPU when one is available, otherwise fall back to CPU.
device = 'cuda:0' if cuda.is_available() else 'cpu'
# Assign the result instead of ending the cell with a bare `model.to(device)`
# expression, which would print the entire module tree as cell output.
model = model.to(device)

Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaForTokenClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): Dropout(p=0.1,

In [None]:
# ======================= TRAINING CONFIG =======================
# Number of passes over train_loader. The learning-rate schedule below is sized
# from this value, so the training loop must run exactly this many epochs.
EPOCHS = 10
t_total = len(train_loader) * EPOCHS  # total optimizer steps over the whole run
weight_decay = 0.01
learning_rate = 3e-5
warmup_ratio = 0.1  # fraction of t_total spent linearly warming up the LR

# Parameters whose name contains one of these substrings are excluded from
# weight decay (biases and LayerNorm scale factors).
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        # Everything else: regularized with `weight_decay`.
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        # Biases / LayerNorm weights: no decay.
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
warmup_steps = int(t_total * warmup_ratio)
# Linear warm-up for `warmup_steps`, then linear decay to zero at `t_total`.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

In [None]:
# ======================= TRAINING FUNCTIONS =======================
def train_epoch(model, loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Callable module accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returning an object with a
            ``loss`` attribute.
        loader: Sized iterable of batches exposing ``input_ids``,
            ``attention_masks`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The sum of the per-batch loss values divided by ``len(loader)``.

    Raises:
        ZeroDivisionError: If ``loader`` is empty.
    """
    model.train()
    batch_losses = []
    for step_batch in loader:
        optimizer.zero_grad()
        forward_out = model(
            input_ids=step_batch.input_ids.to(device),
            attention_mask=step_batch.attention_masks.to(device),
            labels=step_batch.labels.to(device),
        )
        step_loss = forward_out.loss
        step_loss.backward()
        # Record the scalar before the parameter update, as a plain Python float.
        batch_losses.append(step_loss.item())
        optimizer.step()
        scheduler.step()
    return sum(batch_losses) / len(loader)


from seqeval.metrics import classification_report
from seqeval.scheme import BILOU
from tqdm import tqdm

def valid_epoch(e, model, loader, device, label_set):
    """Evaluate ``model`` on ``loader`` and print a seqeval classification report.

    Args:
        e: Zero-based epoch index; only used to label the progress bar.
        model: Module accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
        loader: Iterable of batches exposing ``input_ids``, ``attention_masks``
            and ``labels`` tensors.
        device: Device the model inputs are moved to.
        label_set: Object whose ``ids_to_label`` maps tag ids to tag strings.

    Returns:
        None. The report is printed to stdout.
    """
    model.eval()
    id_to_tag = label_set.ids_to_label
    preds_all, labels_all = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc=f"Validation Epoch {e+1}"):
            input_ids = batch.input_ids.to(device)
            attention_mask = batch.attention_masks.to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Convert to nested Python lists once per batch. Calling .item() on
            # every token of a device tensor forces one host sync per token.
            # Gold labels are only read on the host, so they are never moved
            # to the device.
            pred_rows = torch.argmax(logits, dim=-1).tolist()
            gold_rows = batch.labels.tolist()

            for pred_row, gold_row in zip(pred_rows, gold_rows):
                # Positions whose gold id is -100 are skipped for both
                # sequences (presumably padding / special tokens — confirm
                # against the dataset).
                kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
                preds_all.append([id_to_tag[p] for p, _ in kept])
                labels_all.append([id_to_tag[g] for _, g in kept])

    print("\nValidation Report:")
    print(classification_report(labels_all, preds_all, mode='strict', scheme=BILOU))

In [None]:
# ======================= TRAIN LOOP =======================
# NOTE: this silences every warning category for the rest of the session, not
# only the ones raised during evaluation.
warnings.filterwarnings('ignore')

EPOCHS = 10
# The LR scheduler was built for exactly `t_total` optimizer steps in the
# TRAINING CONFIG cell. Running a different number of epochs would silently
# train with the wrong learning rate (e.g. LR of zero once the schedule is
# exhausted), so fail loudly instead.
if len(train_loader) * EPOCHS != t_total:
    raise ValueError(
        f"EPOCHS={EPOCHS} gives {len(train_loader) * EPOCHS} steps, but the "
        f"scheduler was configured for t_total={t_total}; re-run the training "
        "config cell with a matching epoch count."
    )

for epoch in range(EPOCHS):
    print(f"\n===== EPOCH {epoch+1}/{EPOCHS} =====")
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    print("Train loss:", train_loss)
    valid_epoch(epoch, model, val_loader, device, label_set)


===== EPOCH 1/10 =====
