In [None]:
# ======================= SETUP =======================
!git clone https://github.com/yixuantt/FinEntity.git
!pip install transformers sequence-aligner evaluate
!pip install pytorch-crf
!pip install seqeval

import json
import warnings
import numpy as np
import torch
from torch.utils.data import DataLoader, random_split
from torch import cuda
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from FinEntity.sequence_aligner.labelset import LabelSet
from FinEntity.sequence_aligner.dataset import TrainingDataset
from FinEntity.sequence_aligner.containers import TraingingBatch

Cloning into 'FinEntity'...
remote: Enumerating objects: 118, done.[K
remote: Total 118 (delta 0), reused 0 (delta 0), pack-reused 118 (from 1)[K
Receiving objects: 100% (118/118), 28.24 MiB | 8.70 MiB/s, done.
Resolving deltas: 100% (39/39), done.
Updating files: 100% (58/58), done.
Downloading model_bert_crf (439 MB)
Error downloading object: model_bert_crf (24132d8): Smudge error: Error downloading model_bert_crf (24132d85e1ad90e91d9688e5ddaf6f520396644386817fa2f877f21a58ea84fe): batch response: This repository exceeded its LFS budget. The account responsible for the budget should increase it to restore access.

Errors logged to /content/FinEntity/.git/lfs/logs/20250504T075331.905138192.log
Use `git lfs logs last` to view the log.
error: external filter 'git-lfs filter-process' failed
fatal: model_bert_crf: smudge filter lfs failed
You can inspect what was checked out with 'git status'
and retry with 'git restore --source=HEAD :/'

Collecting sequence-aligner
  Downloading sequenc

In [None]:
# ======================= LOAD DATA =======================
# Read the annotated corpus from the cloned repo. The context manager closes the
# file handle deterministically, and the explicit encoding avoids depending on the
# platform's default locale.
with open('FinEntity/data/FinEntity.json', encoding='utf-8') as data_file:
    raw = json.load(data_file)

In [None]:
# ======================= CHOOSE MODEL =======================
# Pick exactly ONE checkpoint by leaving a single assignment uncommented.
# The same name is used for the tokenizer below and for the encoder built in
# the model-init cell, so both are always loaded from the same checkpoint.
# NOTE(review): the final save cell writes to a fixed 'model_roberta_crf' path
# regardless of this choice — rename it there when switching models.
# model_name = "bert-base-cased"
model_name = "roberta-base"
# model_name = "microsoft/deberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# ======================= SETUP LABELS =======================
# Three sentiment classes; LabelSet turns them into the tag-id mapping printed
# below (an 'O' tag plus B-/I-/L-/U- variants per class).
label_set = LabelSet(labels=["Neutral", "Positive", "Negative"])
print("Label mapping:", label_set.ids_to_label)

# Dataset construction arguments gathered in one place for readability.
dataset_kwargs = dict(
    data=raw,
    tokenizer=tokenizer,
    label_set=label_set,
    tokens_per_batch=128,
)
dataset = TrainingDataset(**dataset_kwargs)

Label mapping: {0: 'O', 1: 'B-Neutral', 2: 'I-Neutral', 3: 'L-Neutral', 4: 'U-Neutral', 5: 'B-Positive', 6: 'I-Positive', 7: 'L-Positive', 8: 'U-Positive', 9: 'B-Negative', 10: 'I-Negative', 11: 'L-Negative', 12: 'U-Negative'}


In [None]:
# ======================= TRAIN/VAL SPLIT =======================
# 80/20 split of the examples. The split is driven by a dedicated, seeded
# generator so that the validation set (and therefore the reported metrics)
# is the same on every Restart & Run All instead of changing per run.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
validate_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)
train_dataset, validate_dataset = random_split(
    dataset,
    [train_size, validate_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

def collate_fn(batch):
    """Collate a list of examples into a ``TraingingBatch`` with usable label ids.

    Every label equal to -100 is overwritten in place with 0 before the batch
    is returned.

    NOTE(review): presumably -100 marks padded positions and is remapped because
    the CRF layer needs a valid tag id at every position — confirm against the
    dataset code. Id 0 is the 'O' tag in the label mapping printed above.
    """
    collated = TraingingBatch(batch)
    ignored_positions = collated.labels == -100
    collated.labels[ignored_positions] = 0
    return collated

# Options shared by both loaders; only the training loader reshuffles each epoch.
loader_options = dict(batch_size=16, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(validate_dataset, shuffle=False, **loader_options)

In [None]:
# Define model with CRF
from torch import nn
from transformers import AutoModel
from torchcrf import CRF

class TokenClassifierWithCRF(nn.Module):
    """Pretrained encoder + linear tag scorer + CRF layer for token tagging.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)``; its
    last hidden state goes through dropout and a linear layer to produce
    per-token emission scores, which the CRF turns into either a training loss
    or a decoded tag sequence.
    """

    def __init__(self, model_name, num_labels):
        """Build the encoder, the classification head and the CRF.

        Args:
            model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
            num_labels: Number of distinct tag ids (output size of the linear
                head and number of CRF tags).
        """
        super().__init__()
        # The attribute is called ``bert`` whatever encoder is loaded; the name
        # is part of the state_dict keys, so it is kept as is.
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when ``labels`` is given, else the decoded tags.

        Args:
            input_ids: Token ids fed to the encoder.
            attention_mask: Mask fed to the encoder; its boolean form is also
                used as the CRF mask.
            labels: Optional gold tag ids. When provided, the negated CRF
                output (``reduction='mean'``) is returned as the loss.

        Returns:
            The loss tensor if ``labels`` is not None; otherwise the result of
            ``self.crf.decode`` (the best tag path per sequence).
        """
        token_mask = attention_mask.bool()
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Emission scores: [batch_size, seq_len, num_labels]
        emissions = self.classifier(self.dropout(encoded.last_hidden_state))

        if labels is None:
            # Inference: best path under the CRF, restricted to unmasked positions.
            return self.crf.decode(emissions, mask=token_mask)

        # Training: the CRF call is negated so the result can be minimised as a loss.
        log_likelihood = self.crf(emissions, labels, mask=token_mask, reduction='mean')
        return -log_likelihood

In [None]:
# ======================= MODEL INIT =======================
# One output per tag id in the label mapping.
num_labels = len(label_set.ids_to_label)
model = TokenClassifierWithCRF(model_name, num_labels=num_labels)

# Use the first GPU when one is visible, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TokenClassifierWithCRF(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNo

In [None]:
# ======================= TRAINING CONFIG =======================
# Number of epochs the LR schedule is sized for. This must equal NUM_EPOCHS in
# the training-loop cell; if the two drift apart, warmup/decay will no longer
# span the actual run.
SCHEDULE_EPOCHS = 10
t_total = len(train_loader) * SCHEDULE_EPOCHS  # total scheduler steps: one per training batch
weight_decay = 0.01
learning_rate = 3e-5
warmup_ratio = 0.1  # fraction of t_total spent linearly warming up the LR

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        # Everything else: decayed.
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        # Biases and LayerNorm weights: not decayed.
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
warmup_steps = int(t_total * warmup_ratio)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
)

In [None]:
from seqeval.metrics import classification_report
from seqeval.scheme import BILOU
from tqdm import tqdm

def train_epoch(e, model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader`` and report the mean batch loss.

    Args:
        e: Zero-based epoch index; only used in the progress message.
        model: Module called as ``model(input_ids, attention_mask, labels)`` that
            returns a scalar loss tensor.
        data_loader: Iterable of batches indexable by ``"input_ids"``,
            ``"attention_masks"`` and ``"labels"``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, right after the optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean loss over the batches processed, or NaN if the loader
        yielded no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_masks"].to(device)
        labels = d["labels"].to(device)

        # Clear gradients *before* the forward/backward pass so that anything left
        # over from an interrupted or partially executed cell cannot leak into
        # this update.
        optimizer.zero_grad()
        loss = model(input_ids, attention_mask, labels)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        num_batches += 1

    # Count batches ourselves: works for loaders without __len__ and avoids a
    # ZeroDivisionError when the loader is empty.
    mean_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {e+1}: Train loss = {mean_loss:.4f}")
    return mean_loss

def valid_epoch(e, model, data_loader, device, label_set):
    """Decode every validation batch and print a seqeval classification report.

    Args:
        e: Zero-based epoch index; only used in the progress-bar description.
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns one decoded tag-id list per sequence.
        data_loader: Iterable of batches indexable by ``'input_ids'``,
            ``'attention_masks'`` and ``'labels'``.
        device: Device the batch tensors are moved to.
        label_set: Object whose ``ids_to_label`` maps tag ids to tag strings.

    Returns:
        None. The report (strict mode, BILOU scheme) is printed.
    """
    model.eval()
    id_to_tag = label_set.ids_to_label
    gold_sequences, predicted_sequences = [], []

    progress = tqdm(data_loader, desc=f"Validation Epoch {e+1}")
    for batch in progress:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_masks'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            # One list of tag ids per sequence.
            decoded_paths = model(input_ids=input_ids, attention_mask=attention_mask)

        gold_rows = labels.cpu().tolist()
        mask_rows = attention_mask.cpu().tolist()

        for path, gold_ids, mask_row in zip(decoded_paths, gold_rows, mask_rows):
            # Gold tags: keep only positions where the attention mask is 1.
            gold_tags = [
                id_to_tag[label_id]
                for label_id, keep in zip(gold_ids, mask_row)
                if keep == 1
            ]
            # Predicted tags are used as returned; the mask was already applied
            # when decoding.
            predicted_tags = [id_to_tag[tag_id] for tag_id in path]
            gold_sequences.append(gold_tags)
            predicted_sequences.append(predicted_tags)

    print("\nValidation Report:")
    report = classification_report(
        gold_sequences, predicted_sequences, mode='strict', scheme=BILOU
    )
    print(report)

In [None]:
# Number of full train + validate passes to run.
NUM_EPOCHS = 10

for epoch in range(NUM_EPOCHS):
    epoch_banner = f"\n===== EPOCH {epoch+1} ====="
    print(epoch_banner)
    # Fit on the training split, then report metrics on the held-out split.
    train_epoch(
        epoch, model, train_loader, optimizer, scheduler, device
    )
    valid_epoch(
        epoch, model, val_loader, device, label_set
    )


===== EPOCH 1 =====
Epoch 1: Train loss = 12.3043


Validation Epoch 1: 100%|██████████| 13/13 [00:01<00:00,  9.32it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       109
     Neutral       0.62      0.61      0.61       223
    Positive       0.26      0.15      0.19       115

   micro avg       0.54      0.34      0.42       447
   macro avg       0.29      0.25      0.27       447
weighted avg       0.38      0.34      0.35       447


===== EPOCH 2 =====
Epoch 2: Train loss = 7.0803


Validation Epoch 2: 100%|██████████| 13/13 [00:01<00:00,  9.10it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.56      0.14      0.22       109
     Neutral       0.78      0.73      0.76       223
    Positive       0.51      0.47      0.49       115

   micro avg       0.68      0.52      0.59       447
   macro avg       0.62      0.45      0.49       447
weighted avg       0.66      0.52      0.56       447


===== EPOCH 3 =====
Epoch 3: Train loss = 5.2540


Validation Epoch 3: 100%|██████████| 13/13 [00:01<00:00,  8.87it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.49      0.51      0.50       109
     Neutral       0.83      0.83      0.83       223
    Positive       0.61      0.22      0.32       115

   micro avg       0.70      0.59      0.64       447
   macro avg       0.64      0.52      0.55       447
weighted avg       0.69      0.59      0.62       447


===== EPOCH 4 =====
Epoch 4: Train loss = 3.9757


Validation Epoch 4: 100%|██████████| 13/13 [00:01<00:00,  8.96it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.69      0.48      0.57       109
     Neutral       0.87      0.81      0.84       223
    Positive       0.68      0.50      0.57       115

   micro avg       0.79      0.65      0.71       447
   macro avg       0.75      0.59      0.66       447
weighted avg       0.78      0.65      0.71       447


===== EPOCH 5 =====
Epoch 5: Train loss = 2.6554


Validation Epoch 5: 100%|██████████| 13/13 [00:01<00:00,  9.04it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.75      0.79      0.77       109
     Neutral       0.90      0.71      0.79       223
    Positive       0.80      0.71      0.75       115

   micro avg       0.83      0.73      0.78       447
   macro avg       0.82      0.74      0.77       447
weighted avg       0.84      0.73      0.78       447


===== EPOCH 6 =====
Epoch 6: Train loss = 1.7441


Validation Epoch 6: 100%|██████████| 13/13 [00:01<00:00,  9.05it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.75      0.90      0.82       109
     Neutral       0.91      0.74      0.82       223
    Positive       0.85      0.81      0.83       115

   micro avg       0.85      0.80      0.82       447
   macro avg       0.84      0.82      0.82       447
weighted avg       0.86      0.80      0.82       447


===== EPOCH 7 =====
Epoch 7: Train loss = 1.2514


Validation Epoch 7: 100%|██████████| 13/13 [00:01<00:00,  8.99it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.80      0.93      0.86       109
     Neutral       0.92      0.69      0.78       223
    Positive       0.74      0.88      0.80       115

   micro avg       0.83      0.79      0.81       447
   macro avg       0.82      0.83      0.82       447
weighted avg       0.84      0.79      0.81       447


===== EPOCH 8 =====
Epoch 8: Train loss = 0.9836


Validation Epoch 8: 100%|██████████| 13/13 [00:01<00:00,  8.98it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.82      0.93      0.87       109
     Neutral       0.91      0.75      0.82       223
    Positive       0.80      0.89      0.84       115

   micro avg       0.85      0.83      0.84       447
   macro avg       0.84      0.85      0.84       447
weighted avg       0.86      0.83      0.84       447


===== EPOCH 9 =====
Epoch 9: Train loss = 0.8728


Validation Epoch 9: 100%|██████████| 13/13 [00:01<00:00,  8.95it/s]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.82      0.94      0.87       109
     Neutral       0.91      0.77      0.83       223
    Positive       0.82      0.86      0.84       115

   micro avg       0.86      0.83      0.85       447
   macro avg       0.85      0.85      0.85       447
weighted avg       0.87      0.83      0.85       447


===== EPOCH 10 =====
Epoch 10: Train loss = 0.7949


Validation Epoch 10: 100%|██████████| 13/13 [00:01<00:00,  9.02it/s]


Validation Report:
              precision    recall  f1-score   support

    Negative       0.82      0.94      0.87       109
     Neutral       0.91      0.77      0.83       223
    Positive       0.82      0.86      0.84       115

   micro avg       0.86      0.83      0.85       447
   macro avg       0.85      0.85      0.85       447
weighted avg       0.87      0.83      0.85       447






In [None]:
# ROBERTA OUTPUT


===== EPOCH 1 =====
Epoch 1: Train loss = 67.1443


Validation Epoch 1: 100%|██████████| 13/13 [01:22<00:00,  6.35s/it]
  _warn_prf(average, modifier, msg_start, len(result))



Validation Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        93
     Neutral       0.40      0.04      0.07       213
    Positive       0.00      0.00      0.00        90

   micro avg       0.40      0.02      0.04       396
   macro avg       0.13      0.01      0.02       396
weighted avg       0.22      0.02      0.04       396


===== EPOCH 2 =====
Epoch 2: Train loss = 12.4641


Validation Epoch 2: 100%|██████████| 13/13 [01:20<00:00,  6.22s/it]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        93
     Neutral       0.67      0.78      0.72       213
    Positive       0.55      0.46      0.50        90

   micro avg       0.64      0.53      0.58       396
   macro avg       0.40      0.41      0.41       396
weighted avg       0.48      0.53      0.50       396


===== EPOCH 3 =====
Epoch 3: Train loss = 6.9002


Validation Epoch 3: 100%|██████████| 13/13 [01:24<00:00,  6.47s/it]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.42      0.27      0.33        93
     Neutral       0.81      0.81      0.81       213
    Positive       0.50      0.07      0.12        90

   micro avg       0.71      0.52      0.60       396
   macro avg       0.58      0.38      0.42       396
weighted avg       0.65      0.52      0.54       396


===== EPOCH 4 =====
Epoch 4: Train loss = 5.0852


Validation Epoch 4: 100%|██████████| 13/13 [01:21<00:00,  6.24s/it]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.48      0.34      0.40        93
     Neutral       0.80      0.94      0.86       213
    Positive       0.50      0.03      0.06        90

   micro avg       0.73      0.59      0.65       396
   macro avg       0.59      0.44      0.44       396
weighted avg       0.66      0.59      0.57       396


===== EPOCH 5 =====
Epoch 5: Train loss = 3.9643


Validation Epoch 5: 100%|██████████| 13/13 [01:22<00:00,  6.36s/it]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.42      0.42      0.42        93
     Neutral       0.86      0.91      0.88       213
    Positive       0.36      0.18      0.24        90

   micro avg       0.69      0.63      0.65       396
   macro avg       0.55      0.50      0.51       396
weighted avg       0.64      0.63      0.63       396


===== EPOCH 6 =====
Epoch 6: Train loss = 2.9161


Validation Epoch 6: 100%|██████████| 13/13 [01:20<00:00,  6.21s/it]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.81      0.71      0.76        93
     Neutral       0.88      0.88      0.88       213
    Positive       0.86      0.84      0.85        90

   micro avg       0.86      0.83      0.85       396
   macro avg       0.85      0.81      0.83       396
weighted avg       0.86      0.83      0.85       396


===== EPOCH 7 =====
Epoch 7: Train loss = 1.5265


Validation Epoch 7: 100%|██████████| 13/13 [01:22<00:00,  6.31s/it]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.83      0.76      0.79        93
     Neutral       0.89      0.89      0.89       213
    Positive       0.85      0.86      0.85        90

   micro avg       0.86      0.85      0.86       396
   macro avg       0.85      0.84      0.84       396
weighted avg       0.86      0.85      0.86       396


===== EPOCH 8 =====
Epoch 8: Train loss = 1.1787


Validation Epoch 8: 100%|██████████| 13/13 [01:21<00:00,  6.27s/it]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.88      0.76      0.82        93
     Neutral       0.88      0.92      0.90       213
    Positive       0.90      0.82      0.86        90

   micro avg       0.89      0.86      0.87       396
   macro avg       0.89      0.84      0.86       396
weighted avg       0.89      0.86      0.87       396


===== EPOCH 9 =====
Epoch 9: Train loss = 0.8992


Validation Epoch 9: 100%|██████████| 13/13 [01:20<00:00,  6.22s/it]



Validation Report:
              precision    recall  f1-score   support

    Negative       0.88      0.75      0.81        93
     Neutral       0.89      0.92      0.91       213
    Positive       0.82      0.83      0.82        90

   micro avg       0.87      0.86      0.87       396
   macro avg       0.86      0.84      0.85       396
weighted avg       0.87      0.86      0.86       396


===== EPOCH 10 =====
Epoch 10: Train loss = 0.7133


Validation Epoch 10: 100%|██████████| 13/13 [01:20<00:00,  6.23s/it]


Validation Report:
              precision    recall  f1-score   support

    Negative       0.85      0.78      0.82        93
     Neutral       0.88      0.91      0.90       213
    Positive       0.88      0.83      0.86        90

   micro avg       0.87      0.86      0.87       396
   macro avg       0.87      0.84      0.86       396
weighted avg       0.87      0.86      0.87       396






In [None]:
import pickle

# Legacy artifact: the whole module object, pickled. Kept so that existing code
# that unpickles 'model_roberta_crf' keeps working.
# NOTE(review): unpickling this file requires the TokenClassifierWithCRF class to be
# importable under the same path, and only ever load it from a trusted source
# (pickle can execute arbitrary code on load).
# NOTE(review): the file name is fixed and does not follow `model_name`.
with open('model_roberta_crf', 'wb') as f:
    pickle.dump(model, f)

# Portable checkpoint: weights only. Reload by rebuilding TokenClassifierWithCRF and
# calling load_state_dict(torch.load(path, map_location=...)), which also works on a
# machine without the device the model was trained on.
torch.save(model.state_dict(), 'model_roberta_crf.state_dict.pt')