In [None]:
# Install required packages
!pip install -U bitsandbytes transformers datasets accelerate seqeval unsloth huggingface_hub


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unsloth
  Downloading unsloth-2025.4.4-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)

In [None]:
# Authenticate against the Hugging Face Hub before any checkpoint is downloaded;
# later cells request authenticated Hub access when loading the model and tokenizer.
# notebook_login() renders an interactive token widget, so this cell needs manual input.
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Clone FinEntity
!git clone https://github.com/yixuantt/FinEntity.git


Cloning into 'FinEntity'...
remote: Enumerating objects: 118, done.[K
remote: Total 118 (delta 0), reused 0 (delta 0), pack-reused 118 (from 1)[K
Receiving objects: 100% (118/118), 28.24 MiB | 15.91 MiB/s, done.
Resolving deltas: 100% (39/39), done.


In [None]:
# Setup imports
import sys, torch, numpy as np, torch.nn as nn
sys.path.append("FinEntity")

from torch.utils.data import Dataset, random_split
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    Trainer, TrainingArguments, DataCollatorForTokenClassification
)
from datasets import load_dataset
from seqeval.metrics import classification_report, f1_score
from sequence_aligner.labelset import LabelSet
from sequence_aligner.containers import TraingingBatch, TrainingExample

# Simple config
class config:
    """Notebook-wide settings, read as class attributes (never instantiated)."""

    # Fraction of the examples held out for validation.
    dev_split_size = 0.2


In [None]:
# 1) Load + reshape FinEntity
# Pull the "train" split from the Hub, then rename the fields the rest of the
# notebook reads: content -> text, annotations -> entities, tag -> label.
hf = load_dataset("yixuantt/FinEntity", split="train")

raw = [
    {
        "text": record["content"],
        "entities": [
            {"start": ann["start"], "end": ann["end"], "label": ann["tag"]}
            for ann in record["annotations"]
        ],
    }
    for record in hf
]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

FinEntity.json:   0%|          | 0.00/710k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/979 [00:00<?, ? examples/s]

In [None]:
# 2) Load LLaMA-3 8B in 4-bit, together with its tokenizer
MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"   # single source of truth for both loads

bnb = BitsAndBytesConfig(load_in_4bit=True, llm_int8_enable_fp32_cpu_offload=True)

# `token=True` reuses the credential stored by the Hub login; it replaces the
# deprecated `use_auth_token` argument.
# `trust_remote_code` is intentionally not set: the checkpoint is loaded through
# AutoModelForCausalLM, and opting in would allow code shipped in the Hub repo
# to run locally.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config = bnb,
    device_map          = "auto",
    token               = True,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast = True,   # fast tokenizer is required for return_offsets_mapping later on
    token    = True,
)




config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [None]:
# 3) Build token-classification dataset
class TrainingDatasetLLaMA(Dataset):
    """Pre-tokenized token-classification examples with one label id per token.

    Every item is a dict of three ``torch.long`` tensors of length ``tpb``:
    ``input_ids``, ``attention_mask`` and ``labels``. Label ids follow
    ``self.label_map``; positions that must not contribute to the loss
    (padding and zero-width tokens) carry ``-100``.

    Args:
        data: iterable of ``{"text": str, "entities": [{"start", "end", "label"}, ...]}``
            where ``start``/``end`` are character offsets into ``text``.
        tokenizer: callable invoked with ``padding="max_length"``, ``truncation=True``,
            ``max_length``, ``return_attention_mask`` and ``return_offsets_mapping``;
            its result must provide ``input_ids``, ``attention_mask`` and
            ``offset_mapping``.
        tpb: tokens per example; every sequence is padded/truncated to this length.

    Raises:
        KeyError: if an entity label that reaches a token is not in ``label_map``.
        AssertionError: if the tokenizer does not return sequences of length ``tpb``.
    """

    def __init__(self, data, tokenizer, tpb=128):
        self.tpb       = tpb
        self.tokenizer = tokenizer
        self.label_map = {"O":0, "Positive":1, "Negative":2, "Neutral":3}
        self.examples  = []

        for ex in data:
            text, ents = ex["text"], ex["entities"]

            # 1) build char-level labels; spans are clamped to the text bounds,
            #    and later entities overwrite earlier ones where they overlap
            char_lbl = ["O"] * len(text)
            for e in ents:
                for i in range(max(e["start"], 0), min(e["end"], len(text))):
                    char_lbl[i] = e["label"]

            # 2) tokenize with offsets (the text itself is never modified)
            toks = tokenizer(
                text,
                padding="max_length",
                truncation=True,
                max_length=tpb,
                return_attention_mask=True,
                return_offsets_mapping=True
            )
            input_ids      = toks["input_ids"]
            attention_mask = toks["attention_mask"]
            offsets        = toks["offset_mapping"]

            # 3) align token -> label through the character offsets
            labels = []
            for (start, end), mask_bit in zip(offsets, attention_mask):
                if mask_bit == 0 or start == end:
                    # padding or zero-width token: excluded from the loss
                    labels.append(-100)
                    continue
                # Take the FIRST non-"O" character label inside the token.
                # Popping from a set here would pick an arbitrary label when a
                # token straddles two entities, and string hashing is randomized
                # per process, so labels could differ between runs.
                name = next((c for c in char_lbl[start:end] if c != "O"), "O")
                labels.append(self.label_map[name])

            # sanity: all three should be length tpb
            assert len(input_ids) == len(attention_mask) == len(labels) == tpb

            self.examples.append({
                "input_ids":      torch.tensor(input_ids,      dtype=torch.long),
                "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
                "labels":         torch.tensor(labels,         dtype=torch.long),
            })

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the dict of tensors for example ``idx``."""
        return self.examples[idx]



dataset = TrainingDatasetLLaMA(raw, tokenizer, tpb=128)
n       = len(dataset)
val_sz  = int(config.dev_split_size * n)   # held-out share = config.dev_split_size (0.2 -> 20%)
train_sz = n - val_sz                      # the remainder (80%) is used for training

# A seeded generator keeps the train/validation membership identical across
# kernel restarts; without it every run trains and evaluates on different examples.
train_ds, val_ds = random_split(
    dataset,
    [train_sz, val_sz],
    generator=torch.Generator().manual_seed(42),
)



In [None]:
# Quick look at one training example: the three tensor shapes, then the set of
# label ids that occur in it.
ex0 = train_ds[0]
print(*(ex0[field].shape for field in ("input_ids", "attention_mask", "labels")))
print("Unique labels:", set(ex0["labels"].tolist()))


torch.Size([128]) torch.Size([128]) torch.Size([128])
Unique labels: {0, 1, 3, -100}


In [None]:
# 4) Define the token-classification head
class LlamaForTokenClassification(nn.Module):
    """Linear per-token classifier on top of a language model's last hidden layer.

    Args:
        base_model: module called with ``input_ids``, ``attention_mask``,
            ``output_hidden_states=True`` and ``return_dict=True``; its output
            must expose ``hidden_states``.
        hidden_size: width of the base model's hidden states.
        num_labels: number of classes predicted for each token.
    """

    def __init__(self, base_model, hidden_size, num_labels):
        super().__init__()
        self.base       = base_model
        self.dropout    = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify every token.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``
            of shape (B, T, num_labels). Positions labelled ``-100`` are
            ignored by the loss.
        """
        # Tell the model to return hidden states
        outputs = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True
        )
        # hidden_states is a tuple: (embeddings, layer1, ..., layerN)
        head_weight = self.classifier.weight
        # The base may emit half-precision activations while the head is kept in
        # fp32; align dtype (and device) at the boundary so the two can differ.
        seq    = outputs.hidden_states[-1].to(device=head_weight.device,
                                              dtype=head_weight.dtype)
        seq    = self.dropout(seq)
        logits = self.classifier(seq)               # (B, T, num_labels)

        if labels is not None:
            # Compute the loss in fp32 regardless of the head's dtype.
            loss = nn.functional.cross_entropy(
                logits.float().view(-1, logits.size(-1)),
                labels.view(-1).to(logits.device),
                ignore_index=-100,
            )
            return loss, logits

        return logits


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Freeze the base model: only the new head is trained. The 4-bit weights cannot
# be updated directly, and the remaining half-precision tensors (embeddings,
# norms) should not be stepped by the optimizer in fp16. To adapt the base
# model itself, attach adapters (e.g. LoRA) instead of unfreezing it.
for base_param in model.parameters():
    base_param.requires_grad_(False)

model_tc = LlamaForTokenClassification(model, model.config.hidden_size, 4)

# The base was already placed by device_map="auto" at load time; only the head
# needs moving. It deliberately stays in fp32 (no .half()): updating fp16
# weights with AdamW is numerically unstable and can collapse the loss.
model_tc.classifier.to(device)

model_tc


LlamaForTokenClassification(
  (base): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
            (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
            (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
            (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
            (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
          (post_att

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
from seqeval.metrics import classification_report, f1_score
import numpy as np

# 1) Data collator: batches examples, padding inputs with the tokenizer and
#    padding label sequences with -100.
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    label_pad_token_id=-100,
    padding=True,
)

# 2) Metric helpers
label_inv = {0:"O",1:"Positive",2:"Negative",3:"Neutral"}


def _to_bio(tags):
    """Convert per-token class names into BIO tags ("B-X" opens a run, "I-X" continues it).

    seqeval reads the part before "-" as the chunk marker and the part after it
    as the entity type, so bare class names such as "Positive" are not
    recognised as entity tags. Adjacent runs of the same class cannot be told
    apart from class-only labels and are therefore scored as one entity.
    """
    bio, previous = [], "O"
    for tag in tags:
        if tag == "O":
            bio.append("O")
        elif tag == previous:
            bio.append("I-" + tag)
        else:
            bio.append("B-" + tag)
        previous = tag
    return bio


def compute_metrics(eval_pred):
    """Entity-level F1 scores for the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)``; predictions are the argmax over
            the last logits axis, and positions whose label is -100 are dropped
            before scoring.

    Returns:
        dict with the overall F1 (``eval_f1``), per-class F1 and the
        micro/macro/weighted averages. A class missing from the seqeval report
        is reported as 0.0 instead of raising KeyError.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    true_labels, true_preds = [], []
    for lab_seq, pred_seq in zip(labels, preds):
        kept = [(label_inv[int(l)], label_inv[int(p)])
                for l, p in zip(lab_seq, pred_seq) if l != -100]
        true_labels.append(_to_bio([gold for gold, _ in kept]))
        true_preds.append(_to_bio([pred for _, pred in kept]))

    report = classification_report(true_labels, true_preds, output_dict=True)

    def _f1(key):
        # .get: a class with no gold and no predicted entity has no report row
        return report.get(key, {}).get("f1-score", 0.0)

    # No "eval_loss" entry here: a None placeholder carries no information and
    # the loss is reported by the evaluation loop itself.
    return {
        "eval_f1": f1_score(true_labels, true_preds),
        "f1_positive": _f1("Positive"),
        "f1_negative": _f1("Negative"),
        "f1_neutral":  _f1("Neutral"),
        "micro_avg":   _f1("micro avg"),
        "macro_avg":   _f1("macro avg"),
        "weighted_avg":_f1("weighted avg"),
    }



# 3) TrainingArguments (minimal)
training_args = TrainingArguments(
    output_dir                 = "./llama-tokenclf",
    num_train_epochs           = 3,
    per_device_train_batch_size= 8,
    per_device_eval_batch_size = 8,
    learning_rate              = 3e-5,
    logging_steps              = 50,
    logging_dir                = "./logs",
    # Disable external experiment trackers: with the default setting an
    # installed wandb asks for an API key interactively, which blocks an
    # unattended "Run All".
    report_to                  = "none",
    # no evaluation_strategy here: evaluation runs once, explicitly, after training
)

# 4) Trainer
# The deprecated `tokenizer=` argument is not passed: padding is already handled
# by the explicit data collator, so the Trainer does not need the tokenizer.
trainer = Trainer(
    model            = model_tc,
    args             = training_args,
    train_dataset    = train_ds,
    eval_dataset     = val_ds,
    data_collator    = data_collator,
    compute_metrics  = compute_metrics
)

# 5) Train + Evaluate
trainer.train()
metrics = trainer.evaluate()
print("Eval metrics:", metrics)


  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnv2608[0m ([33mnv2608-princeton-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.9473
100,0.0
150,0.0
200,0.0
250,0.0
