In [2]:
# --------- FAST FIRE: BERT-base + TPLinker-style Pair Classification ----------
# Reduces compute time by 70–85%

import json
from pathlib import Path
import random
import torch
import tqdm
from torch.optim import AdamW   # correct

from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,

    get_linear_schedule_with_warmup
)
from sklearn.metrics import precision_recall_fscore_support

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# Mount Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

BASE = Path("/content/drive/MyDrive/Datasets_EE782_course_project/FIRE_dataset")
# source finred-style text (your file)

TRAIN_PATH = BASE /"fire_train.json"
DEV_PATH   = BASE / "fire_dev.json"   # may or may not exist
TEST_PATH  = BASE / "fire_test.json"  # may or may not exist
TYPES_PATH = BASE / "fire_types.json" # relations / entity types

# Output location
OUT_DIR = Path("/content/drive/MyDrive/FIRE_fast_bert_tplinker")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# ---------------- LOAD JSON ----------------
def load(path):
    if path.exists():
        return json.load(open(path))
    return []

train_data = load(TRAIN_PATH)
dev_data   = load(DEV_PATH)
test_data  = load(TEST_PATH)

print("Loaded:", len(train_data), len(dev_data), len(test_data))

# ---------------- RELATION LIST ----------------
rels = set()
for ds in (train_data + dev_data + test_data):
    for r in ds.get("relations", []):
        rels.add(r["type"])
rels = ["no_relation"] + sorted(list(rels))
label2id = {l:i for i,l in enumerate(rels)}
id2label = {i:l for l,i in label2id.items()}
print("Relations:", rels)

# ---------------- BUILD EXAMPLES (FAST) ----------------
def build_fast_pairs(records, neg_ratio=0.4):
    examples = []
    for rec in records:
        tokens = rec["tokens"]
        ents = rec["entities"]
        rels_map = {}

        # Map entity-index pairs
        for r in rec.get("relations", []):
            h, t = r["head"], r["tail"]
            rels_map[(h, t)] = r["type"]

        all_pairs = []
        for i in range(len(ents)):
            for j in range(len(ents)):
                if i == j:
                    continue
                h, t = ents[i], ents[j]
                label = rels_map.get((i, j), "no_relation")
                all_pairs.append((tokens, h, t, label))

        # Keep all positives, sample negatives
        positives = [p for p in all_pairs if p[3] != "no_relation"]
        negatives = [p for p in all_pairs if p[3] == "no_relation"]

        k = int(len(negatives) * neg_ratio)
        negatives = random.sample(negatives, k) if len(negatives) > k else negatives

        for tok, h, t, lab in positives + negatives:
            examples.append({
                "tokens": tok,
                "h": h,
                "t": t,
                "label": lab,
            })
    return examples

train_ex = build_fast_pairs(train_data, neg_ratio=0.35)
dev_ex   = build_fast_pairs(dev_data, neg_ratio=1.0)
test_ex  = build_fast_pairs(test_data, neg_ratio=1.0)

print("Pair examples:", len(train_ex), len(dev_ex), len(test_ex))

# ---------------- TOKENIZER ----------------
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["[E1]","[/E1]","[E2]","[/E2]"])

# ---------------- DATASET ----------------
class PairDataset(Dataset):
    def __init__(self, examples):
        self.ex = examples

    def __len__(self): return len(self.ex)

    def __getitem__(self, i):
        e = self.ex[i]
        tok = e["tokens"][:]

        h_s, h_e = e["h"]["start"], e["h"]["end"]-1
        t_s, t_e = e["t"]["start"], e["t"]["end"]-1

        # Marker insertion (descending order)
        inserts = [
            (h_e+1, "[/E1]"),
            (h_s,   "[E1]"),
            (t_e+1, "[/E2]"),
            (t_s,   "[E2]"),
        ]
        inserts.sort(key=lambda x: x[0], reverse=True)
        tks = tok.copy()
        for pos, sym in inserts:
            if 0 <= pos <= len(tks):
                tks.insert(pos, sym)

        input_ids = tokenizer.convert_tokens_to_ids(tks)
        input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
        if len(input_ids) > 256:
            input_ids = input_ids[:255] + [tokenizer.sep_token_id]

        att = [1]*len(input_ids)
        pad = 256 - len(input_ids)
        if pad > 0:
            input_ids += [tokenizer.pad_token_id]*pad
            att += [0]*pad

        return {
            "input_ids": torch.tensor(input_ids),
            "att": torch.tensor(att),
            "label": torch.tensor(label2id[e["label"]])
        }

train_ds = PairDataset(train_ex)
dev_ds   = PairDataset(dev_ex)
test_ds  = PairDataset(test_ex)

train_loader = DataLoader(train_ds, batch_size=6, shuffle=True)
dev_loader   = DataLoader(dev_ds, batch_size=6)
test_loader  = DataLoader(test_ds, batch_size=6)

# ---------------- MODEL ----------------
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(rels))
model.resize_token_embeddings(len(tokenizer))
model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=3e-5)
total_steps = len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)

scaler = torch.cuda.amp.GradScaler()

# ---------------- TRAIN (1 epoch, FP16) ----------------
model.train()
loop = tqdm.tqdm(train_loader, desc="Training")
for batch in loop:
    ids = batch["input_ids"].to(DEVICE)
    att = batch["att"].to(DEVICE)
    lab = batch["label"].to(DEVICE)

    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        out = model(input_ids=ids, attention_mask=att, labels=lab)
        loss = out.loss

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    scheduler.step()

    loop.set_postfix(loss=float(loss))

# ---------------- EVAL ----------------
def evaluate(loader):
    model.eval()
    preds, golds = [], []
    with torch.no_grad():
        for batch in tqdm.tqdm(loader, desc="Eval"):
            ids = batch["input_ids"].to(DEVICE)
            att = batch["att"].to(DEVICE)
            lab = batch["label"].to(DEVICE)
            out = model(input_ids=ids, attention_mask=att)
            p = out.logits.argmax(dim=-1).cpu().tolist()
            preds += p
            golds += lab.cpu().tolist()

    p,r,f,_ = precision_recall_fscore_support(golds,preds,average="micro",zero_division=0)
    return p,r,f,preds,golds

print("\nDEV:", evaluate(dev_loader)[:3])
print("TEST:", evaluate(test_loader)[:3])

# ---------------- SAVE ----------------
model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)
print("\nSaved:", OUT_DIR)


Device: cuda
Mounted at /content/drive
Loaded: 2117 454 454
Relations: ['no_relation', 'ActionBuy', 'ActionMerge', 'ActionSell', 'Actionin', 'Actionto', 'Constituentof', 'Designation', 'Employeeof', 'Locatedin', 'Productof', 'Propertyof', 'Quantity', 'Sector', 'Subsidiaryof', 'Value', 'ValueChangeDecreaseby', 'ValueChangeIncreaseby', 'Valuein']
Pair examples: 23229 12336 11402


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:836.)
  loop.set_postfix(loss=float(loss))
Training: 100%|██████████| 3872/3872 [06:51<00:00,  9.41it/s, loss=0.575]
Eval: 100%|██████████| 2056/2056 [02:51<00:00, 11.97it/s]



DEV: (0.9156939040207522, 0.9156939040207522, 0.9156939040207522)


Eval: 100%|██████████| 1901/1901 [02:38<00:00, 11.99it/s]


TEST: (0.9166812839852657, 0.9166812839852657, 0.9166812839852657)

Saved: /content/drive/MyDrive/FIRE_fast_bert_tplinker
