In [1]:
# ================================================================
#              BERT-BASE + TPLinker-style Pair Classifier
#                     (FINRED prepared JSON)
# ================================================================

# Mount Google Drive at /content/drive; every dataset and output path below
# is rooted there. force_remount=False leaves an existing mount untouched.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

import json
from pathlib import Path
from collections import defaultdict
import random
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import tqdm
import os

# -------------------- USER PATHS --------------------
# All inputs and outputs live under a single dataset folder on the mounted Drive.
BASE_DRIVE = Path("/content/drive/MyDrive/Datasets_EE782_course_project/FinRED_dataset")

# Training split: JSON that was already converted in an earlier preparation step.
TRAIN_JSON = BASE_DRIVE.joinpath("finred_tplinkertrain.json")
# Dev/test splits: raw FinRED text files, converted on the fly further down.
DEV_TXT = BASE_DRIVE.joinpath("finred_dev.txt")
TEST_TXT = BASE_DRIVE.joinpath("finred_test.txt")
# Optional list of relation names, one per line.
RELATIONS_LIST = BASE_DRIVE.joinpath("finred_relations.txt")

# Destination for the fine-tuned model, tokenizer and prediction dump.
OUTPUT_DIR = BASE_DRIVE.joinpath("bert_pair_class_model_final")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Using device:", DEVICE)

# -------------------- RELATION LABELS --------------------
def load_relation_set(train_json_path, relations_file=None):
    """Collect the relation label inventory.

    Labels are gathered from two optional sources and merged:

    * ``relations_file`` -- a text file with one relation name per line
      (surrounding whitespace stripped, blank lines ignored);
    * ``train_json_path`` -- a JSON list of records whose ``"relations"``
      entries each carry a ``"type"`` key.

    Either source is skipped when its path does not exist.

    Returns:
        The label names sorted alphabetically. If ``"no_relation"`` was not
        found in either source it is prepended, so it becomes index 0; if it
        was found it keeps its sorted position.

    NOTE(review): names are used verbatim, so spellings that differ only in
    separators (e.g. ``"owned by"`` vs ``"owned_by"``) become distinct labels.
    """
    names = set()

    if relations_file and relations_file.exists():
        with open(relations_file, "r", encoding="utf-8") as handle:
            names.update(entry for entry in map(str.strip, handle) if entry)

    if train_json_path.exists():
        with open(train_json_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        names.update(
            relation["type"]
            for record in records
            for relation in record.get("relations", [])
        )

    ordered = sorted(names)
    if "no_relation" in names:
        return ordered
    return ["no_relation", *ordered]

# Label inventory plus the two lookup tables used for encoding/decoding labels.
label_list = load_relation_set(TRAIN_JSON, RELATIONS_LIST)
label2id = {name: index for index, name in enumerate(label_list)}
id2label = {index: name for name, index in label2id.items()}
print("Labels:", label_list)

# -------------------- TOKENIZER --------------------
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# Entity boundary markers; only those not already in the vocabulary are added.
special_tokens = ["[E1]", "[/E1]", "[E2]", "[/E2]"]
existing_vocab = tokenizer.get_vocab()
tokenizer.add_tokens(
    [marker for marker in special_tokens if marker not in existing_vocab]
)
print("Tokenizer vocab size:", len(tokenizer))

# -------------------- FINRED TXT PARSING --------------------
def parse_finred_txt_line(line):
    """Split one FinRED text line into its sentence and relation triples.

    The line is ``|``-separated: the first segment is the sentence, every
    later segment is expected to be ``head ; tail ; relation``. Segments that
    do not yield exactly three non-empty fields are silently dropped.

    Returns:
        ``{"text": <sentence>, "triples": [(head, tail, relation), ...]}``
    """
    text, *triple_segments = (segment.strip() for segment in line.strip().split("|"))

    triples = []
    for segment in triple_segments:
        fields = [field.strip() for field in segment.split(";")]
        fields = [field for field in fields if field]
        if len(fields) == 3:
            triples.append(tuple(fields))

    return {"text": text, "triples": triples}

def _find_token_span(lowered_text, mention, offsets):
    """Locate the first case-insensitive occurrence of ``mention`` as a token span.

    Args:
        lowered_text: the sentence, already lowercased.
        mention: entity surface string to look for.
        offsets: per-token ``(char_start, char_end)`` pairs for the sentence.

    Returns:
        ``(first_token, last_token)`` (both inclusive), or ``None`` when the
        mention is absent or its boundaries do not fall inside any token.

    NOTE(review): character positions come from the lowercased text but are
    compared against offsets of the original text; this assumes lowercasing
    does not change the character count (true for ASCII).
    """
    char_start = lowered_text.find(mention.lower())
    if char_start == -1:
        return None
    char_end = char_start + len(mention)

    first = last = None
    for index, (tok_start, tok_end) in enumerate(offsets):
        if tok_start <= char_start < tok_end:
            first = index
        if tok_start < char_end <= tok_end:
            last = index

    if first is None or last is None:
        return None
    return first, last


def load_finred_txt_as_converted(txt_path):
    """Read a FinRED ``.txt`` split and convert it to token-indexed records.

    Each non-blank line is parsed with ``parse_finred_txt_line`` and tokenized
    (without special tokens) using the module-level ``tokenizer``. For every
    triple, head and tail mentions are located by case-insensitive substring
    search and mapped to inclusive token index spans.

    Returns:
        A list of dicts with keys ``"text"``, ``"tokens"``, ``"entities"``
        (``{"start", "end"}`` token spans, one per distinct mention string
        that could be located) and ``"relations"`` (``{"type", "head",
        "tail"}`` with ``[start, end]`` spans). A relation is emitted only
        when both of its mentions have a span.
    """
    recs = []
    with open(txt_path, "r", encoding="utf-8") as f:
        for ln in f:
            if not ln.strip():
                continue
            parsed = parse_finred_txt_line(ln)
            text = parsed["text"]
            enc = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
            offsets = enc["offset_mapping"]
            tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"])
            # Lowercase once per sentence instead of once per mention lookup.
            lowered_text = text.lower()

            # Keyed by mention string, so a mention shared by several triples
            # is stored once (the latest located span wins).
            entities_dict = {}
            rels = []
            for head, tail, rel_type in parsed["triples"]:
                for mention in (head, tail):
                    span = _find_token_span(lowered_text, mention, offsets)
                    if span is not None:
                        entities_dict[mention] = {"start": span[0], "end": span[1]}

                if head in entities_dict and tail in entities_dict:
                    head_span = entities_dict[head]
                    tail_span = entities_dict[tail]
                    rels.append({
                        "type": rel_type,
                        "head": [head_span["start"], head_span["end"]],
                        "tail": [tail_span["start"], tail_span["end"]],
                    })

            recs.append({
                "text": text,
                "tokens": tokens,
                "entities": [
                    {"start": span["start"], "end": span["end"]}
                    for span in entities_dict.values()
                ],
                "relations": rels,
            })
    return recs

# -------------------- LOAD DATA --------------------
def load_train_json(json_path):
    """Read the UTF-8 JSON file at ``json_path`` and return the decoded object."""
    with open(json_path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

# Train comes from the prepared JSON; dev and test are converted from raw text.
train_records = load_train_json(TRAIN_JSON)
dev_records, test_records = (
    load_finred_txt_as_converted(split_path) for split_path in (DEV_TXT, TEST_TXT)
)

print("Loaded records:", *map(len, (train_records, dev_records, test_records)))

# -------------------- BUILD ENTITY PAIRS --------------------
def build_pair_examples(records):
    """Expand records into one classification example per ordered entity pair.

    Each record must provide ``"text"`` and ``"entities"`` (dicts with token
    indices ``"start"``/``"end"``, end inclusive) and may provide
    ``"relations"`` (dicts with ``"type"``, ``"head"``, ``"tail"``). The text
    is re-tokenized with the module-level ``tokenizer`` to recover character
    offsets; entities whose token indices fall outside the tokenization, are
    reversed, or map to an empty character range are dropped.

    Returns:
        A list of ``{"text", "head", "tail", "label"}`` dicts where ``head``
        and ``tail`` are ``{"start", "end", "text"}`` and ``label`` is the
        gold relation type for that (head, tail) span pair, or
        ``"no_relation"`` when no gold relation matches.
    """
    examples = []
    for record in records:
        text = record["text"]
        entities = record["entities"]

        # Gold relations keyed by ((head_start, head_end), (tail_start, tail_end)).
        gold_by_spans = {
            (tuple(relation["head"]), tuple(relation["tail"])): relation["type"]
            for relation in record.get("relations", [])
        }

        encoding = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
        offsets = encoding["offset_mapping"]
        token_count = len(offsets)

        usable = []
        for entity in entities:
            first_tok = entity["start"]
            last_tok = entity["end"]

            # Indices must be non-negative, ordered, and inside the tokenization.
            if not (0 <= first_tok <= last_tok < token_count):
                continue

            char_from = offsets[first_tok][0]
            char_to = offsets[last_tok][1]

            # Skip spans without a usable, non-empty character range.
            if char_from is None or char_to is None or char_from >= char_to:
                continue

            usable.append({
                "start": first_tok,
                "end": last_tok,
                "text": text[char_from:char_to],
            })

        # Every ordered pair of distinct entities becomes one example.
        for head_pos, head in enumerate(usable):
            for tail_pos, tail in enumerate(usable):
                if head_pos != tail_pos:
                    span_key = (
                        (head["start"], head["end"]),
                        (tail["start"], tail["end"]),
                    )
                    examples.append({
                        "text": text,
                        "head": head,
                        "tail": tail,
                        "label": gold_by_spans.get(span_key, "no_relation"),
                    })
    return examples

# Expand each split's records into ordered entity-pair examples.
train_examples, dev_examples, test_examples = map(
    build_pair_examples, (train_records, dev_records, test_records)
)

print("Pair examples:", *map(len, (train_examples, dev_examples, test_examples)))

# -------------------- DATASET --------------------
class PairRelDataset(Dataset):
    """Torch dataset that turns entity-pair examples into marked BERT inputs.

    Each example is a dict with ``"text"``, ``"head"``/``"tail"`` (dicts with
    inclusive token indices ``"start"``/``"end"``) and ``"label"``. The text
    is tokenized without special tokens, the head span is wrapped in
    ``[E1] ... [/E1]`` and the tail span in ``[E2] ... [/E2]``, then the
    sequence is framed with CLS/SEP and truncated or padded to ``max_len``.

    Args:
        examples: sequence of example dicts as described above.
        tokenizer: callable tokenizer exposing ``convert_ids_to_tokens``,
            ``convert_tokens_to_ids`` and ``cls/sep/pad_token_id``.
        label2id: mapping from label name to class index; must contain
            ``"no_relation"``, which is used for unknown labels.
        max_len: fixed length of the returned ``input_ids``.
    """

    def __init__(self, examples, tokenizer, label2id, max_len=256):
        self.examples = examples
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for one example."""
        ex = self.examples[idx]
        tok = self.tokenizer

        # Only the token ids are needed here, so offsets are not requested.
        enc = tok(ex["text"], add_special_tokens=False)
        token_list = tok.convert_ids_to_tokens(enc["input_ids"])

        hs, he = ex["head"]["start"], ex["head"]["end"]
        ts, te = ex["tail"]["start"], ex["tail"]["end"]

        # (insert position, opens_span, marker). Closing markers go after the
        # last token of their span, hence the +1.
        inserts = [
            (hs, True, "[E1]"), (he + 1, False, "[/E1]"),
            (ts, True, "[E2]"), (te + 1, False, "[/E2]"),
        ]
        # Insert right-to-left so earlier positions stay valid. When a closing
        # and an opening marker share a position (adjacent spans), the opening
        # marker is inserted first so the closing one ends up in front of it:
        # "... [/E1] [E2] ..." rather than the interleaved "... [E2] [/E1] ...".
        inserts.sort(key=lambda item: (item[0], item[1]), reverse=True)
        for pos, _opens, marker in inserts:
            pos = min(max(pos, 0), len(token_list))
            token_list.insert(pos, marker)

        input_ids = [
            tok.cls_token_id,
            *tok.convert_tokens_to_ids(token_list),
            tok.sep_token_id,
        ]
        # NOTE(review): truncation keeps the head of the sequence, so markers
        # positioned beyond max_len are cut off together with their entity.
        if len(input_ids) > self.max_len:
            input_ids = input_ids[: self.max_len - 1] + [tok.sep_token_id]

        attention_mask = [1] * len(input_ids)
        pad_len = self.max_len - len(input_ids)
        if pad_len > 0:
            input_ids += [tok.pad_token_id] * pad_len
            attention_mask += [0] * pad_len

        # Labels missing from the mapping silently fall back to "no_relation".
        label_id = self.label2id.get(ex["label"], self.label2id["no_relation"])

        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "label": torch.tensor(label_id),
        }

# -------------------- DATA LOADERS --------------------
# One dataset per split; all three share the tokenizer and label mapping and
# use PairRelDataset's default max_len.
BATCH_SIZE=8
train_ds=PairRelDataset(train_examples, tokenizer, label2id)
dev_ds  =PairRelDataset(dev_examples, tokenizer, label2id)
test_ds =PairRelDataset(test_examples, tokenizer, label2id)

# Only the training loader shuffles; dev/test keep example order so that the
# saved predictions line up with dev_examples / test_examples.
train_loader=DataLoader(train_ds,batch_size=BATCH_SIZE,shuffle=True)
dev_loader  =DataLoader(dev_ds, batch_size=BATCH_SIZE)
test_loader =DataLoader(test_ds,batch_size=BATCH_SIZE)

# -------------------- MODEL --------------------
# Sequence classifier with one output per relation label.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label_list))
# The tokenizer gained entity-marker tokens, so the embedding matrix must be
# resized to the new vocabulary size before training.
model.resize_token_embeddings(len(tokenizer))
model.to(DEVICE)

# standard settings (no thresholding)
EPOCHS = 1
LR = 2e-5

optimizer = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
# One scheduler step per batch; the first 10% of steps are warmup.
total_steps=len(train_loader)*EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps)

# -------------------- TRAIN --------------------
model.train()
for epoch in range(EPOCHS):
    loop = tqdm.tqdm(train_loader, desc=f"Training Epoch {epoch+1}")
    for batch in loop:
        ids=batch["input_ids"].to(DEVICE)
        att=batch["attention_mask"].to(DEVICE)
        labels=batch["label"].to(DEVICE)

        # Passing labels makes the model return the classification loss.
        out=model(input_ids=ids, attention_mask=att, labels=labels)
        loss=out.loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
        optimizer.step()
        scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

        # Show the current batch loss in the progress bar.
        loop.set_postfix({"loss":loss.item()})

# -------------------- SAVE --------------------
# Model and tokenizer are written to the same directory, so the added marker
# tokens are stored alongside the resized weights.
model.save_pretrained(str(OUTPUT_DIR))
tokenizer.save_pretrained(str(OUTPUT_DIR))
print("Saved to:", OUTPUT_DIR)

# -------------------- EVALUATION --------------------
def evaluate(model, loader, device):
    """Run ``model`` over ``loader`` and score argmax predictions.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must provide ``"input_ids"``, ``"attention_mask"`` and
    ``"label"`` tensors.

    Returns:
        A 3-tuple ``(metrics, preds, golds)`` where ``metrics`` maps
        ``"micro"`` and ``"macro"`` to ``(precision, recall, f1)`` and
        ``preds`` / ``golds`` are flat lists of class indices in loader order.

    NOTE(review): scores are computed over every class, ``no_relation``
    included, so for this single-label setup micro P/R/F1 all coincide.
    """
    model.eval()
    preds, golds = [], []

    with torch.no_grad():
        for batch in tqdm.tqdm(loader, desc="Evaluating"):
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            preds += outputs.logits.argmax(dim=-1).tolist()
            golds += batch["label"].tolist()

    metrics = {}
    for averaging in ("micro", "macro"):
        precision, recall, f1, _ = precision_recall_fscore_support(
            golds, preds, average=averaging, zero_division=0
        )
        metrics[averaging] = (precision, recall, f1)

    return metrics, preds, golds

# Score the freshly trained model on both held-out splits.
dev_metrics, dev_preds, dev_golds = evaluate(model, dev_loader, DEVICE)
test_metrics, test_preds, test_golds = evaluate(model, test_loader, DEVICE)

print("\nDEV micro:", dev_metrics["micro"])
print("DEV macro:", dev_metrics["macro"])
print("\nTEST micro:", test_metrics["micro"])
print("TEST macro:", test_metrics["macro"])

# -------------------- SAVE PREDICTIONS --------------------
# Store predicted vs. gold label names per split, in loader order.
pred_out = {}
for split_key, split_preds, split_golds in (
    ("dev", dev_preds, dev_golds),
    ("test", test_preds, test_golds),
):
    pred_out[split_key] = [
        {"pred": id2label[pred_id], "gold": id2label[gold_id]}
        for pred_id, gold_id in zip(split_preds, split_golds)
    ]

with open(OUTPUT_DIR / "predictions_summary.json", "w") as f:
    f.write(json.dumps(pred_out, indent=2))

print("Predictions saved.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Labels: ['no_relation', 'brand', 'business division', 'business_division', 'chairperson', 'chief executive officer', 'chief_executive_officer', 'creator', 'currency', 'developer', 'director/manager', 'director_/_manager', 'distributed by', 'distributed_by', 'distribution format', 'distribution_format', 'employer', 'founded by', 'founded_by', 'headquarters location', 'headquarters_location', 'industry', 'legal form', 'legal_form', 'location of formation', 'location_of_formation', 'manufacturer', 'member of', 'member_of', 'operator', 'original broadcaster', 'original_broadcaster', 'owned by', 'owned_by', 'owner of', 'owner_of', 'parent organization', 'parent_organization', 'platform', 'position held', 'position_held', 'product/material produced', 'product_or_material_produced', 'publisher', 'stock exchange', 'stock_exchange', 'subsidiary']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizer vocab size: 30526


Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors


Loaded records: 5700 1007 1068
Pair examples: 16570 3758 2672


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Training Epoch 1: 100%|██████████| 2072/2072 [12:51<00:00,  2.69it/s, loss=2.01]


Saved to: /content/drive/MyDrive/Datasets_EE782_course_project/FinRED_dataset/bert_pair_class_model_final


Evaluating: 100%|██████████| 470/470 [00:57<00:00,  8.15it/s]
Evaluating: 100%|██████████| 334/334 [00:39<00:00,  8.50it/s]


DEV micro: (0.6508781266631187, 0.6508781266631187, 0.6508781266631187)
DEV macro: (0.03836528221512247, 0.033478435800594425, 0.026596309975073975)

TEST micro: (0.5239520958083832, 0.5239520958083832, 0.5239520958083832)
TEST macro: (0.03229540584912485, 0.033984330774630346, 0.024406853804291414)
Predictions saved.



