In [None]:
!pip install -q --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q --upgrade transformers scikit-learn tqdm


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/9.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/9.5 MB[0m [31m103.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m9.5/9.5 MB[0m [31m158.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!unzip -q ee782_re_project.zip -d /content
!ls /content


ee782_re_project  ee782_re_project.zip	sample_data


In [None]:
import os, json
from itertools import chain

# Root of the unpacked project inside the Colab filesystem.
ROOT = "/content/ee782_re_project"
# Directory that holds the official REFinD split files. The folder really is
# spelled "public_dat": the split files are loaded successfully from this path.
REFIND_PUB = os.path.join(ROOT, "data", "REFinD", "public_dat")

def load_refind_split(split_name):
    """Read one official REFinD split and return the parsed JSON content.

    The file is looked up at ``<REFIND_PUB>/<split_name>_refind_official.json``.
    The resolved path and the number of loaded examples are printed.
    """
    json_path = os.path.join(REFIND_PUB, f"{split_name}_refind_official.json")
    print(f"Loading {split_name} from {json_path}")

    with open(json_path, "r", encoding="utf-8") as handle:
        examples = json.load(handle)

    print(f"  -> {len(examples)} examples")
    return examples

# Load the three official splits in train / dev / test order.
train_raw, dev_raw, test_raw = (
    load_refind_split(split) for split in ("train", "dev", "test")
)

# Cell output: number of examples per split.
tuple(len(split) for split in (train_raw, dev_raw, test_raw))


Loading train from /content/ee782_re_project/data/REFinD/public_dat/train_refind_official.json
  -> 20070 examples
Loading dev from /content/ee782_re_project/data/REFinD/public_dat/dev_refind_official.json
  -> 4306 examples
Loading test from /content/ee782_re_project/data/REFinD/public_dat/test_refind_official.json
  -> 4300 examples


(20070, 4306, 4300)

In [None]:
# Label vocabulary: every relation string seen in any split, sorted so that the
# id assignment is the same on every run.
all_rels = sorted(set(ex["relation"] for ex in chain(train_raw, dev_raw, test_raw)))
rel2id = dict(zip(all_rels, range(len(all_rels))))
id2rel = dict(enumerate(all_rels))

def normalize_example(ex):
    """Convert one raw REFinD record into the flat layout used by the rest of the notebook.

    The two entity spans are regrouped into ``e1`` / ``e2`` dicts with
    ``start``, ``end`` and ``type`` keys, and the relation string is also
    stored as an integer id looked up in the module-level ``rel2id``.
    """
    spans = {
        "e1": {"start": ex["e1_start"], "end": ex["e1_end"], "type": ex["e1_type"]},
        "e2": {"start": ex["e2_start"], "end": ex["e2_end"], "type": ex["e2_type"]},
    }
    relation = ex["relation"]

    record = {"docid": ex["docid"], "tokens": ex["token"]}
    record.update(spans)
    record["rel"] = relation
    record["rel_id"] = rel2id[relation]
    record["rel_group"] = ex["rel_group"]
    return record

# Apply the per-example normalisation to every split.
train_proc, dev_proc, test_proc = (
    list(map(normalize_example, split)) for split in (train_raw, dev_raw, test_raw)
)

# Cell output: split sizes followed by the number of distinct relation labels.
len(train_proc), len(dev_proc), len(test_proc), len(all_rels)


(20070, 4306, 4300, 22)

In [None]:
def run_refind_experiment(
    model_name,
    train_examples,
    dev_examples,
    test_examples,
    rel2id,
    max_len=256,
    batch_size=16,
    epochs=3,
    lr=2e-5,
    device=None,
    return_test_predictions=False,
    seed=None,
):
    """Fine-tune a pretrained encoder as a relation classifier and score it on the test split.

    Each example is tokenised word-by-word, the encoder output is pooled into
    three vectors (first token, mean over subject sub-tokens, mean over object
    sub-tokens) and their concatenation is fed to a two-layer MLP. After every
    epoch the model is scored on the dev split; the weights with the highest
    dev F1 are restored before the single test evaluation.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        train_examples, dev_examples, test_examples: Sequences of dicts with
            keys ``"tokens"`` (list of words), ``"e1"`` / ``"e2"`` (dicts with
            ``"start"`` and ``"end"`` word indices, treated as a half-open
            range ``start <= i < end``) and ``"rel_id"`` (integer label).
        rel2id: Mapping relation name -> id; only its length is used here, as
            the number of output classes.
        max_len: Maximum tokenised length; longer inputs are truncated.
        batch_size: Batch size for all three data loaders.
        epochs: Number of training epochs.
        lr: Learning rate for AdamW.
        device: Target passed to ``.to(...)``; when ``None``, CUDA is used if
            available, otherwise CPU.
        return_test_predictions: When true, also return gold and predicted
            test label ids.
        seed: Optional integer. When given, the Python, NumPy and torch RNGs
            are seeded before any model or loader is created so that runs can
            be repeated. ``None`` (default) leaves the RNG state untouched.

    Returns:
        ``results`` (dict with ``dev_f1``, ``test_acc``, ``test_prec``,
        ``test_rec``, ``test_f1``) or, when ``return_test_predictions`` is
        true, the tuple ``(results, y_true, y_pred)`` with two NumPy arrays.

    Notes:
        * Precision/recall/F1 are micro-averaged over all labels. With exactly
          one label per example and no label excluded, these coincide with
          accuracy, so the four reported numbers are equal.
        * If truncation removes every sub-token of an entity, its mask is all
          zeros and the pooled entity vector is the zero vector.
    """
    import random

    import numpy as np
    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pad_sequence
    from torch.utils.data import Dataset, DataLoader
    from transformers import AutoTokenizer, AutoModel
    from sklearn.metrics import precision_recall_fscore_support, accuracy_score
    from tqdm.auto import tqdm

    print(f"\n=== Running experiment with encoder: {model_name} ===")

    # Opt-in reproducibility: seed everything that influences weight init,
    # dropout and the shuffling order of the training loader.
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # -----------------------------
    # Dataset + collate
    # -----------------------------
    class RefindDataset(Dataset):
        """Tokenises examples lazily and marks the sub-tokens of both entities."""

        def __init__(self, examples, tokenizer, max_len=256):
            self.examples = examples
            self.tokenizer = tokenizer
            self.max_len = max_len

        def __len__(self):
            return len(self.examples)

        def __getitem__(self, idx):
            ex = self.examples[idx]
            tokens = ex["tokens"]
            e1 = ex["e1"]
            e2 = ex["e2"]
            label_id = ex["rel_id"]

            encoding = self.tokenizer(
                tokens,
                is_split_into_words=True,
                truncation=True,
                max_length=self.max_len,
                return_attention_mask=True,
                return_token_type_ids=True,
            )

            # word_ids() maps every sub-token back to the index of the word it
            # came from (None for special tokens).
            word_ids = encoding.word_ids()

            subj_mask = [0] * len(word_ids)
            obj_mask  = [0] * len(word_ids)

            for i, w_id in enumerate(word_ids):
                if w_id is None:
                    continue
                if e1["start"] <= w_id < e1["end"]:
                    subj_mask[i] = 1
                if e2["start"] <= w_id < e2["end"]:
                    obj_mask[i] = 1

            item = {
                "input_ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
                "attention_mask": torch.tensor(encoding["attention_mask"], dtype=torch.long),
                "subj_mask": torch.tensor(subj_mask, dtype=torch.float),
                "obj_mask": torch.tensor(obj_mask, dtype=torch.float),
                "labels": torch.tensor(label_id, dtype=torch.long),
            }
            if "token_type_ids" in encoding:
                item["token_type_ids"] = torch.tensor(encoding["token_type_ids"], dtype=torch.long)

            return item

    def collate_fn(batch):
        """Right-pad every per-token field to the longest sequence in the batch."""
        pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

        def pad_field(key, pad_value):
            # pad_sequence keeps the dtype of the input tensors.
            return pad_sequence(
                [ex[key] for ex in batch],
                batch_first=True,
                padding_value=pad_value,
            )

        batch_dict = {
            "input_ids": pad_field("input_ids", pad_token_id),
            "attention_mask": pad_field("attention_mask", 0),
            "subj_mask": pad_field("subj_mask", 0.0),
            "obj_mask": pad_field("obj_mask", 0.0),
            "labels": torch.stack([ex["labels"] for ex in batch]),
        }

        if "token_type_ids" in batch[0]:
            batch_dict["token_type_ids"] = pad_field("token_type_ids", 0)

        return batch_dict

    train_ds = RefindDataset(train_examples, tokenizer, max_len=max_len)
    dev_ds   = RefindDataset(dev_examples,   tokenizer, max_len=max_len)
    test_ds  = RefindDataset(test_examples,  tokenizer, max_len=max_len)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  collate_fn=collate_fn)
    dev_loader   = DataLoader(dev_ds,   batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # -----------------------------
    # Model
    # -----------------------------
    class RefindRelationClassifier(nn.Module):
        """Encoder + MLP head over [first-token ; subject-mean ; object-mean]."""

        def __init__(self, encoder_name, num_labels, dropout=0.1):
            super().__init__()
            self.encoder = AutoModel.from_pretrained(encoder_name)
            hidden_size = self.encoder.config.hidden_size
            self.classifier = nn.Sequential(
                nn.Linear(hidden_size * 3, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_size, num_labels)
            )
            # Parameter-free, so it adds nothing to the state dict.
            self.loss_fct = nn.CrossEntropyLoss()

        def forward(self, input_ids, attention_mask, subj_mask, obj_mask, token_type_ids=None, labels=None):
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
            sequence_output = encoder_outputs.last_hidden_state  # [B, L, H]

            cls_repr = sequence_output[:, 0, :]  # [B, H]

            subj_mask_ = subj_mask.unsqueeze(-1)
            obj_mask_  = obj_mask.unsqueeze(-1)

            # Masked mean over the entity sub-tokens; the clamp avoids a
            # division by zero when an entity has no sub-token left.
            subj_sum = (sequence_output * subj_mask_).sum(dim=1)
            subj_len = subj_mask_.sum(dim=1).clamp(min=1e-6)
            subj_repr = subj_sum / subj_len

            obj_sum = (sequence_output * obj_mask_).sum(dim=1)
            obj_len = obj_mask_.sum(dim=1).clamp(min=1e-6)
            obj_repr = obj_sum / obj_len

            pair_repr = torch.cat([cls_repr, subj_repr, obj_repr], dim=-1)
            logits = self.classifier(pair_repr)

            loss = None
            if labels is not None:
                loss = self.loss_fct(logits, labels)

            return {"loss": loss, "logits": logits}

    # -----------------------------
    # Setup training
    # -----------------------------
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    num_labels = len(rel2id)
    model = RefindRelationClassifier(model_name, num_labels=num_labels).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    def predict(model, data_loader):
        """Run the model in eval mode and return (gold ids, predicted ids) as lists."""
        model.eval()
        all_labels = []
        all_preds = []
        with torch.no_grad():
            for batch in data_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    subj_mask=batch["subj_mask"],
                    obj_mask=batch["obj_mask"],
                    token_type_ids=batch.get("token_type_ids", None),
                )
                preds = outputs["logits"].argmax(dim=-1)
                all_labels.extend(batch["labels"].cpu().tolist())
                all_preds.extend(preds.cpu().tolist())
        return all_labels, all_preds

    def score(all_labels, all_preds):
        """Accuracy plus micro-averaged precision / recall / F1."""
        prec, rec, f1, _ = precision_recall_fscore_support(
            all_labels, all_preds, average="micro", zero_division=0
        )
        acc = accuracy_score(all_labels, all_preds)
        return acc, prec, rec, f1

    def evaluate(model, data_loader):
        return score(*predict(model, data_loader))

    best_dev_f1 = 0.0
    best_state_dict = None

    # -----------------------------
    # Training loop
    # -----------------------------
    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        steps = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                subj_mask=batch["subj_mask"],
                obj_mask=batch["obj_mask"],
                token_type_ids=batch.get("token_type_ids", None),
                labels=batch["labels"],
            )
            loss = outputs["loss"]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
            running_loss += loss.item()
            steps += 1

        avg_loss = running_loss / max(steps, 1)
        dev_acc, dev_prec, dev_rec, dev_f1 = evaluate(model, dev_loader)
        print(f"\nEpoch {epoch} - train loss: {avg_loss:.4f}")
        print(f"Dev -> acc: {dev_acc:.4f}, prec: {dev_prec:.4f}, rec: {dev_rec:.4f}, F1: {dev_f1:.4f}")

        if dev_f1 > best_dev_f1:
            best_dev_f1 = dev_f1
            # .cpu() returns the very same tensor when the model already lives
            # on the CPU, which would make the snapshot track later updates.
            # Cloning guarantees an independent copy on every device.
            best_state_dict = {
                k: v.detach().cpu().clone() for k, v in model.state_dict().items()
            }
            print(f"  🟢 New best model saved with dev F1 = {best_dev_f1:.4f}")

    # -----------------------------
    # Load best weights (from memory) and evaluate on test
    # -----------------------------
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)
        model.to(device)

    # One inference pass over the test split serves both the aggregate metrics
    # and the optional per-example predictions.
    test_labels, test_preds = predict(model, test_loader)
    test_acc, test_prec, test_rec, test_f1 = score(test_labels, test_preds)
    print(f"\nTEST -> acc: {test_acc:.4f}, prec: {test_prec:.4f}, rec: {test_rec:.4f}, F1: {test_f1:.4f}")

    results = {
        "dev_f1": best_dev_f1,
        "test_acc": test_acc,
        "test_prec": test_prec,
        "test_rec": test_rec,
        "test_f1": test_f1,
    }

    if not return_test_predictions:
        return results

    # Extra: gold / predicted label ids for per-relation analysis.
    y_true = np.array(test_labels)
    y_pred = np.array(test_preds)

    return results, y_true, y_pred


In [None]:
# FinBERT run that only collects the aggregate metrics (no per-example predictions).
results_finbert = run_refind_experiment(
    model_name="yiyanghkust/finbert-pretrain",
    train_examples=train_proc,
    dev_examples=dev_proc,
    test_examples=test_proc,
    rel2id=rel2id,
    max_len=256,     # if still heavy: try 192
    batch_size=16,   # if OOM: change to 8
    epochs=3,        # same as BERT/FLANG runs
)

results_finbert



=== Running experiment with encoder: yiyanghkust/finbert-pretrain ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Using device: cuda


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Epoch 1:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 1 - train loss: 0.5600
Dev -> acc: 0.8477, prec: 0.8477, rec: 0.8477, F1: 0.8477
  🟢 New best model saved with dev F1 = 0.8477


Epoch 2:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 2 - train loss: 0.2760
Dev -> acc: 0.8546, prec: 0.8546, rec: 0.8546, F1: 0.8546
  🟢 New best model saved with dev F1 = 0.8546


Epoch 3:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 3 - train loss: 0.1899
Dev -> acc: 0.8423, prec: 0.8423, rec: 0.8423, F1: 0.8423

TEST -> acc: 0.7672, prec: 0.7672, rec: 0.7672, F1: 0.7672


{'dev_f1': 0.8546214584300975,
 'test_acc': 0.7672093023255814,
 'test_prec': 0.7672093023255814,
 'test_rec': 0.7672093023255814,
 'test_f1': 0.7672093023255814}

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

def per_relation_report(y_true, y_pred, rel2id):
    """Build a per-relation metrics table from gold and predicted label ids.

    Args:
        y_true: Gold label ids.
        y_pred: Predicted label ids.
        rel2id: Mapping relation name -> id. The ids are expected to be exactly
            ``0 .. len(rel2id) - 1``; otherwise the name lookup raises
            ``KeyError``.

    Returns:
        A DataFrame with columns ``relation``, ``precision``, ``recall``,
        ``f1`` and ``support``, one row per relation, ordered by descending
        support. The aggregate rows of the sklearn report are left out.
    """
    names_by_id = {idx: name for name, idx in rel2id.items()}
    label_ids = list(range(len(names_by_id)))
    label_names = [names_by_id[idx] for idx in label_ids]

    report = classification_report(
        y_true,
        y_pred,
        labels=label_ids,
        target_names=label_names,
        digits=3,
        output_dict=True,
        zero_division=0,
    )

    # Summary entries that sklearn adds next to the per-class ones.
    aggregate_keys = ("accuracy", "macro avg", "weighted avg", "micro avg")
    records = [
        {
            "relation": name,
            "precision": stats["precision"],
            "recall": stats["recall"],
            "f1": stats["f1-score"],
            "support": stats["support"],
        }
        for name, stats in report.items()
        if name not in aggregate_keys
    ]

    table = pd.DataFrame(records)
    table = table.sort_values("support", ascending=False)
    return table.reset_index(drop=True)


In [None]:
import torch

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# General-domain BERT baseline; predictions are kept for the per-relation analysis.
results_bert, y_true_b, y_pred_b = run_refind_experiment(
    model_name="bert-base-uncased",
    train_examples=train_proc,
    dev_examples=dev_proc,
    test_examples=test_proc,
    rel2id=rel2id,
    max_len=256,
    batch_size=16,
    epochs=3,
    lr=2e-5,
    device=device,
    return_test_predictions=True,
)




=== Running experiment with encoder: bert-base-uncased ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Using device: cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 1 - train loss: 0.6098
Dev -> acc: 0.8251, prec: 0.8251, rec: 0.8251, F1: 0.8251
  🟢 New best model saved with dev F1 = 0.8251


Epoch 2:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 2 - train loss: 0.2928
Dev -> acc: 0.8437, prec: 0.8437, rec: 0.8437, F1: 0.8437
  🟢 New best model saved with dev F1 = 0.8437


Epoch 3:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 3 - train loss: 0.2110
Dev -> acc: 0.8504, prec: 0.8504, rec: 0.8504, F1: 0.8504
  🟢 New best model saved with dev F1 = 0.8504

TEST -> acc: 0.7598, prec: 0.7598, rec: 0.7598, F1: 0.7598


In [None]:
# Display the gold and predicted test label-id arrays returned by the BERT run.
y_true_b, y_pred_b

(array([ 0,  0,  0, ..., 11, 11, 11]), array([0, 0, 0, ..., 0, 0, 0]))

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Invert mapping: id -> relation name
id2rel = {v: k for k, v in rel2id.items()}

# Make sure labels are in a fixed order 0..N-1
num_labels = len(id2rel)
labels = list(range(num_labels))
target_names = [id2rel[i] for i in labels]

print("Number of test examples:", len(y_true_b))
# np.unique(...).tolist() gives sorted plain Python ints, so the list prints as
# [0, 1, ...] instead of a long run of np.int64(...) reprs.
print("Unique labels in y_true:", np.unique(y_true_b).tolist())

# Per-relation precision / recall / F1 for the BERT test predictions.
report_b = classification_report(
    y_true_b,
    y_pred_b,
    labels=labels,
    target_names=target_names,
    digits=4,
    zero_division=0,
)

print(report_b)


Number of test examples: 4300
Unique labels in y_true: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21)]
                          precision    recall  f1-score   support

             no_relation     0.7783    0.7998    0.7889      1953
    org:date:acquired_on     0.4074    0.4583    0.4314        24
      org:date:formed_on     0.9318    0.8542    0.8913        96
       org:gpe:formed_in     0.4667    0.4118    0.4375        17
org:gpe:headquartered_in     0.7097    0.7586    0.7333        29
   org:gpe:operations_in     0.9281    0.7686    0.8409       605
       org:money:cost_of     0.2500    0.2500    0.2500         4
       org:money:loss_of     0.7895    0.9677    0.8696        31
     org:money:profit_of     0.6000    0.6000    0.

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# Per-label metric arrays, aligned with `labels` / `target_names`.
prec, rec, f1, support = precision_recall_fscore_support(
    y_true_b, y_pred_b, labels=labels, zero_division=0
)

# One row per relation, best F1 first.
df_rel_bert = (
    pd.DataFrame(
        {
            "relation": target_names,
            "precision": prec,
            "recall": rec,
            "f1": f1,
            "support": support,
        }
    )
    .sort_values("f1", ascending=False)
    .reset_index(drop=True)
)
df_rel_bert


Unnamed: 0,relation,precision,recall,f1,support
0,pers:title:title,0.983193,0.871833,0.924171,671
1,pers:univ:attended,1.0,0.857143,0.923077,7
2,org:date:formed_on,0.931818,0.854167,0.891304,96
3,org:money:loss_of,0.789474,0.967742,0.869565,31
4,org:gpe:operations_in,0.928144,0.768595,0.840868,605
5,no_relation,0.778276,0.799795,0.788889,1953
6,pers:org:employee_of,0.610619,0.92246,0.734824,374
7,org:gpe:headquartered_in,0.709677,0.758621,0.733333,29
8,org:money:revenue_of,0.609375,0.829787,0.702703,47
9,org:money:profit_of,0.6,0.6,0.6,5


In [None]:
# Persist the BERT per-relation table; index=False leaves the row index out of the file.
df_rel_bert.to_csv("bert_refind_per_relation.csv", index=False)


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# FLANG-BERT run with the same hyper-parameters as the BERT baseline.
results_flang, y_true_fl, y_pred_fl = run_refind_experiment(
    "SALT-NLP/FLANG-BERT",
    train_proc,
    dev_proc,
    test_proc,
    rel2id,
    max_len=256,
    batch_size=16,   # reduce to 8 on out-of-memory
    epochs=3,
    lr=2e-5,
    device=device,
    return_test_predictions=True,
)

results_flang, y_true_fl, y_pred_fl

Using device: cuda

=== Running experiment with encoder: SALT-NLP/FLANG-BERT ===


tokenizer_config.json:   0%|          | 0.00/369 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Using device: cuda


config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at SALT-NLP/FLANG-BERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1:   0%|          | 0/1255 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]


Epoch 1 - train loss: 0.5996
Dev -> acc: 0.8372, prec: 0.8372, rec: 0.8372, F1: 0.8372
  🟢 New best model saved with dev F1 = 0.8372


Epoch 2:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 2 - train loss: 0.2923
Dev -> acc: 0.8525, prec: 0.8525, rec: 0.8525, F1: 0.8525
  🟢 New best model saved with dev F1 = 0.8525


Epoch 3:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 3 - train loss: 0.2096
Dev -> acc: 0.8481, prec: 0.8481, rec: 0.8481, F1: 0.8481

TEST -> acc: 0.7781, prec: 0.7781, rec: 0.7781, F1: 0.7781


({'dev_f1': 0.8525313516024152,
  'test_acc': 0.7781395348837209,
  'test_prec': 0.7781395348837209,
  'test_rec': 0.7781395348837209,
  'test_f1': 0.7781395348837209},
 array([ 0,  0,  0, ..., 11, 11, 11]),
 array([0, 0, 0, ..., 0, 0, 0]))

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Invert mapping: id -> relation name
id2rel = {v: k for k, v in rel2id.items()}

# Make sure labels are in a fixed order 0..N-1
num_labels = len(id2rel)
labels = list(range(num_labels))
target_names = [id2rel[i] for i in labels]

# Sanity-check the FLANG arrays themselves (this cell previously inspected the
# BERT array y_true_b, a copy-paste slip that also made the cell depend on the
# BERT run having been executed).
print("Number of test examples:", len(y_true_fl))
# np.unique(...).tolist() gives sorted plain Python ints for a readable printout.
print("Unique labels in y_true:", np.unique(y_true_fl).tolist())

# Per-relation precision / recall / F1 for the FLANG test predictions.
report_fl = classification_report(
    y_true_fl,
    y_pred_fl,
    labels=labels,
    target_names=target_names,
    digits=4,
    zero_division=0,
)

print(report_fl)

Number of test examples: 4300
Unique labels in y_true: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21)]
                          precision    recall  f1-score   support

             no_relation     0.7903    0.8315    0.8104      1953
    org:date:acquired_on     0.4138    0.5000    0.4528        24
      org:date:formed_on     0.7981    0.8646    0.8300        96
       org:gpe:formed_in     0.3750    0.3529    0.3636        17
org:gpe:headquartered_in     0.7308    0.6552    0.6909        29
   org:gpe:operations_in     0.9140    0.8083    0.8579       605
       org:money:cost_of     0.0000    0.0000    0.0000         4
       org:money:loss_of     0.7838    0.9355    0.8529        31
     org:money:profit_of     1.0000    0.6000    0.

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# Per-label metric arrays, aligned with `labels` / `target_names`.
prec, rec, f1, support = precision_recall_fscore_support(
    y_true_fl, y_pred_fl, labels=labels, zero_division=0
)

# One row per relation, best F1 first.
df_rel_flang = (
    pd.DataFrame(
        {
            "relation": target_names,
            "precision": prec,
            "recall": rec,
            "f1": f1,
            "support": support,
        }
    )
    .sort_values("f1", ascending=False)
    .reset_index(drop=True)
)
df_rel_flang

Unnamed: 0,relation,precision,recall,f1,support
0,pers:title:title,0.966184,0.894188,0.928793,671
1,pers:univ:attended,1.0,0.857143,0.923077,7
2,org:gpe:operations_in,0.914019,0.808264,0.857895,605
3,org:money:loss_of,0.783784,0.935484,0.852941,31
4,org:date:formed_on,0.798077,0.864583,0.83,96
5,no_relation,0.790268,0.831541,0.810379,1953
6,org:money:profit_of,1.0,0.6,0.75,5
7,pers:org:employee_of,0.608392,0.930481,0.735729,374
8,org:money:revenue_of,0.622951,0.808511,0.703704,47
9,org:gpe:headquartered_in,0.730769,0.655172,0.690909,29


In [None]:
# Persist the FLANG per-relation table; index=False leaves the row index out of the file.
df_rel_flang.to_csv("flang_refind_per_relation.csv", index=False)

In [None]:


# Pick the GPU when one is present.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# FinBERT run that also returns the test predictions. It trains from scratch
# again and rebinds `results_finbert`, replacing the value of the earlier
# metrics-only FinBERT cell.
results_finbert, y_true_f, y_pred_f = run_refind_experiment(
    "yiyanghkust/finbert-pretrain",
    train_proc,
    dev_proc,
    test_proc,
    rel2id,
    max_len=256,
    batch_size=16,   # reduce to 8 on out-of-memory
    epochs=3,
    lr=2e-5,
    device=device,
    return_test_predictions=True,
)

results_finbert, y_true_f, y_pred_f


Using device: cuda

=== Running experiment with encoder: yiyanghkust/finbert-pretrain ===


config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Using device: cuda


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Epoch 1:   0%|          | 0/1255 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]


Epoch 1 - train loss: 0.5479
Dev -> acc: 0.8300, prec: 0.8300, rec: 0.8300, F1: 0.8300
  🟢 New best model saved with dev F1 = 0.8300


Epoch 2:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 2 - train loss: 0.2757
Dev -> acc: 0.8418, prec: 0.8418, rec: 0.8418, F1: 0.8418
  🟢 New best model saved with dev F1 = 0.8418


Epoch 3:   0%|          | 0/1255 [00:00<?, ?it/s]


Epoch 3 - train loss: 0.1891
Dev -> acc: 0.8321, prec: 0.8321, rec: 0.8321, F1: 0.8321

TEST -> acc: 0.7598, prec: 0.7598, rec: 0.7598, F1: 0.7598


({'dev_f1': 0.841848583372039,
  'test_acc': 0.7597674418604651,
  'test_prec': 0.7597674418604651,
  'test_rec': 0.7597674418604651,
  'test_f1': 0.7597674418604651},
 array([ 0,  0,  0, ..., 11, 11, 11]),
 array([0, 0, 0, ..., 0, 0, 0]))

In [None]:
import numpy as np
from sklearn.metrics import classification_report

# Invert mapping: id -> relation name
id2rel = {v: k for k, v in rel2id.items()}

# Make sure labels are in a fixed order 0..N-1
num_labels = len(id2rel)
labels = list(range(num_labels))
target_names = [id2rel[i] for i in labels]

# Sanity-check the FinBERT arrays themselves (this cell previously inspected
# the BERT array y_true_b, a copy-paste slip that also made the cell depend on
# the BERT run having been executed).
print("Number of test examples:", len(y_true_f))
# np.unique(...).tolist() gives sorted plain Python ints for a readable printout.
print("Unique labels in y_true:", np.unique(y_true_f).tolist())

# Per-relation precision / recall / F1 for the FinBERT test predictions.
report_f = classification_report(
    y_true_f,
    y_pred_f,
    labels=labels,
    target_names=target_names,
    digits=4,
    zero_division=0,
)

print(report_f)

Number of test examples: 4300
Unique labels in y_true: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21)]
                          precision    recall  f1-score   support

             no_relation     0.7807    0.7947    0.7876      1953
    org:date:acquired_on     0.3824    0.5417    0.4483        24
      org:date:formed_on     0.8500    0.8854    0.8673        96
       org:gpe:formed_in     0.4286    0.3529    0.3871        17
org:gpe:headquartered_in     0.6410    0.8621    0.7353        29
   org:gpe:operations_in     0.9203    0.8017    0.8569       605
       org:money:cost_of     0.0000    0.0000    0.0000         4
       org:money:loss_of     0.7632    0.9355    0.8406        31
     org:money:profit_of     0.5714    0.8000    0.

In [None]:
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# Per-label metric arrays, aligned with `labels` / `target_names`.
prec, rec, f1, support = precision_recall_fscore_support(
    y_true_f, y_pred_f, labels=labels, zero_division=0
)

# One row per relation, best F1 first.
df_rel_f = (
    pd.DataFrame(
        {
            "relation": target_names,
            "precision": prec,
            "recall": rec,
            "f1": f1,
            "support": support,
        }
    )
    .sort_values("f1", ascending=False)
    .reset_index(drop=True)
)
df_rel_f

Unnamed: 0,relation,precision,recall,f1,support
0,pers:univ:attended,0.875,1.0,0.933333,7
1,pers:title:title,0.987478,0.822653,0.897561,671
2,org:date:formed_on,0.85,0.885417,0.867347,96
3,org:gpe:operations_in,0.920304,0.801653,0.85689,605
4,org:money:loss_of,0.763158,0.935484,0.84058,31
5,no_relation,0.780684,0.794675,0.787617,1953
6,pers:org:employee_of,0.617594,0.919786,0.73899,374
7,org:gpe:headquartered_in,0.641026,0.862069,0.735294,29
8,pers:univ:employee_of,0.727273,0.666667,0.695652,12
9,org:money:profit_of,0.571429,0.8,0.666667,5


In [None]:
# Persist the FinBERT per-relation table; index=False leaves the row index out of the file.
df_rel_f.to_csv("finbert_refind_per_relation.csv", index=False)