# Cue Detection: Fine-Tuning XLNet, SciBERT, and BERT
* To run each model separately, comment out the entries in the `models` list under "main" that you do not want to train.
* This script addresses the class-imbalance problem in the data (oversampling of hedge sentences and a class-weighted loss).

In [None]:
import os
import torch
import json
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW, get_scheduler
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.metrics import (
    confusion_matrix,
    precision_recall_fscore_support,
    classification_report,
    f1_score
)
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight


# Download the NLTK "punkt" resource quietly when this cell runs.
# NOTE(review): nothing in the visible code calls an NLTK tokenizer — confirm this download is still needed.
nltk.download("punkt", quiet=True)

def preprocess_hedgepeer(data_path, tokenizer, max_len=128, oversample=True):
    """
    Load the HedgePeer JSON-lines file and build token-level hedge labels.

    Each sentence is tokenized with ``tokenizer.tokenize``. Every occurrence of
    a hedge's token sequence inside the sentence tokens is tagged "HEDGE"; all
    remaining tokens are tagged "O".

    The data is split (about 70/20/10 train/val/test) BEFORE oversampling and
    only the training split is oversampled. Oversampling before the split
    would place copies of the same sentence in several splits, leaking
    training examples into validation and test.

    Args:
        data_path: Path to a JSON-lines file. Each record is read for a
            "Sentences" list whose items carry "Sentence" and "Hedges"
            (each hedge exposing a "Hedge" string).
        tokenizer: Object exposing ``tokenize(text)`` returning a token list.
        max_len: Not used by this function; kept for backward compatibility.
        oversample: When True, every training sentence that contains a hedge
            is appended two more times to the training split.

    Returns:
        Tuple ``(train_tokens, val_tokens, test_tokens,
        train_labels, val_labels, test_labels)``.
    """
    df = pd.read_json(data_path, lines=True)
    sentences, labels = [], []

    for _, row in df.iterrows():
        for sentence in row["Sentences"]:
            tokens = tokenizer.tokenize(sentence["Sentence"])
            token_labels = ["O"] * len(tokens)
            for hedge in sentence["Hedges"]:
                hedge_tokens = tokenizer.tokenize(hedge["Hedge"])
                span = len(hedge_tokens)
                if span == 0:
                    # An empty hedge matches nothing meaningful; skip it.
                    continue
                # Only windows that fully fit inside the sentence can match.
                for start in range(len(tokens) - span + 1):
                    if tokens[start:start + span] == hedge_tokens:
                        token_labels[start:start + span] = ["HEDGE"] * span

            sentences.append(tokens)
            labels.append(token_labels)

    # 70-20-10 split: 10% test, then 22% of the remaining 90% (~20%) for validation.
    train_val_tokens, test_tokens, train_val_labels, test_labels = train_test_split(
        sentences, labels, test_size=0.1, random_state=42
    )
    train_tokens, val_tokens, train_labels, val_labels = train_test_split(
        train_val_tokens, train_val_labels, test_size=0.22, random_state=42
    )

    # Oversample hedge sentences in the training split only, so validation and
    # test never contain duplicates of training sentences.
    if oversample:
        train_tokens, train_labels = list(train_tokens), list(train_labels)
        hedge_pairs = [
            (toks, tags) for toks, tags in zip(train_tokens, train_labels) if "HEDGE" in tags
        ]
        # Add each hedge sentence twice more.
        for _ in range(2):
            for toks, tags in hedge_pairs:
                train_tokens.append(toks)
                train_labels.append(tags)

    return train_tokens, val_tokens, test_tokens, train_labels, val_labels, test_labels



def save_model(model, tokenizer, output_dir):
    """
    Write the fine-tuned model and its tokenizer into ``output_dir``.

    The directory is created when missing. Before saving, any parameter that
    reports itself as non-contiguous has its data replaced by a contiguous
    copy (presumably so serialization does not reject it — TODO confirm).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect the offenders first, then rewrite their storage in place.
    needs_fix = [
        tensor for _name, tensor in model.named_parameters()
        if not tensor.is_contiguous()
    ]
    for tensor in needs_fix:
        tensor.data = tensor.data.contiguous()

    # Model first, tokenizer second — both land in the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)
    print(f" Model saved to {output_dir}")


class HedgePeerDataset(Dataset):
    """
    Dataset pairing pre-split token lists with per-token "HEDGE"/"O" tags.

    Each item is encoded by the tokenizer on access and returned as a dict
    with "input_ids", "attention_mask" and "labels".
    """

    def __init__(self, tokens, labels, tokenizer, max_len=128):
        self.tokens, self.labels = tokens, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        words, tags = self.tokens[idx], self.labels[idx]

        # Encode the already-split words, padded/truncated to max_len.
        encoded = self.tokenizer(
            words,
            is_split_into_words=True,
            max_length=self.max_len,
            return_offsets_mapping=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # Align tags to encoded positions: only the first piece of each word
        # carries a label (1 = HEDGE, 0 = anything else). Positions without a
        # word id, and later pieces of the same word, get -100.
        ignore = -100
        word_ids = list(encoded.word_ids())
        preceding = [None] + word_ids[:-1]
        aligned = [
            ignore
            if (current is None or current == before)
            else (1 if tags[current] == "HEDGE" else 0)
            for current, before in zip(word_ids, preceding)
        ]

        item = {}
        item["input_ids"] = encoded["input_ids"].squeeze()
        item["attention_mask"] = encoded["attention_mask"].squeeze()
        item["labels"] = torch.tensor(aligned)
        return item


def _balanced_class_weights(train_loader, num_labels):
    """
    Compute "balanced" class weights from the non-ignored training labels.

    Returns a float array of length ``num_labels``. Classes that never occur
    in the training labels keep a neutral weight of 1.0, so the vector always
    has one entry per model output class (``nn.CrossEntropyLoss`` expects a
    weight per class).
    """
    observed = []
    for batch in train_loader:
        flat = batch["labels"].numpy().ravel()
        observed.extend(flat[flat != -100].tolist())  # -100 marks ignored positions

    weights = np.ones(num_labels, dtype=np.float64)
    present = np.unique(observed).astype(int)
    if present.size:
        weights[present] = compute_class_weight(
            "balanced", classes=present, y=np.asarray(observed, dtype=int)
        )
    return weights


def _predict_active_tokens(model, loader, device, loss_fn=None):
    """
    Run ``model`` over ``loader`` without gradients.

    Returns ``(preds, golds, mean_loss)`` restricted to positions whose label
    is not -100. ``mean_loss`` is the per-batch average of ``loss_fn`` or
    ``None`` when no loss function is given.
    """
    preds, golds, loss_sum = [], [], 0.0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            active = labels.view(-1) != -100
            logits = outputs.logits.view(-1, model.config.num_labels)[active].float()
            gold = labels.view(-1)[active].long()

            if loss_fn is not None:
                loss_sum += loss_fn(logits, gold).item()

            preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            golds.extend(gold.cpu().numpy())

    mean_loss = loss_sum / len(loader) if loss_fn is not None else None
    return preds, golds, mean_loss


def train_model(model, train_loader, val_loader, test_loader, device, model_name, output_dir, epochs=3, learning_rate=3e-5):
    """
    Fine-tune ``model`` with a class-weighted cross-entropy loss.

    After every epoch the model is scored on the validation set (loss and
    macro F1) and on the test set (macro precision/recall/F1). The test
    metrics are written to
    ``<output_dir>/<model_name slug>/test_metrics_per_epoch.csv``; the file is
    rewritten on the first epoch so metrics from an earlier run are not mixed
    into the current one.

    Args:
        model: Token-classification model exposing ``config.num_labels`` and
            returning an object with ``.logits``.
        train_loader, val_loader, test_loader: Loaders yielding dicts with
            "input_ids", "attention_mask" and "labels" (-100 = ignored).
        device: Torch device the batches are moved to.
        model_name: Display name; lower-cased with "-" -> "_" for the folder.
        output_dir: Root directory for the metrics CSV.
        epochs: Number of passes over ``train_loader``.
        learning_rate: AdamW learning rate.

    Returns:
        ``(model, train_losses, val_losses, val_f1_scores,
        final_all_preds, final_all_labels)`` where the last two are the
        validation predictions/labels of the final epoch.
    """
    class_weights = _balanced_class_weights(train_loader, model.config.num_labels)

    optimizer = AdamW(model.parameters(), lr=learning_rate, no_deprecation_warning=True)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0,
                               num_training_steps=len(train_loader) * epochs)
    loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float).to(device))

    # Tracking
    train_losses, val_losses, val_f1_scores = [], [], []
    final_all_preds, final_all_labels = [], []

    metrics_path = os.path.join(output_dir, model_name.lower().replace("-", "_"), "test_metrics_per_epoch.csv")
    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        model.train()
        total_train_loss = 0

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Only positions with a real label contribute to the loss.
            active_loss = labels.view(-1) != -100
            active_logits = outputs.logits.view(-1, model.config.num_labels)[active_loss].float()
            active_labels = labels.view(-1)[active_loss].long()

            loss = loss_fn(active_logits, active_labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation (eval mode stays active for the test pass below as well)
        model.eval()
        all_preds, all_labels, avg_val_loss = _predict_active_tokens(
            model, val_loader, device, loss_fn=loss_fn
        )
        val_losses.append(avg_val_loss)
        val_f1 = f1_score(all_labels, all_preds, average="macro")
        val_f1_scores.append(val_f1)

        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, "
              f"Val Loss = {avg_val_loss:.4f}, Val F1 = {val_f1:.4f}")

        if epoch == epochs - 1:
            final_all_preds = all_preds
            final_all_labels = all_labels

        # Evaluate on test set at the end of each epoch
        test_preds, test_true, _ = _predict_active_tokens(model, test_loader, device)

        # Compute and save test metrics; start a fresh file on the first epoch
        # so rows from a previous run are never mixed with this one.
        p, r, f1, _ = precision_recall_fscore_support(test_true, test_preds, average="macro")
        test_metrics_df = pd.DataFrame([[epoch+1, p, r, f1]], columns=["Epoch", "Precision", "Recall", "F1"])
        is_first_epoch = epoch == 0
        test_metrics_df.to_csv(
            metrics_path,
            mode="w" if is_first_epoch else "a",
            index=False,
            header=is_first_epoch,
        )

    return model, train_losses, val_losses, val_f1_scores, final_all_preds, final_all_labels


def plot_model_comparison(metrics_tracker, output_dir):
    """
    Draw one validation-F1 curve per model on a shared plot and save it.

    ``metrics_tracker`` maps a model name to a dict holding "val_f1_scores".
    The figure is written to ``<output_dir>/model_comparison_f1_scores.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for model_name in metrics_tracker:
        f1_history = metrics_tracker[model_name]["val_f1_scores"]
        print(f"{model_name}: Val F1 Scores = {f1_history}")
        ax.plot(f1_history, label=model_name)

    ax.set_title("Model Comparison: Validation F1 Scores")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("F1 Score")
    ax.legend()
    ax.grid()

    # Persist the figure, then release it.
    comparison_plot_path = os.path.join(output_dir, "model_comparison_f1_scores.png")
    fig.savefig(comparison_plot_path)
    plt.close(fig)
    print(f"Comparison plot saved at {comparison_plot_path}")


def plot_learning_metrics(train_losses, val_losses, val_f1_scores, all_preds, all_labels, output_dir, model_name):
    """
    Save the learning-curve figure and the confusion matrix for one model.

    Two PNG files are written into ``output_dir``, both prefixed with the
    model name slug (lower-cased, "-" replaced by "_"):
    ``<slug>_learning_metrics.png`` (loss curves, validation F1 curve,
    per-class precision/recall bars) and ``<slug>_confusion_matrix.png``.

    Per-class metrics and the confusion matrix are always computed for both
    label ids (0 = Non-Hedge, 1 = Hedge). Without pinning the labels, a
    prediction/label set containing a single class yields one-element arrays
    that do not fit the two bars / two tick labels drawn here.

    Args:
        train_losses, val_losses, val_f1_scores: Per-epoch histories.
        all_preds, all_labels: Flat predicted and gold label ids.
        output_dir: Existing directory for the PNG files.
        model_name: Display name used in titles and file names.
    """
    class_ids = [0, 1]
    class_names = ['Non-Hedge', 'Hedge']
    slug = model_name.lower().replace("-", "_")

    plt.figure(figsize=(15, 5))

    # Loss Plot
    plt.subplot(1, 3, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title(f'{model_name} - Loss per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # F1 Score Plot
    plt.subplot(1, 3, 2)
    plt.plot(val_f1_scores, label='Validation F1 Score', color='green')
    plt.title(f'{model_name} - F1 Score per Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('F1 Score')
    plt.legend()

    # Performance Metrics Plot
    plt.subplot(1, 3, 3)
    precision, recall, _, _ = precision_recall_fscore_support(
        all_labels, all_preds, average=None, labels=class_ids, zero_division=0
    )
    positions = np.arange(len(class_names))
    plt.bar(positions - 0.2, precision, 0.4, label='Precision', color='blue')
    plt.bar(positions + 0.2, recall, 0.4, label='Recall', color='red')
    plt.title(f'{model_name} - Precision and Recall')
    plt.xticks(positions, class_names)
    plt.ylabel('Score')
    plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{slug}_learning_metrics.png'))
    plt.close()

    # Confusion Matrix (always 2x2 because the label ids are pinned)
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'{model_name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{slug}_confusion_matrix.png'))
    plt.close()


def evaluate_model(model, test_loader, device, model_name, output_dir):
    """
    Score ``model`` on ``test_loader`` and save/print a classification report.

    Only positions whose label is not -100 are scored. The report is saved to
    ``<output_dir>/<model_name slug>_metrics.csv`` and also printed.

    The report is always built for both label ids (0 = Non-Hedge, 1 = Hedge):
    without pinning the labels, ``classification_report`` rejects the two
    ``target_names`` whenever only one class occurs in the labels/predictions.

    Args:
        model: Token-classification model exposing ``config.num_labels``.
        test_loader: Loader yielding "input_ids", "attention_mask", "labels".
        device: Torch device the batches are moved to.
        model_name: Display name; slugged for the CSV file name.
        output_dir: Existing directory for the CSV file.

    Returns:
        ``(all_preds, all_labels)`` — flat predicted and gold label ids.
    """
    class_ids = [0, 1]
    class_names = ['Non-Hedge', 'Hedge']

    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Drop ignored positions before taking the argmax.
            active_loss = labels.view(-1) != -100
            active_logits = outputs.logits.view(-1, model.config.num_labels)[active_loss].float()
            active_labels = labels.view(-1)[active_loss].long()

            preds = torch.argmax(active_logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(active_labels.cpu().numpy())

    report = classification_report(
        all_labels, all_preds,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0
    )

    # save classification metrics to CSV
    report_df = pd.DataFrame(report).transpose()
    csv_path = os.path.join(output_dir, f"{model_name.lower().replace('-', '_')}_metrics.csv")
    report_df.to_csv(csv_path, index=True)
    print(f"Metrics saved to {csv_path}")

    print(f"\n--- {model_name} Test Set Performance ---")
    print(classification_report(all_labels, all_preds,
                                labels=class_ids,
                                target_names=class_names,
                                zero_division=0))

    return all_preds, all_labels


def run_model_training(model_name, model_path, data_path, output_dir, device, metrics_tracker):
    """
    Fine-tune, evaluate and save one pretrained checkpoint end to end.

    Steps: load tokenizer and 2-label token-classification model, preprocess
    the dataset, train for 3 epochs (lr 5e-5, batch size 8), record the
    histories under ``metrics_tracker[model_name]``, then plot, evaluate on
    the test split and save the model (both ``save_pretrained`` output and a
    ``.pt`` state dict) into ``<output_dir>/<model_name slug>``.
    """
    slug = model_name.lower().replace("-", "_")

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Non-BERT tokenizers that expose the attribute get add_prefix_space enabled.
    is_bert_checkpoint = "bert" in model_path.lower()
    if hasattr(tokenizer, "add_prefix_space") and not is_bert_checkpoint:
        tokenizer.add_prefix_space = True

    model = AutoModelForTokenClassification.from_pretrained(model_path, num_labels=2)
    model = model.to(device)

    print(f"Preprocessing hp dataset for {model_name}...")
    splits = preprocess_hedgepeer(data_path, tokenizer)
    train_tokens, val_tokens, test_tokens, train_labels, val_labels, test_labels = splits

    # Only the training loader shuffles.
    train_loader = DataLoader(
        HedgePeerDataset(train_tokens, train_labels, tokenizer), batch_size=8, shuffle=True
    )
    val_loader = DataLoader(
        HedgePeerDataset(val_tokens, val_labels, tokenizer), batch_size=8, shuffle=False
    )
    test_loader = DataLoader(
        HedgePeerDataset(test_tokens, test_labels, tokenizer), batch_size=8, shuffle=False
    )

    print(f"Fine-tuning {model_name} model...")
    training_result = train_model(
        model, train_loader, val_loader, test_loader, device, model_name, output_dir,
        epochs=3, learning_rate=5e-5,
    )
    model, train_losses, val_losses, val_f1_scores, all_preds, all_labels = training_result

    # Keep the per-epoch histories for the cross-model comparison plot.
    metrics_tracker[model_name] = dict(
        val_f1_scores=val_f1_scores,
        train_losses=train_losses,
        val_losses=val_losses,
    )

    model_output_dir = os.path.join(output_dir, slug)
    os.makedirs(model_output_dir, exist_ok=True)

    plot_learning_metrics(
        train_losses, val_losses, val_f1_scores, all_preds, all_labels, model_output_dir, model_name
    )
    evaluate_model(model, test_loader, device, model_name, model_output_dir)
    save_model(model, tokenizer, output_dir=model_output_dir)

    # Also keep a plain state dict next to the save_pretrained output.
    pt_path = os.path.join(model_output_dir, f"{slug}_model.pt")
    torch.save(model.state_dict(), pt_path)
    print(f" .pt model saved to: {pt_path}")


if __name__ == "__main__":

    # Point DATA_PATH at the dataset used for fine-tuning (HedgePeer or Bioscope).
    DATA_PATH = "/kaggle/input/"
    OUTPUT_DIR = "/kaggle/working/"

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model configurations: comment out the entries you do not want to train.
    models = [
        {"name": "BERT", "path": "bert-base-cased"},
        {"name": "XLNet", "path": "xlnet-base-cased"},
        {"name": "SciBERT", "path": "allenai/scibert_scivocab_cased"},
    ]

    # Filled by run_model_training with the per-epoch histories of each model.
    metrics_tracker = {}

    for model_config in models:
        run_model_training(
            model_name=model_config["name"],
            model_path=model_config["path"],
            data_path=DATA_PATH,
            output_dir=OUTPUT_DIR,
            device=device,
            metrics_tracker=metrics_tracker,
        )

    plot_model_comparison(metrics_tracker, OUTPUT_DIR)

    print("All model training completed!")

# Fine-Tuning Bioscope Trained Cue Detection Model with UNSC Corpus

In [None]:
import os
import json
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import (
    classification_report, confusion_matrix,
    precision_score, recall_score,
    accuracy_score, f1_score
)
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# Configuration for the UNSC fine-tuning run.
UNSC_PATH = "/kaggle/input/unsc-gold-labeled/gold_labeled_unsc_MELISA_combined_fini.json" # gold-labeled UNSC JSON; change the path for your environment
PRETRAINED_MODEL_PATH = "/kaggle/input/bioscope_cue/pytorch/default/1/fine_tuned_bioscope_bert_23_03.2/bert/bert_model.pt" # state dict of the previously fine-tuned model; change the path
BASE_MODEL = "bert-base-cased"  # architecture/tokenizer the state dict is loaded into
OUTPUT_DIR = "/kaggle/working/unsc_finetuned_bert"
MAX_LEN = 128
BATCH_SIZE = 8
EPOCHS = 3
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the tokenizer and a 2-label token-classification model, then overwrite
# the model weights with the saved state dict.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForTokenClassification.from_pretrained(BASE_MODEL, num_labels=2).to(DEVICE)
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load(PRETRAINED_MODEL_PATH, map_location=DEVICE)
model.load_state_dict(state_dict)


def _find_cue_char_spans(text, cues):
    """
    Return the ``(start, end)`` character spans where any cue occurs in ``text``.

    Matching is case-insensitive. Whole-word occurrences are preferred; when a
    cue has none (e.g. the annotation covers only part of a word) a plain
    substring search is used instead. Empty/blank cues are ignored.
    """
    import re  # local import: the surrounding file does not import re

    spans = []
    for cue in cues:
        cue = cue.strip()
        if not cue:
            continue
        escaped = re.escape(cue)
        bounded = r"(?<!\w)" + escaped + r"(?!\w)"
        matches = list(re.finditer(bounded, text, flags=re.IGNORECASE))
        if not matches:
            matches = list(re.finditer(escaped, text, flags=re.IGNORECASE))
        spans.extend(match.span() for match in matches)
    return spans


def extract_examples_from_unsc(json_path, tokenizer, max_len=128, oversample_factor=3):
    """
    Build token-level hedge-cue training examples from the UNSC JSON file.

    For every sentence the gold cues are located as character spans in the
    sentence text, and each token whose offset range overlaps such a span is
    labelled 1; all other real tokens are labelled 0. Tokens with an empty
    offset range (start == end, e.g. padding) are labelled -100.

    Labelling by character span replaces per-token substring tests such as
    ``token in cue``, which marked any short token that happens to be a
    substring of a cue ("i" or "in" inside "think") as a hedge, and marked
    every token when a cue was an empty string.

    Args:
        json_path: Path to a JSON file holding a list of documents, each read
            for "Sentences" items with "Sentence" and optional "Gold_Hedges"
            (each hedge exposing a "Hedge" string).
        tokenizer: Callable returning "input_ids", "attention_mask" and
            "offset_mapping" tensors with a leading batch dimension of 1.
        max_len: Padding/truncation length passed to the tokenizer.
        oversample_factor: How many extra copies of every hedge-containing
            example are appended to the returned lists (default 3).

    Returns:
        ``(input_ids_list, attention_masks_list, labels_list)`` — parallel
        lists of 1-D tensors. NOTE: the extra copies are appended before the
        lists are returned, so a caller that splits these lists afterwards can
        place copies of one sentence in several splits.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    input_ids_list, attention_masks_list, labels_list = [], [], []
    hedge_examples = []

    for doc in data:
        for sent in doc.get("Sentences", []):
            text = sent["Sentence"]
            cue_spans = _find_cue_char_spans(
                text, [h["Hedge"] for h in sent.get("Gold_Hedges", [])]
            )

            encoding = tokenizer(
                text,
                return_offsets_mapping=True,
                padding="max_length",
                truncation=True,
                max_length=max_len,
                return_tensors="pt"
            )

            # squeeze(0) drops only the batch dimension, even when max_len == 1.
            input_ids = encoding["input_ids"].squeeze(0)
            attention_mask = encoding["attention_mask"].squeeze(0)
            offsets = encoding["offset_mapping"].squeeze(0).tolist()

            labels = []
            for start, end in offsets:
                if start == end:
                    labels.append(-100)  # no text behind this position
                    continue
                overlaps_cue = any(
                    start < cue_end and end > cue_start
                    for cue_start, cue_end in cue_spans
                )
                labels.append(1 if overlaps_cue else 0)

            label_tensor = torch.tensor(labels)
            input_ids_list.append(input_ids)
            attention_masks_list.append(attention_mask)
            labels_list.append(label_tensor)

            if 1 in labels:
                hedge_examples.append((input_ids, attention_mask, label_tensor))

    # OVERSAMPLING: append extra copies of every example that contains a hedge.
    for _ in range(oversample_factor):
        for ids, mask, lab in hedge_examples:
            input_ids_list.append(ids)
            attention_masks_list.append(mask)
            labels_list.append(lab)

    return input_ids_list, attention_masks_list, labels_list

class CueDataset(Dataset):
    """
    Dataset over three parallel sequences of already-encoded examples.

    Item ``i`` is a dict with the ``i``-th "input_ids", "attention_mask" and
    "labels" entries; the length is that of ``input_ids``.
    """

    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids, self.attention_masks, self.labels = (
            input_ids,
            attention_masks,
            labels,
        )

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        columns = (
            ("input_ids", self.input_ids),
            ("attention_mask", self.attention_masks),
            ("labels", self.labels),
        )
        return {key: column[idx] for key, column in columns}


# Encode the UNSC corpus; max_len is left at the function default (128).
input_ids_list, attention_masks_list, labels_list = extract_examples_from_unsc(UNSC_PATH, tokenizer)

# Split of data: 10% test first, then 2/9 of the remaining 90% (= 20% overall)
# for validation, leaving 70% for training.
# NOTE(review): extract_examples_from_unsc already appended the oversampled
# hedge examples, so copies of one sentence can land in different splits.
temp_inputs, test_inputs, temp_masks, test_masks, temp_labels, test_labels = train_test_split(
    input_ids_list, attention_masks_list, labels_list, test_size=0.1, random_state=42
)
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    temp_inputs, temp_masks, temp_labels, test_size=2/9, random_state=42
)

train_dataset = CueDataset(train_inputs, train_masks, train_labels)
val_dataset = CueDataset(val_inputs, val_masks, val_labels)
test_dataset = CueDataset(test_inputs, test_masks, test_labels)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Class weights: "balanced" weights over the training labels, ignoring the
# -100 positions, fed into the cross-entropy loss.
all_flat_labels = torch.cat([l[l != -100] for l in train_labels]).numpy()
class_weights = compute_class_weight("balanced", classes=np.unique(all_flat_labels), y=all_flat_labels)
print("\n Class weights:", class_weights)
loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float).to(DEVICE))

# Training loop: AdamW at a fixed learning rate (no scheduler is created here).
optimizer = AdamW(model.parameters(), lr=2e-5)
# Train mode is set once; nothing inside the loop switches to eval mode.
model.train()

for epoch in range(EPOCHS):
    total_loss = 0
    # Predictions and gold labels on the TRAINING batches of this epoch.
    all_preds, all_true = [], []

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Keep only positions with a real label (-100 marks ignored positions).
        active_loss = labels.view(-1) != -100
        logits = outputs.logits.view(-1, 2)[active_loss]
        true_labels = labels.view(-1)[active_loss]

        loss = loss_fn(logits, true_labels)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        trues = true_labels.detach().cpu().numpy()
        all_preds.extend(preds)
        all_true.extend(trues)

    # The F1 printed here is a training-set macro F1; val_loader is not used in this loop.
    f1 = f1_score(all_true, all_preds, average='macro')
    print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader):.4f}, F1 = {f1:.4f}")


# Save both the save_pretrained output and a plain state dict into OUTPUT_DIR.
os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "finetuned_unsc_bert.pt"))
print(f"\n Model saved to {OUTPUT_DIR}")

# Evaluation on the held-out test split.
print("\n Running Evaluation on Test Set...")

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Score only positions with a real label (-100 marks ignored positions).
        active_loss = labels.view(-1) != -100
        logits = outputs.logits.view(-1, 2)[active_loss]
        true_labels = labels.view(-1)[active_loss]

        preds = torch.argmax(logits, dim=1).cpu().numpy()
        trues = true_labels.cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(trues)

# Pin both label ids (0 = Non-Hedge, 1 = Hedge). Without this,
# classification_report rejects the two target names and the confusion matrix
# is no longer 2x2 whenever only one class occurs in the labels/predictions.
report_dict = classification_report(
    all_labels, all_preds,
    labels=[0, 1],
    target_names=["Non-Hedge", "Hedge"],
    output_dict=True,
    zero_division=0,
)
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

report_df = pd.DataFrame(report_dict).transpose()
report_csv_path = os.path.join(OUTPUT_DIR, "classification_report.csv")
report_df.to_csv(report_csv_path)
print(f" Saved classification report to: {report_csv_path}")

plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues", xticklabels=["Non-Hedge", "Hedge"], yticklabels=["Non-Hedge", "Hedge"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
conf_image_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
plt.savefig(conf_image_path)
plt.close()
print(f" Saved confusion matrix image to: {conf_image_path}")

# Summary metrics over all scored tokens.
accuracy = accuracy_score(all_labels, all_preds)
macro_precision = precision_score(all_labels, all_preds, average='macro')
macro_recall = recall_score(all_labels, all_preds, average='macro')
macro_f1 = f1_score(all_labels, all_preds, average='macro')

print(f"\n Final Evaluation Metrics:")
print(f"  Accuracy      : {accuracy:.4f}")
print(f"  Macro F1      : {macro_f1:.4f}")
print(f"  Macro Recall  : {macro_recall:.4f}")
print(f"  Macro Precision: {macro_precision:.4f}")