In [None]:
from datasets import load_dataset,load_from_disk
# Directory of the dataset previously written with `save_to_disk`.
dataset_path = "MeDAL-dataset-small-5perc"
# Hub alternative, kept for reference:
# dataset = load_dataset("lutful2004/MeDAL-dataset-small-5perc")
dataset = load_from_disk(dataset_path=dataset_path)
print(dataset)

In [None]:
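# Disabled: manual label <-> id mappings built from the training labels.
# A later cell reads the class count from dataset["train"].features["LABEL"] instead.
# label2id = {label: idx for idx, label in enumerate(sorted(set(dataset["train"]["LABEL"])))}
# id2label = {idx: label for label, idx in label2id.items()}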

In [None]:
# len(label2id)

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded later on.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Register the abbreviation markers as special tokens so the tokenizer keeps
# each marker as a single token.
special_tokens = dict(additional_special_tokens=["[ABBR]", "[/ABBR]"])
tokenizer.add_special_tokens(special_tokens_dict=special_tokens)


def preprocess_function(examples):
    """Wrap each abbreviation in [ABBR] ... [/ABBR] markers and tokenize the batch.

    Args:
        examples: A batch (dict of parallel lists) as passed by
            ``Dataset.map(batched=True)``; must contain "TEXT", "LOCATION"
            and "LABEL".

    Returns:
        The tokenizer output for the marked texts (padded/truncated to 128
        tokens) with the batch's "LABEL" values stored under "labels".
    """
    marked_texts = []

    for text, loc in zip(examples["TEXT"], examples["LOCATION"]):
        # Step 1: Mark abbreviation.
        # NOTE(review): LOCATION is treated as a *character* offset here. If the
        # dataset stores a whitespace-token index instead, the markers land on
        # the wrong word -- confirm against the dataset card.
        before = text[:loc]
        tokens = text[loc:].split(maxsplit=1)
        if not tokens:
            # Nothing but whitespace (or nothing at all) at/after `loc`: keep the
            # row unmarked instead of letting an IndexError abort the whole map.
            # The row cannot be dropped because labels are aligned by position.
            marked_texts.append(text)
            continue
        abbr = tokens[0]
        rest = tokens[1] if len(tokens) > 1 else ""
        marked_texts.append(f"{before} [ABBR] {abbr} [/ABBR] {rest}")

    # Step 2: Tokenize without tensors (plain Python lists, as Dataset.map expects).
    # NOTE(review): with truncation at 128 tokens, an abbreviation located past
    # the cut-off loses its markers -- consider windowing around `loc`.
    model_inputs = tokenizer(
        marked_texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_attention_mask=True,
    )

    model_inputs["labels"] = examples["LABEL"]  # Note: Trainer expects "labels" key
    return model_inputs


# Drop every original column after mapping; only the fields returned by
# preprocess_function remain in the result.
original_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns,
)

In [None]:
# Display the tokenized dataset object to inspect the result of the map above.
tokenized_dataset

In [None]:
# !pip install seqeval
# !pip install peft

In [None]:
import os
import torch
import numpy as np
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from peft import get_peft_model, LoraConfig, TaskType

# =========================
# Load tokenizer & dataset
# =========================
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Assuming you have already tokenized your dataset
# tokenized_dataset = {"train": ..., "test": ...}
# Each element should have "input_ids", "attention_mask", and "labels"

# =========================
# Label mappings
# =========================
# id2label and label2id should already be defined
# Example:
# id2label = {0: "label_a", 1: "label_b", ...}
# label2id = {v: k for k, v in id2label.items()}

# =========================
# Load model for sequence classification
# =========================
# model = AutoModelForSequenceClassification.from_pretrained(
#         model_checkpoint,
#         num_labels=len(id2label),
#         id2label=id2label,
#         label2id=label2id
#     )

# Number of target classes, read from the dataset's LABEL feature.
num_labels = dataset["train"].features["LABEL"].num_classes
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels  # Only pass this
)


# Resize embeddings if tokenizer has extra tokens
model.resize_token_embeddings(len(tokenizer))

# =========================
# LoRA configuration
# =========================
tar_modules = ["query", "key", "value"]  # target attention modules

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Sequence classification
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="all",
    target_modules=tar_modules,
    # "classifier" is the classification head. "word_embeddings" is BERT's token
    # embedding matrix: the tokenizer gained [ABBR]/[/ABBR], so the resize above
    # created new embedding rows. LoRA freezes the base model, so without
    # listing the embedding here those rows would never be trained and would not
    # be guaranteed to be stored with the adapter.
    # NOTE(review): this makes the full embedding matrix trainable; expect a
    # larger trainable-parameter count and adapter checkpoint.
    modules_to_save=["classifier", "word_embeddings"],
)

lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

# =========================
# Device setup
# =========================
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)
lora_model.to(device)

# =========================
# Data collator
# =========================
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator = DataCollatorWithPadding(tokenizer)

# =========================
# Metrics
# =========================
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(p):
    """Compute accuracy plus weighted F1, precision and recall for an eval pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, argmax taken over
            axis 1) and ``label_ids`` (reference class ids).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    gold_ids = p.label_ids

    results = {
        "accuracy": accuracy.compute(
            predictions=predicted_ids, references=gold_ids
        )["accuracy"]
    }
    # The remaining metrics share the same call shape and use weighted averaging.
    for metric_name, metric in (("f1", f1), ("precision", precision), ("recall", recall)):
        scores = metric.compute(
            predictions=predicted_ids, references=gold_ids, average="weighted"
        )
        results[metric_name] = scores[metric_name]
    return results

# =========================
# Training arguments
# =========================
training_args = TrainingArguments(
    output_dir="model_training",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # fp16 mixed precision needs a GPU; TrainingArguments rejects it otherwise.
    # This notebook explicitly falls back to CPU, so only enable it with CUDA.
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",
    save_strategy="epoch",
    # No metric_for_best_model is set, so "best" is chosen by the Trainer default.
    load_best_model_at_end=True,
    report_to="none",
    push_to_hub=False,
)

# =========================
# Trainer
# =========================
# NOTE(review): newer transformers releases rename `tokenizer=` to
# `processing_class=`; switch once the minimum supported version allows it.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# =========================
# Start training
# =========================
trainer.train()

# Save the fine-tuned LoRA model
trainer.save_model("./lora_model_finetuned")


In [None]:
# Score the trained model on the held-out test split and print the metric dict.
test_split = tokenized_dataset["test"]
metrics = trainer.evaluate(eval_dataset=test_split)
print(metrics)

In [None]:
# Sanity check: gather every training token id and compare the largest one
# with the tokenizer's vocabulary size.
train_split = tokenized_dataset["train"]
all_ids = np.concatenate([row["input_ids"] for row in train_split])
max_token_id = all_ids.max()
print("Max token ID:", max_token_id, "Tokenizer vocab size:", len(tokenizer))