In [None]:
# !pip install transformers datasets accelerate bitsandbytes

In [None]:
# !pip install -q transformers datasets accelerate evaluate

In [None]:
from datasets import load_dataset
# Offline alternative (kept for reference): load a local copy instead of the Hub.
# Reviving it also requires `from datasets import load_from_disk`, which is not imported here.
# dataset_path="MeDAL-dataset-small-5perc"
# dataset = load_from_disk(dataset_path)
# Load the dataset by its Hub repository id and print the loaded object for a quick look.
dataset = load_dataset("lutful2004/MeDAL-dataset-small-5perc")
print(dataset)


In [None]:
from transformers import AutoTokenizer

# Initialize the tokenizer from the same checkpoint the model is later loaded from.
model_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Register the abbreviation boundary markers used by preprocess_function as
# additional special tokens. This grows the vocabulary, which is why the model's
# embeddings are resized with len(tokenizer) after the model is loaded.
special_tokens = {"additional_special_tokens": ["[ABBR]", "[/ABBR]"]}
tokenizer.add_special_tokens(special_tokens)


def _mark_abbreviation(text, loc, loc_is_word_index=False):
    """Wrap the abbreviation located at ``loc`` in ``[ABBR] ... [/ABBR]`` markers.

    Args:
        text: Raw example text.
        loc: Position of the abbreviation. Interpreted as a character offset
            into ``text`` by default, or as an index into ``text.split()``
            when ``loc_is_word_index`` is true.
        loc_is_word_index: Selects how ``loc`` is interpreted.

    Returns:
        ``"{before} [ABBR] {abbr} [/ABBR] {rest}"``. When ``loc`` does not
        point at any word (e.g. it lies at or past the end of the text) the
        text is returned unchanged, because there is nothing to mark.
    """
    if loc_is_word_index:
        words = text.split()
        if not 0 <= loc < len(words):
            return text
        before = " ".join(words[:loc])
        abbr = words[loc]
        rest = " ".join(words[loc + 1:])
    else:
        before = text[:loc]
        # The abbreviation is the first whitespace-delimited word at/after `loc`.
        pieces = text[loc:].split(maxsplit=1)
        if not pieces:
            # Nothing but whitespace (or nothing at all) remains after `loc`.
            # Indexing pieces[0] here used to raise an opaque IndexError.
            return text
        abbr = pieces[0]
        rest = pieces[1] if len(pieces) > 1 else ""
    return f"{before} [ABBR] {abbr} [/ABBR] {rest}"


def preprocess_function(
    examples, *, tokenize_fn=None, max_length=128, loc_is_word_index=False
):
    """Mark the target abbreviation in each text and tokenize the batch.

    Intended for ``dataset.map(..., batched=True)``; extra keyword arguments
    can be supplied through ``fn_kwargs``.

    Args:
        examples: Batch mapping with parallel lists under the keys ``"TEXT"``,
            ``"LOCATION"`` and ``"LABEL"``.
        tokenize_fn: Callable used to tokenize the marked texts. Defaults to
            the notebook-level ``tokenizer``.
        max_length: Length every sequence is padded/truncated to.
        loc_is_word_index: Treat ``LOCATION`` as a word index rather than a
            character offset (see the review note below).

    Returns:
        The tokenizer output with the batch labels added under ``"labels"``,
        the key the Trainer expects.

    NOTE(review): the upstream MeDAL release appears to document LOCATION as
    the *word* index of the abbreviation, while this notebook has always sliced
    the text by character offset. The default keeps the character-offset
    behaviour; confirm which unit this dataset stores and, if it is a word
    index, pass ``fn_kwargs={"loc_is_word_index": True}`` to ``dataset.map``.

    NOTE(review): with ``truncation=True`` anything past ``max_length`` tokens
    is cut, so an abbreviation located late in a long text may lose its
    markers entirely — consider windowing the context around the markers.
    """
    if tokenize_fn is None:
        tokenize_fn = tokenizer  # notebook-level tokenizer with the [ABBR] markers registered

    # Step 1: mark the abbreviation in every text of the batch.
    marked_texts = [
        _mark_abbreviation(text, loc, loc_is_word_index)
        for text, loc in zip(examples["TEXT"], examples["LOCATION"])
    ]

    # Step 2: tokenize to plain Python lists (no tensors) at a fixed length.
    model_inputs = tokenize_fn(
        marked_texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Step 3: attach the labels.
    model_inputs["labels"] = examples["LABEL"]  # Note: Trainer expects "labels" key
    return model_inputs


# Run the preprocessing over the dataset in batches. The columns listed for the
# "train" split are removed, so only what preprocess_function returns
# (tokenizer outputs plus "labels") is kept.
tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=dataset["train"].column_names
)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Prefer the GPU when one is visible; fall back to CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The label column is a ClassLabel feature: it provides both the number of
# classes and their names. Storing the names in the model config keeps saved
# checkpoints self-describing instead of falling back to generic label names.
label_feature = dataset["train"].features["LABEL"]
num_labels = label_feature.num_classes
id2label = dict(enumerate(label_feature.names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


# Resize embeddings because the tokenizer gained the [ABBR] / [/ABBR] tokens
model.resize_token_embeddings(len(tokenizer))

# model.gradient_checkpointing_enable()  # save memory


In [None]:
# Report the selected device; expected to be "cuda" on a GPU runtime.
print(device)
# Rebind the result instead of leaving `model.to(device)` as the cell's last
# expression, which would echo the full model architecture into the output.
model = model.to(device)

In [None]:
# from datasets import Dataset

# # Take subset for quick Colab test
# train_subset = tokenized_dataset["train"].shuffle(seed=42).select(range(5000))
# valid_subset = tokenized_dataset["valid"].shuffle(seed=42).select(range(1000))

# print(train_subset)
# print(valid_subset)


In [None]:
from transformers import TrainingArguments

# Training configuration: evaluate every 500 optimizer steps, checkpoint every
# 2000 steps and keep at most the two most recent checkpoints.
# NOTE(review): output_dir points at a Google Drive path, but nothing visible in
# this notebook mounts Drive — confirm it is mounted before training, otherwise
# checkpoints will not land on Drive.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/medal-bert-checkpoints",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,     # small batch
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,     # effective batch size = 8 * 4 = 32 per device
    num_train_epochs=1,
    # Mixed precision only when a CUDA device is available: the notebook
    # explicitly supports a CPU fallback for `device`, and an unconditional
    # fp16=True is rejected on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=200,
    report_to="none",
    remove_unused_columns=False,
)


In [None]:
import evaluate
import numpy as np
# Load the four `evaluate` metrics once, in a fixed order, so compute_metrics
# can reuse the same metric objects on every evaluation pass.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(p):
    """Turn a Trainer evaluation result into a dict of scalar scores.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the arg-max is
            taken along axis 1) and ``label_ids`` (reference class ids).

    Returns:
        Dict with ``"accuracy"`` plus support-weighted ``"f1"``,
        ``"precision"`` and ``"recall"``, in that order.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    gold_ids = p.label_ids

    scores = {
        "accuracy": accuracy.compute(
            predictions=predicted_ids, references=gold_ids
        )["accuracy"]
    }
    # These three metrics share the same call shape and are averaged per class,
    # weighted by class support; each result is keyed by the metric's own name.
    for key, metric in (("f1", f1), ("precision", precision), ("recall", recall)):
        result = metric.compute(
            predictions=predicted_ids, references=gold_ids, average="weighted"
        )
        scores[key] = result[key]
    return scores

In [None]:
from transformers import Trainer, DataCollatorWithPadding
# from sklearn.metrics import accuracy_score

# Collator that pads each batch with the tokenizer. preprocess_function already
# pads every example to a fixed max_length, so little is left for it to pad.
data_collator = DataCollatorWithPadding(tokenizer)

# Simpler accuracy-only alternative (kept for reference; needs the sklearn import above):
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = logits.argmax(axis=-1)
#     return {"accuracy": accuracy_score(labels, predictions)}

# Wire model, arguments, data splits and metrics together. Validation uses the
# "valid" split; the "test" split is evaluated separately after training.
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Start fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Evaluate on the "test" split with the same compute_metrics and print the scores.
# NOTE(review): no metric_key_prefix is passed, so these keys presumably share the
# default prefix with the validation logs — consider metric_key_prefix="test".
metrics = trainer.evaluate(tokenized_dataset["test"])
print(metrics)


In [None]:
# import torch
# from transformers import (
#     AutoModelForSequenceClassification,
#     BitsAndBytesConfig,
#     TrainingArguments,
#     Trainer,
#     DataCollatorWithPadding,
#     TrainerCallback
# )
# from sklearn.metrics import accuracy_score

# # ===============================
# # 1. Model & Device Setup
# # ===============================
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

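# # Load model with auto device placement.
# # NOTE: quantization_config (defined above) is never passed to from_pretrained,
# # so no 4-bit quantization is actually applied by this call.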
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_checkpoint,
#     num_labels=num_labels,
#     id2label=id2label,
#     label2id=label2id,
#     device_map="auto"
# )

# # Enable gradient checkpointing to save memory
# model.gradient_checkpointing_enable()

# # Optional: split layers manually if needed (multi-GPU)
# # model.layer1.to('cuda:0')
# # model.layer2.to('cuda:1')

# # ===============================
# # 2. Training Arguments
# # ===============================
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="steps",
#     eval_steps=500,
#     save_strategy="steps",
#     save_steps=500,
#     save_total_limit=2,
#     learning_rate=2e-5,
#     per_device_train_batch_size=4,     # small batch
#     per_device_eval_batch_size=4,
#     gradient_accumulation_steps=4,     # effective batch size = 16
#     num_train_epochs=2,
#     fp16=True,                         # mixed precision
#     logging_dir="./logs",
#     logging_steps=100,
#     report_to="none",
#     dataloader_num_workers=0
# )

# # ===============================
# # 3. Data Collator & Metrics
# # ===============================
# data_collator = DataCollatorWithPadding(tokenizer)

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = logits.argmax(axis=-1)
#     return {"accuracy": accuracy_score(labels, predictions)}

# # ===============================
# # 4. Trainer Callback to Clear CUDA Cache
# # ===============================
# class ClearCudaCacheCallback(TrainerCallback):
#     def on_epoch_end(self, args, state, control, **kwargs):
#         torch.cuda.empty_cache()
#         return control

# # ===============================
# # 5. Optimized Trainer
# # ===============================
# class MemoryOptimizedTrainer(Trainer):
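#     # NOTE(review): transformers.Trainer does not appear to expose an `evaluation_step`
#     # hook (its per-batch evaluation hook is `prediction_step`), so this override would
#     # likely never be called — verify before reviving this cell.
#     def evaluation_step(self, model, inputs):
#         """Override evaluation to use torch.no_grad() to save memory."""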
#         model.eval()
#         with torch.no_grad():
#             outputs = model(**inputs)
#         return outputs

# trainer = MemoryOptimizedTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["valid"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

# trainer.add_callback(ClearCudaCacheCallback)

# # ===============================
# # 6. Start Training
# # ===============================
# trainer.train()
