In [1]:
# !pip install transformers datasets accelerate bitsandbytes

In [None]:
from datasets import load_dataset

# Load the MeDAL dataset by its repository id; later cells index the result by split name.
dataset = load_dataset("lutful2004/MeDAL-dataset")
# Printing the loaded object shows what it contains before any preprocessing.
print(dataset)


In [None]:
from datasets import ClassLabel

# Extract unique labels.
# The vocabulary is collected from every split, not only "train", so encoding
# a validation/test row can never hit a label that is missing from the mapping.
# `Dataset.unique` returns the distinct values of a column without first
# materialising the whole column as a Python list.
unique_labels = sorted(
    {label for split in dataset.values() for label in split.unique("LABEL")}
)
num_labels = len(unique_labels)
print("Number of labels:", num_labels)

# Build mapping (label string <-> contiguous integer id, in sorted label order)
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print("Example mapping:", list(label2id.items())[:5])


In [None]:
from transformers import AutoTokenizer

# Initialize tokenizer
# The checkpoint name is stored once because the model-loading cells reuse it.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# `label2id` must already be defined before `preprocess_function` runs
# (it is built in the label-mapping cell above), e.g.:
# label2id = {label: idx for idx, label in enumerate(unique_labels)}

def _mark_abbreviation(text, loc, left_context_words):
    """Wrap the word at index ``loc`` in [ABBR] markers, trimming distant left context."""
    words = text.split()
    if not 0 <= loc < len(words):
        raise IndexError(
            f"LOCATION {loc} is out of range for a text with {len(words)} words"
        )
    # Drop words far to the left so the marked abbreviation cannot be pushed
    # past the tokenizer's truncation point; the right side is left to the
    # tokenizer's own truncation.
    start = max(0, loc - left_context_words)
    marked = words[start:loc] + ["[ABBR]", words[loc], "[/ABBR]"] + words[loc + 1:]
    return " ".join(marked)


def preprocess_function(examples, max_length=128, left_context_words=40):
    """
    Full preprocessing pipeline for a batch of rows:
    1. Mark abbreviation using LOCATION
    2. Encode LABEL as numeric id
    3. Tokenize marked text with BERT tokenizer

    LOCATION is interpreted as the index of the abbreviation among the
    whitespace-separated words of TEXT (this is how the MeDAL dataset card
    describes the field -- verify against a sample row if the data source
    changes), not as a character offset.

    Args:
        examples: batch dict with parallel "TEXT", "LOCATION" and "LABEL" lists.
        max_length: sequence length every example is truncated/padded to.
        left_context_words: maximum number of words kept before the
            abbreviation. This is a heuristic budget so the abbreviation
            stays inside the first ``max_length`` tokens of long texts.

    Returns:
        The tokenizer output with an extra "label" entry holding the ids
        looked up in the module-level ``label2id``.

    Raises:
        IndexError: if a LOCATION does not point at a word of its TEXT.
        KeyError: if a LABEL is missing from ``label2id``.
    """
    # Step 1: Mark abbreviation
    marked_texts = [
        _mark_abbreviation(text, loc, left_context_words)
        for text, loc in zip(examples["TEXT"], examples["LOCATION"])
    ]

    # Step 2: Encode labels
    labels = [label2id[label] for label in examples["LABEL"]]

    # Step 3: Tokenize
    model_inputs = tokenizer(
        marked_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    model_inputs["label"] = labels
    return model_inputs

# Run the preprocessing over every split in batches. The raw columns (taken
# from the train split) are dropped so only the tokenizer outputs and "label" remain.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

# Expose only the model inputs as PyTorch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
tokenized_dataset.set_format(type="torch", columns=torch_columns)


In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Device the notebook would use for manual tensor placement; the Trainer
# moves the model to its own training device, so this is informational only.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the classifier in regular precision.
# - No bitsandbytes 8-bit loading: Trainer refuses to fine-tune a purely
#   quantized model that has no trainable adapters attached.
# - No device_map="auto": the Trainer handles device placement itself, and
#   automatic sharding is meant for large-model inference.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

model.gradient_checkpointing_enable()  # save memory


In [None]:
from transformers import TrainingArguments

# Training configuration: evaluate and checkpoint every 500 steps, keeping
# only the two most recent checkpoints.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,     # small batch
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,     # effective batch size = 32
    num_train_epochs=3,
    # Mixed precision only when a CUDA device exists: the notebook allows a
    # CPU fallback, and fp16 training is rejected without a GPU.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
)


In [None]:
from transformers import Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score

# Batch collator built from the tokenizer. Preprocessing already pads every
# example to max_length, so inputs reach the collator with a uniform length.
data_collator = DataCollatorWithPadding(tokenizer)

def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for the arg-max class of each row of logits.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    index of the largest logit along the last axis.
    """
    scores, gold = eval_pred
    predicted_ids = scores.argmax(axis=-1)
    accuracy = accuracy_score(gold, predicted_ids)
    return {"accuracy": accuracy}

# Wire everything together: model and hyper-parameters, the train/validation
# splits, and the batching and metric helpers defined above.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
)


In [None]:
# Run the fine-tuning loop configured by the Trainer built in the previous cell.
trainer.train()

In [None]:
# Final evaluation on the held-out test split; the metrics dict is printed as returned.
test_split = tokenized_dataset["test"]
metrics = trainer.evaluate(test_split)
print(metrics)


In [None]:
import torch
from transformers import (
    AutoModelForSequenceClassification, 
    BitsAndBytesConfig, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding, 
    TrainerCallback
)
from sklearn.metrics import accuracy_score

# ===============================
# 1. Model & Device Setup
# ===============================
# NOTE: this 4-bit config is deliberately NOT passed to `from_pretrained`.
# Trainer refuses to fine-tune a purely quantized model without trainable
# adapters, so the model below is loaded in regular precision. The object is
# kept only so existing references to `quantization_config` keep working.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Informational only: the Trainer moves the model to its own training device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the (unquantized) classifier. No device_map="auto": the Trainer handles
# device placement itself, and automatic sharding is meant for large-model inference.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Optional: split layers manually if needed (multi-GPU)
# model.layer1.to('cuda:0')
# model.layer2.to('cuda:1')

# ===============================
# 2. Training Arguments
# ===============================
# Same schedule as the earlier configuration but with smaller batches and one
# epoch fewer; data loading stays in the main process (0 workers).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=4,     # small batch
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,     # effective batch size = 16
    num_train_epochs=2,
    # Mixed precision only when a CUDA device exists: the notebook allows a
    # CPU fallback, and fp16 training is rejected without a GPU.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    dataloader_num_workers=0
)

# ===============================
# 3. Data Collator & Metrics
# ===============================
# Batch collator built from the tokenizer. Preprocessing already pads every
# example to max_length, so inputs reach the collator with a uniform length.
data_collator = DataCollatorWithPadding(tokenizer)

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair and report it under the "accuracy" key.

    The predicted class of each example is the position of its highest logit
    on the last axis; accuracy is computed against the reference labels.
    """
    logit_matrix, reference = eval_pred
    best_class = logit_matrix.argmax(axis=-1)
    score = accuracy_score(reference, best_class)
    return {"accuracy": score}

# ===============================
# 4. Trainer Callback to Clear CUDA Cache
# ===============================
class ClearCudaCacheCallback(TrainerCallback):
    """Trainer callback that asks PyTorch to release its cached CUDA memory after each epoch."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Empty the CUDA cache at the end of an epoch and hand ``control`` back unchanged."""
        torch.cuda.empty_cache()
        return control

# ===============================
# 5. Optimized Trainer
# ===============================
class MemoryOptimizedTrainer(Trainer):
    """Trainer subclass that runs evaluation/prediction steps without autograd.

    The Trainer's evaluation loop calls ``prediction_step``; it never calls a
    method named ``evaluation_step``. The no-grad guarantee is therefore
    attached to ``prediction_step`` here, and ``evaluation_step`` is kept only
    as a standalone helper for backward compatibility.
    """

    def prediction_step(self, *args, **kwargs):
        """Delegate to the stock prediction step with gradient tracking disabled.

        Arguments are forwarded untouched so the override follows whatever
        signature the installed transformers version uses.
        """
        with torch.no_grad():
            return super().prediction_step(*args, **kwargs)

    def evaluation_step(self, model, inputs):
        """Forward ``inputs`` through ``model`` in eval mode without gradients.

        Not invoked by the Trainer itself; call it directly if a raw forward
        pass is needed. Returns the model's output object.
        """
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs

# Build the trainer from the pieces defined above: model and hyper-parameters,
# the train/validation splits, and the batching and metric helpers.
trainer = MemoryOptimizedTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
)

# Register the cache-clearing callback (the class itself is passed, as before).
trainer.add_callback(ClearCudaCacheCallback)

# ===============================
# 6. Start Training
# ===============================
# Runs training with the MemoryOptimizedTrainer built above; the registered
# ClearCudaCacheCallback fires at the end of every epoch.
trainer.train()
