In [1]:
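# Uncomment to install the notebook's dependencies (`%pip` installs into the running kernel's environment):
# %pip install -q torch transformers datasets accelerate bitsandbytes scikit-learn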

In [None]:
from datasets import load_dataset

# Load the dataset by its repository id and print the returned object so the
# available splits and columns can be inspected before tokenization.
DATASET_REPO = "lutful2004/MeDAL-dataset"
dataset = load_dataset(DATASET_REPO)
print(dataset)


In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; kept in a variable because the model-loading cell
# reuses it, so tokenizer and model always come from the same checkpoint.
model_checkpoint = "bert-base-uncased"
# Tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize the ``TEXT`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"TEXT"`` entry holding the raw text(s)
            to encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts
        with truncation enabled and padding to a fixed length of 128.
    """
    texts = examples["TEXT"]
    # Fixed-length encoding: longer inputs are cut, shorter ones are padded.
    encoding_options = {
        "max_length": 128,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer(texts, **encoding_options)

# Apply tokenize_function to the dataset in batches (batched=True hands the
# function a batch of examples per call instead of a single example).
# NOTE(review): tokenize_function only returns tokenizer outputs; nothing in
# this cell creates an integer `labels` column. Confirm the dataset already
# provides labels in the form the training cell expects.
tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
import torch
from transformers import AutoModelForSequenceClassification

# Preferred device: GPU when one is visible, otherwise CPU. The model is not
# moved here; it is handed to Trainer as loaded. The variable is kept for any
# later manual tensor placement.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size of the classification head.
# NOTE(review): must cover every label id present in the dataset; confirm
# 22250 against the data.
NUM_LABELS = 22250

# Load regular (non-quantized) weights. A base model loaded with
# `load_in_8bit=True` and no trainable adapters attached is rejected by
# `Trainer` ("You cannot perform fine-tuning on purely quantized models"),
# so the 8-bit load made the training cells fail. Memory is saved instead via
# gradient checkpointing below and the mixed-precision setting in the
# training arguments.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=NUM_LABELS,
)

# Recompute activations during the backward pass instead of storing them all:
# slower per step, but a noticeably smaller memory footprint.
model.gradient_checkpointing_enable()


In [None]:
from transformers import TrainingArguments

# Training configuration.
# - Evaluation and checkpointing both happen every 500 steps; only the two
#   most recent checkpoints are kept on disk.
# - 8 examples per device x 4 accumulation steps = 32 examples per optimizer
#   step on each device.
# - fp16 mixed precision is enabled only when CUDA is available, so the
#   notebook still runs on the CPU fallback chosen in the model cell instead
#   of failing on a machine without a GPU.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=8,     # small batch
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,     # effective batch size = 32 per device
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),    # mixed precision (GPU only)
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
)


In [None]:
from transformers import Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score

# Collator that pads each batch with the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer)


def preprocess_logits_for_metrics(logits, labels):
    """Reduce per-class logits to predicted class ids before they are cached.

    During evaluation the Trainer gathers the model outputs of every batch
    and only then calls ``compute_metrics``. With a 22,250-way classification
    head that means one float per class for every evaluation example, which
    can exhaust memory. Taking the argmax per batch keeps a single integer
    per example instead.

    Args:
        logits: Model output for one evaluation batch; either the logits
            tensor itself or a tuple whose first element is the logits.
        labels: Labels of the batch (unused, required by the Trainer hook
            signature).

    Returns:
        Tensor of predicted class ids, one per example in the batch.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_pred):
    """Compute accuracy from the predictions gathered by the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is
            normally the array of class ids produced by
            ``preprocess_logits_for_metrics``; raw logits with a trailing
            class axis are also accepted and reduced with argmax.

    Returns:
        Dict with a single ``"accuracy"`` entry.
    """
    predictions, labels = eval_pred
    # Still works if the function is used without the logits preprocessing hook.
    if predictions.ndim > 1:
        predictions = predictions.argmax(axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)


In [None]:
# Run fine-tuning with the configuration above. The call is left as the cell's
# last expression so its return value is shown as the cell output.
trainer.train()

In [None]:
# Evaluate the trained model on the "test" split and print the resulting metrics.
test_split = tokenized_dataset["test"]
metrics = trainer.evaluate(eval_dataset=test_split)
print(metrics)
