In [40]:
!pip install transformers datasets peft accelerate torch scikit-learn



In [41]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model

In [42]:
# Load the "sms_spam" dataset. Per the summary printed below it exposes a single
# "train" split whose records carry an "sms" text field and a "label" field.
dataset = load_dataset("sms_spam")

# Sanity check: show the split layout, then the first raw record.
print(dataset, dataset["train"][0], sep="\n")

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 5574
    })
})
{'sms': 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'label': 0}


In [43]:
# Checkpoint identifier reused for both the tokenizer (here) and the classifier
# (loaded later), so vocabulary and weights come from the same checkpoint.
MODEL_NAME = "bert-base-uncased"

# Tokenizer matching MODEL_NAME; used for dataset preprocessing and for inference.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [44]:
def tokenize(batch):
    """Encode a batch of SMS texts with the module-level ``tokenizer``.

    Args:
        batch: Mapping whose ``"sms"`` entry holds the message texts to encode.

    Returns:
        The tokenizer's output for those texts, requested with truncation and
        ``"max_length"`` padding at a fixed length of 128.
    """
    messages = batch["sms"]
    # Fixed-length padding gives every example the same width, so no padding
    # step is needed later when examples are batched.
    encoding = tokenizer(
        messages, max_length=128, truncation=True, padding="max_length"
    )
    return encoding

# Tokenize the whole dataset in batches. The raw "sms" column is dropped once it
# has been encoded, and "label" is renamed to "labels" (the key that the custom
# compute_loss in this notebook reads from the model inputs).
tokenized = dataset.map(tokenize, batched=True, remove_columns=["sms"]).rename_column(
    "label", "labels"
)

# Serve the listed columns as PyTorch tensors when examples are accessed.
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [45]:
# Hold out 20% of the tokenized "train" split for evaluation; the fixed seed
# keeps the partition identical from run to run.
split_dataset = tokenized["train"].train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

# Print both subsets to confirm their columns and row counts.
print(train_dataset, eval_dataset, sep="\n")

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4459
})
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1115
})


In [46]:
# Load the MODEL_NAME checkpoint with a two-class sequence-classification head.
# The classifier weights are not part of the checkpoint and start out randomly
# initialized (see the warning printed below), so the model must be fine-tuned
# before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
import torch
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    Errors on label id 1 are weighted four times as heavily as errors on label
    id 0 (presumably to offset the smaller number of spam examples; confirm
    against the label distribution).
    """

    # Per-class loss weights, indexed by label id. Override on a subclass or
    # instance to experiment with a different balance.
    class_weights = (1.0, 4.0)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict passed to the model; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass this keyword to ``compute_loss``; it is not
                used by this loss.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Create the weights directly on the logits' device and in their dtype so
        # the loss works unchanged on CPU/GPU and with reduced-precision models.
        weights = torch.tensor(
            self.class_weights, device=logits.device, dtype=logits.dtype
        )
        loss_fn = torch.nn.CrossEntropyLoss(weight=weights)

        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [48]:
# LoRA adapter settings for the sequence-classification task: adapters with
# r=8 and lora_alpha=16 are attached to the modules named "query" and "value",
# with 10% adapter dropout and no bias parameters trained.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["query", "value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [49]:
# Wrap the classifier with the adapters described by lora_config, then report
# how many parameters remain trainable.
# NOTE(review): this rebinds `model` to its wrapped version, so re-running this
# cell without first re-running the model-loading cell wraps the model again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700


In [50]:
# Training run configuration; checkpoints and outputs go under "bert_sms_lora".
training_args = TrainingArguments(
    output_dir="bert_sms_lora",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-4,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; log every 50 steps; no external
    # experiment tracker.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    report_to="none",
)

In [51]:
# Assemble the training loop from the LoRA-wrapped model, the run settings and
# the train/eval subsets.
# NOTE(review): this instantiates the stock Trainer, so the WeightedTrainer
# subclass defined earlier in this notebook is never used and its class-weighted
# loss is not applied. Confirm which of the two is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

In [52]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell's result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0581,0.056747
2,0.0316,0.049955
3,0.0294,0.046098
4,0.0258,0.04958
5,0.0139,0.048279


TrainOutput(global_step=1395, training_loss=0.04300737045571795, metrics={'train_runtime': 388.1832, 'train_samples_per_second': 57.434, 'train_steps_per_second': 3.594, 'total_flos': 1471591227724800.0, 'train_loss': 0.04300737045571795, 'epoch': 5.0})

In [53]:
# Directory the Trainer itself wrote at step 837 (end of epoch 3, the epoch with
# the lowest validation loss in the recorded run). NOTE: the step number is
# specific to that run; adjust it if the data size or batch size changes.
CHECKPOINT_DIR = "bert_sms_lora/checkpoint-837"
# Separate home for the model as it stands after the last epoch.
FINAL_MODEL_DIR = "bert_sms_lora/final"

# Save the final (step-1395) model into its own directory. Saving it into
# CHECKPOINT_DIR would overwrite the step-837 weights stored there and leave a
# checkpoint whose weights no longer match its step.
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

# Still add the tokenizer files to the step-837 checkpoint so it can be loaded
# on its own; this only adds files and leaves the saved weights untouched.
tokenizer.save_pretrained(CHECKPOINT_DIR)

('bert_sms_lora/checkpoint-837/tokenizer_config.json',
 'bert_sms_lora/checkpoint-837/special_tokens_map.json',
 'bert_sms_lora/checkpoint-837/vocab.txt',
 'bert_sms_lora/checkpoint-837/added_tokens.json',
 'bert_sms_lora/checkpoint-837/tokenizer.json')

In [54]:
# Quick qualitative check of the fine-tuned model on two hand-written messages.

# Use the GPU when one is available and fall back to CPU otherwise, so the cell
# also runs on machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Read the device back from the model so the inputs are placed alongside it.
device = next(model.parameters()).device

texts = [
    "Congratulations you won a free prize",
    "Can we meet tomorrow evening"
]

# Pad only to the longest message in this small batch and return PyTorch tensors.
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

inputs = {k: v.to(device) for k, v in inputs.items()}

# Switch to evaluation mode (disables dropout) before predicting.
model.eval()

# Gradients are not needed for inference.
with torch.no_grad():
    outputs = model(**inputs)

# Pick the highest-scoring class for each message.
preds = outputs.logits.argmax(dim=1)

# Mapping from label id to a readable class name.
label_map = {0: "ham", 1: "spam"}

for text, pred in zip(texts, preds):
    print(text)
    print("Prediction", label_map[pred.item()])
    print()

Congratulations you won a free prize
Prediction spam

Can we meet tomorrow evening
Prediction ham

