In [None]:
import os

import kagglehub

import numpy as np

import pandas as pd

import torch
import torch.nn as nn

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)

from datasets import Dataset

import evaluate

from peft import LoraConfig, TaskType, get_peft_model

In [None]:
# Checkpoint to fine-tune. The commented lines are alternative candidates.
# NOTE(review): `target_modules` below must name modules that exist in the
# selected checkpoint - verify the names before switching checkpoints.
model_ckpt = "microsoft/deberta-v3-base"
# model_ckpt = "distilbert-base-uncased"
# model_ckpt = "microsoft/deberta-v3-large"
# model_ckpt = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# model_ckpt = "vinai/bertweet-base"

# Root directory for checkpoints, logs, the saved adapter and the submission
# file. The two path components are passed as separate arguments: adjacent
# string literals without a comma are concatenated into a single name
# ("resultsdisaster-tweets") instead of a nested directory.
results_path = os.path.join("results", "disaster-tweets")

# Training parameters (forwarded to TrainingArguments / the tokenizer).
epochs = 20
batch_size = 128  # used per device for train/eval and as the map() batch size
learning_rate = 5e-5
weight_decay = 1e-4
warmup_ratio = 0.0
max_length = 64  # token length every tweet is padded/truncated to

# LoRA parameters (forwarded unchanged to peft.LoraConfig).
r = 8
lora_alpha = 32
target_modules = ["query_proj", "key_proj", "value_proj"]
lora_dropout = 0.3
bias = "none"

In [None]:
# Fetch the competition files; `data_path` is the directory joined with the
# CSV file names below.
data_path = kagglehub.competition_download("nlp-getting-started")

# Training data: drop the columns that are not fed to the model and rename
# "target" to "labels" (the column name read back as `train_dataset["labels"]`).
train_df = (
    pd.read_csv(os.path.join(data_path, "train.csv"))
    .drop(columns=["id", "keyword", "location"])
    .rename(columns={"target": "labels"})
)

# 90/10 train/validation split. The seed is fixed so that Restart & Run All
# reproduces the same split (and therefore comparable metrics).
train_dataset_train_eval = Dataset.from_pandas(train_df).train_test_split(
    train_size=0.9, seed=42
)
train_dataset = train_dataset_train_eval["train"]
eval_dataset = train_dataset_train_eval["test"]

test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
# Explicit copy: a prediction column is added to this frame later, and
# assigning into a slice of `test_df` raises SettingWithCopyWarning.
submission_df = test_df[["id"]].copy()
test_df = test_df.drop(columns=["id", "keyword", "location"])
# rename() ignores missing keys, so this is a no-op if the test file carries
# no "target" column - presumably the case for a competition test set.
test_df = test_df.rename(columns={"target": "labels"})
test_dataset = Dataset.from_pandas(test_df)

print("Train dataset shape:", train_dataset.shape)
print("Eval dataset shape:", eval_dataset.shape)

tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt, use_fast=False, clean_up_tokenization_spaces=True
)


def tokenize(batch):
    """Tokenize the "text" column of a batch to exactly `max_length` tokens.

    Every example is padded or truncated to `max_length`. No `return_tensors`
    is requested: `Dataset.map` stores the result as plain Arrow columns, so
    building framework tensors here would only be converted straight back.
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


tokenized_train_dataset = train_dataset.map(
    tokenize, batched=True, batch_size=batch_size
)
tokenized_eval_dataset = eval_dataset.map(tokenize, batched=True, batch_size=batch_size)

tokenized_test_dataset = test_dataset.map(tokenize, batched=True, batch_size=batch_size)

In [None]:
# Directory name for the saved adapter: last path segment of the checkpoint id
# plus a task suffix.
model_name = f"{model_ckpt.rsplit('/', 1)[-1]}_disaster_tweets"

# Pick the first available backend in the order CUDA -> MPS -> CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the four metrics once up front so compute_metrics can reuse them on
# every evaluation pass.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "precision", "recall", "f1")
)


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are computed with ``average="binary"``.
    """
    logits, labels = eval_pred
    scored = {
        "predictions": np.argmax(logits, axis=-1),
        "references": labels,
    }

    # Accuracy takes no averaging mode; the other three share the same call.
    results = {"accuracy": accuracy_metric.compute(**scored)["accuracy"]}
    binary_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for key, metric in binary_metrics:
        results[key] = metric.compute(**scored, average="binary")[key]

    return results

In [None]:
# LoRA adapter settings for sequence classification; every value comes from
# the configuration cell.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    bias=bias,
)

# Load the classifier with one output per distinct label value found in the
# training split, then wrap it with the LoRA adapters.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(
        model_ckpt, num_labels=len(set(train_dataset["labels"]))
    ),
    lora_config,
)

# Disabled: manual freezing of the base model's parameters.
# for param in model.base_model.parameters():
#     param.requires_grad = False

In [None]:
training_args = TrainingArguments(
    output_dir=results_path,
    logging_dir=os.path.join(results_path, "logs"),
    # Log, evaluate and checkpoint on the same per-epoch schedule so every
    # evaluation has a matching checkpoint to restore from.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Checkpoint selection: the highest validation F1 wins. The direction is
    # stated explicitly rather than left to the library default.
    metric_for_best_model="f1",
    greater_is_better=True,
    load_best_model_at_end=True,
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Keep and show the final validation metrics. A bare `trainer.evaluate()` in
# the middle of a cell is neither displayed nor stored, so its result is lost.
# With load_best_model_at_end=True the Trainer is documented to restore the
# best checkpoint after training, so these should be that checkpoint's scores.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Persist the trained model under <results_path>/<model_name>.
trainer.model.save_pretrained(os.path.join(results_path, model_name))

In [None]:
# Run the trained model on the tokenized test set and take the argmax over the
# last axis of the returned predictions as the predicted class.
model_prediction = trainer.predict(tokenized_test_dataset)
predictions = np.argmax(model_prediction.predictions, axis=-1)

# assign() returns a new frame instead of writing into `submission_df` in
# place; in-place assignment on a frame sliced from `test_df` raises
# SettingWithCopyWarning. Re-running this cell is also idempotent.
submission_df = submission_df.assign(target=predictions)

# Make sure the output directory exists even if this cell is run without the
# training cell having created it.
os.makedirs(results_path, exist_ok=True)
submission_df.to_csv(os.path.join(results_path, "submission.csv"), index=False)

submission_df