In [None]:
import os

import pandas as pd

from datasets import Dataset, DatasetDict

import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer, set_seed, DataCollatorWithPadding

import evaluate

In [None]:
# Read the competition CSVs from the sibling data directory.
# `submission_df` (test.csv) is loaded here but not referenced in this cell.
data_path = os.path.join("..", "data", "nlp-getting-started")
train_df = (
    pd.read_csv(os.path.join(data_path, "train.csv"))
    # Only the tweet text and its target are kept for modelling.
    .drop(columns=["id", "keyword", "location"])
    .rename(columns={"target": "labels"})
)
submission_df = pd.read_csv(os.path.join(data_path, "test.csv"))

# The splits are seeded explicitly: `set_seed` is only called in a later cell,
# so without a seed here the shuffle would differ on every fresh kernel run and
# the train/test/eval membership (and every reported score) would not be
# reproducible.
SPLIT_SEED = 42

full_dataset = Dataset.from_pandas(train_df)

# 80% train, then the remaining 20% is halved into "test" and "eval" (10% each).
dataset_train_test_eval = full_dataset.train_test_split(
    train_size=0.80, seed=SPLIT_SEED
)
dataset_test_eval = dataset_train_test_eval["test"].train_test_split(
    train_size=0.50, seed=SPLIT_SEED
)
dataset = DatasetDict({
    "train": dataset_train_test_eval["train"],
    "test": dataset_test_eval["train"],
    "eval": dataset_test_eval["test"],
})

print("Training Dataset Shape:", dataset['train'].shape)
print("Testing Dataset Shape:", dataset['test'].shape)
print("Evaluation Dataset Shape:", dataset['eval'].shape)

In [None]:
# Seed the RNGs before anything stochastic in the following cells runs.
set_seed(42)

# --- Optimisation hyper-parameters -----------------------------------------
# NOTE(review): confirm that the TrainingArguments cell actually consumes
# `epochs` and `learning_rate` rather than its own literals.
learning_rate = 2e-5
batch_size = 64
epochs = 5

# --- Task description -------------------------------------------------------
# Number of classes = number of distinct label values in the training split.
distinct_labels = set(dataset["train"]["labels"])
num_labels = len(distinct_labels)
metric = "f1"

# --- Checkpoint and output locations ----------------------------------------
model_ckpt = "distilbert-base-uncased"
model_name = "_".join([model_ckpt, "disaster_tweets"])
# NOTE(review): "distater_tweets" looks like a typo of "disaster_tweets"; it is
# kept unchanged so existing outputs on disk are not orphaned.
results_path = os.path.join("..", "results", "distater_tweets")

# --- Hardware ---------------------------------------------------------------
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize the "text" column of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping of column name to a list of values; only ``batch["text"]``
            is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated and
        padded to the tokenizer's maximum length so all rows have equal length.
    """
    # No `return_tensors` here: `map` stores the result as plain dataset
    # columns, so building torch tensors first would only add a conversion that
    # is immediately undone. Tensor creation is left to the batch collation step.
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
    )


# Applies to every split of the DatasetDict and adds the tokenizer's output
# columns next to the existing "text"/"labels" columns.
tokenized_datasets = dataset.map(tokenize, batched=True, batch_size=batch_size)

In [None]:
# Load the sequence-classification model for `model_ckpt`, configured with
# `num_labels` output classes, then place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

In [None]:
# Metric object for the name chosen in the config cell (`metric`).
metric_fn = evaluate.load(metric)


def compute_metrics(eval_pred):
    """Score evaluation predictions with the configured metric.

    Args:
        eval_pred: The ``(logits, labels)`` pair the Trainer passes after an
            evaluation loop.

    Returns:
        Dict mapping the metric name to its value, as returned by
        ``metric_fn.compute``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit for each example.
    predictions = logits.argmax(axis=-1)
    # NOTE(review): F1 with default arguments assumes binary labels; pass an
    # `average=` argument here if `num_labels` ever exceeds 2.
    return metric_fn.compute(predictions=predictions, references=labels)


# All tunables come from the config cell so that editing `epochs`,
# `learning_rate`, `batch_size` or `metric` there actually changes the run
# (previously the learning rate and epoch count were hard-coded here and the
# config values were silently ignored).
training_args = TrainingArguments(
    output_dir=results_path,
    logging_strategy="epoch",
    # NOTE(review): newer transformers releases rename this to `eval_strategy`;
    # kept as-is to match the version this notebook was written against.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    # Pick the best checkpoint by the task metric instead of the default loss.
    metric_for_best_model=metric,
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # The "test" split drives per-epoch evaluation and best-checkpoint selection.
    eval_dataset=tokenized_datasets["test"],
    # Pads each batch to a common length; a no-op for rows that are already
    # padded to equal length, and required if tokenization stops padding.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the returned training summary is kept for inspection.
train_results = trainer.train()

# Final score on the "eval" split. The Trainer's own `eval_dataset` is the
# "test" split, which is used every epoch to select the best checkpoint, so
# re-scoring on it would give an optimistically biased number. The "eval" split
# is not used anywhere during training.
holdout_metrics = trainer.evaluate(
    eval_dataset=tokenized_datasets["eval"],
    metric_key_prefix="holdout",
)
# Printed explicitly: only a cell's last expression is displayed automatically,
# and this cell ends with the save call.
print(holdout_metrics)

# Persist the model currently held by the trainer under <results_path>/<model_name>.
trainer.save_model(os.path.join(results_path, model_name))

In [None]:
# Bare expression as the last line of the cell: the notebook displays the
# model's repr as the cell output.
model