In [1]:
import os

import pandas as pd

from datasets import Dataset, DatasetDict

import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer, set_seed, DataCollatorWithPadding

import evaluate

In [2]:
data_path = os.path.join("..", "data", "nlp-getting-started")
train_df = pd.read_csv(os.path.join(data_path, "train.csv"))
submission_df = pd.read_csv(os.path.join(data_path, "test.csv"))

train_df = train_df.drop(columns=['id', 'keyword', 'location'])
train_df = train_df.rename(columns={"target": "labels"})

dataset = Dataset.from_pandas(train_df)
dataset_train_test_eval = dataset.train_test_split(train_size=0.80)
dataset_test_eval = dataset_train_test_eval['test'].train_test_split(train_size=0.50)
dataset = DatasetDict({
    'train' : dataset_train_test_eval['train'],
    'test' : dataset_test_eval['train'],
    'eval' : dataset_test_eval['test'],
})

print("Training Dataset Shape:", dataset['train'].shape)
print("Testing Dataset Shape:", dataset['test'].shape)
print("Evaluation Dataset Shape:", dataset['eval'].shape)

Training Dataset Shape: (6090, 2)
Testing Dataset Shape: (761, 2)
Evaluation Dataset Shape: (762, 2)


In [3]:
set_seed(42)

epochs = 5

num_labels = len(set(dataset["train"]["labels"]))

batch_size = 64
learning_rate = 2e-5

model_ckpt = "distilbert-base-uncased"
model_name = model_ckpt + "_" + "disaster_tweets"
results_path = os.path.join("..", "results", "distater_tweets")

metric = "f1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

tokenized_datasets = dataset.map(tokenize, batched=True, batch_size=batch_size)



Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/761 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
training_args = TrainingArguments(
    output_dir=results_path,
    logging_strategy='epoch',
    evaluation_strategy="epoch",
    save_strategy="epoch",         
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,                   
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    disable_tqdm=False
)

trainer = Trainer(
    model=model,                       
    args=training_args,                  
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)



In [7]:
train_results = trainer.train()

trainer.evaluate()

trainer.save_model(os.path.join(results_path, model_name))

  0%|          | 0/288 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 17.27 GB, other allocations: 866.67 MB, max allowed: 18.13 GB). Tried to allocate 96.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [11]:
model

Linear(in_features=768, out_features=2, bias=True)