In [1]:
# This notebook fine-tunes a Hugging Face sequence-classification model (starting from an SST-2 sentiment checkpoint) on the Kaggle disaster tweets dataset.

#importing libraries
import re
import torch

from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch.nn import functional as F

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the pretrained checkpoint: the tokenizer and the sequence-classification
# model are both resolved from the same checkpoint name.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [7]:
#create custom dataset 

class disaster_tweets(Dataset):
    """Map-style dataset pairing pre-tokenized texts with their labels.

    All texts are tokenized once, in the constructor, with ``padding=True``
    and ``truncation=True``. ``__getitem__`` only converts the stored values
    for one row into tensors.

    Args:
        train_text_list: Iterable of raw text strings to tokenize.
        labels: Iterable of labels, one per text, in the same order.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, truncation=True)`` whose result
            exposes ``.items()`` yielding ``(field_name, per_example_values)``.

    Raises:
        ValueError: If the number of texts and the number of labels differ.
    """

    def __init__(self, train_text_list, labels, tokenizer):
        # Materialize both inputs as plain lists so that indexing in
        # __getitem__ is always positional (e.g. a pandas Series with a
        # shuffled index would otherwise be looked up by index label).
        texts = list(train_text_list)
        self.labels = list(labels)

        # Fail early: __len__ is driven by the labels while the rows come from
        # the encodings, so a mismatch would otherwise surface much later as
        # an IndexError or as silently dropped examples.
        if len(texts) != len(self.labels):
            raise ValueError(
                "texts and labels must have the same length, got "
                f"{len(texts)} texts and {len(self.labels)} labels"
            )

        self.encoding = tokenizer(texts, padding=True, truncation=True)

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``labels`` entry."""
        # return dictionary compatible with the huggingface transformers
        item = {key: torch.tensor(val[idx]) for key, val in self.encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [8]:
# Read the labelled tweets from data/tweetclassified/train.csv, strip URL-like
# tokens from the text, and hold out 20% of the rows for validation.
train_csv_path = 'data/tweetclassified/train.csv'
df = pd.read_csv(train_csv_path)

# NOTE(review): the scheme group in this pattern is optional, so it also
# removes bare "word.tld"-shaped tokens, not only http(s) links.
df['text'] = df['text'].replace(
    r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)',
    "",
    regex=True,
)
tweet, label = df['text'].tolist(), df['target'].tolist()

# Fixed random_state keeps the train/validation split reproducible.
train_text, val_text, train_label, val_label = train_test_split(
    tweet,
    label,
    test_size=0.2,
    random_state=42,
)

# Tokenize each split once and wrap it in the custom Dataset.
train_dataset, val_dataset = (
    disaster_tweets(texts, targets, tokenizer)
    for texts, targets in ((train_text, train_label), (val_text, val_label))
)

In [9]:
# Configure fine-tuning with Hugging Face's Trainer.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./diastertweetresult',
    logging_dir='./diastertweetresult/logs',
    logging_steps=20,
    # Optimisation schedule.
    num_train_epochs=4,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

# NOTE(review): no evaluation schedule is configured above, so val_dataset is
# presumably only used if trainer.evaluate() is called explicitly — confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [10]:
# Run the fine-tuning loop. As the last expression in the cell, the value
# returned by train() is displayed as the cell output.
trainer.train()

***** Running training *****
  Num examples = 6090
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1524


  0%|          | 0/1524 [00:00<?, ?it/s]

{'loss': 2.1087, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.05}
{'loss': 1.9917, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.1}
{'loss': 1.6015, 'learning_rate': 6e-06, 'epoch': 0.16}
{'loss': 1.2251, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.21}
{'loss': 0.8071, 'learning_rate': 1e-05, 'epoch': 0.26}
{'loss': 0.6759, 'learning_rate': 1.2e-05, 'epoch': 0.31}
{'loss': 0.5559, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.37}
{'loss': 0.5154, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.42}
{'loss': 0.4587, 'learning_rate': 1.8e-05, 'epoch': 0.47}
{'loss': 0.4828, 'learning_rate': 2e-05, 'epoch': 0.52}
{'loss': 0.3771, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.58}
{'loss': 0.4253, 'learning_rate': 2.4e-05, 'epoch': 0.63}
{'loss': 0.4662, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.68}
{'loss': 0.5023, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.73}
{'loss': 0.4406, 'learning_rate': 3e-05, 'epoch': 0.79}
{'loss': 0.4411, 'lea

Saving model checkpoint to ./diastertweetresult/checkpoint-500
Configuration saved in ./diastertweetresult/checkpoint-500/config.json


{'loss': 0.3781, 'learning_rate': 5e-05, 'epoch': 1.31}


Model weights saved in ./diastertweetresult/checkpoint-500/pytorch_model.bin


{'loss': 0.3536, 'learning_rate': 4.90234375e-05, 'epoch': 1.36}
{'loss': 0.3886, 'learning_rate': 4.8046875e-05, 'epoch': 1.42}
{'loss': 0.3769, 'learning_rate': 4.70703125e-05, 'epoch': 1.47}
{'loss': 0.3947, 'learning_rate': 4.609375e-05, 'epoch': 1.52}
{'loss': 0.3543, 'learning_rate': 4.5117187500000005e-05, 'epoch': 1.57}
{'loss': 0.3903, 'learning_rate': 4.4140625000000004e-05, 'epoch': 1.63}
{'loss': 0.4377, 'learning_rate': 4.31640625e-05, 'epoch': 1.68}
{'loss': 0.351, 'learning_rate': 4.21875e-05, 'epoch': 1.73}
{'loss': 0.3322, 'learning_rate': 4.12109375e-05, 'epoch': 1.78}
{'loss': 0.4187, 'learning_rate': 4.0234375e-05, 'epoch': 1.84}
{'loss': 0.3605, 'learning_rate': 3.92578125e-05, 'epoch': 1.89}
{'loss': 0.3131, 'learning_rate': 3.828125e-05, 'epoch': 1.94}
{'loss': 0.3695, 'learning_rate': 3.7304687500000005e-05, 'epoch': 1.99}
{'loss': 0.2839, 'learning_rate': 3.6328125000000004e-05, 'epoch': 2.05}
{'loss': 0.1773, 'learning_rate': 3.53515625e-05, 'epoch': 2.1}
{'lo

Saving model checkpoint to ./diastertweetresult/checkpoint-1000
Configuration saved in ./diastertweetresult/checkpoint-1000/config.json


{'loss': 0.1677, 'learning_rate': 2.55859375e-05, 'epoch': 2.62}


Model weights saved in ./diastertweetresult/checkpoint-1000/pytorch_model.bin


{'loss': 0.2328, 'learning_rate': 2.4609375e-05, 'epoch': 2.68}
{'loss': 0.2286, 'learning_rate': 2.3632812500000003e-05, 'epoch': 2.73}
{'loss': 0.2606, 'learning_rate': 2.2656250000000002e-05, 'epoch': 2.78}
{'loss': 0.284, 'learning_rate': 2.16796875e-05, 'epoch': 2.83}
{'loss': 0.2436, 'learning_rate': 2.0703125e-05, 'epoch': 2.89}
{'loss': 0.1426, 'learning_rate': 1.9726562500000003e-05, 'epoch': 2.94}
{'loss': 0.1975, 'learning_rate': 1.8750000000000002e-05, 'epoch': 2.99}
{'loss': 0.116, 'learning_rate': 1.77734375e-05, 'epoch': 3.04}
{'loss': 0.0736, 'learning_rate': 1.6796875e-05, 'epoch': 3.1}
{'loss': 0.0992, 'learning_rate': 1.58203125e-05, 'epoch': 3.15}
{'loss': 0.0949, 'learning_rate': 1.484375e-05, 'epoch': 3.2}
{'loss': 0.1145, 'learning_rate': 1.38671875e-05, 'epoch': 3.25}
{'loss': 0.1604, 'learning_rate': 1.2890625e-05, 'epoch': 3.31}
{'loss': 0.0764, 'learning_rate': 1.19140625e-05, 'epoch': 3.36}
{'loss': 0.0909, 'learning_rate': 1.09375e-05, 'epoch': 3.41}
{'loss

Saving model checkpoint to ./diastertweetresult/checkpoint-1500
Configuration saved in ./diastertweetresult/checkpoint-1500/config.json


{'loss': 0.0876, 'learning_rate': 1.1718750000000001e-06, 'epoch': 3.94}


Model weights saved in ./diastertweetresult/checkpoint-1500/pytorch_model.bin


{'loss': 0.1126, 'learning_rate': 1.953125e-07, 'epoch': 3.99}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 148.952, 'train_samples_per_second': 163.543, 'train_steps_per_second': 10.231, 'train_loss': 0.3624864375888519, 'epoch': 4.0}


TrainOutput(global_step=1524, training_loss=0.3624864375888519, metrics={'train_runtime': 148.952, 'train_samples_per_second': 163.543, 'train_steps_per_second': 10.231, 'train_loss': 0.3624864375888519, 'epoch': 4.0})

In [13]:
#setting up device to run on cuda
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Put the model on the same device as the inputs and switch it to inference
# mode. torch.no_grad() only disables gradient tracking; it does not turn off
# dropout, so without eval() a model left in training mode gives
# non-deterministic predictions.
model.to(device)
model.eval()

#load some test data and run prediction
test_csv_path = 'data/tweetclassified/test.csv'
df = pd.read_csv(test_csv_path)
# Same URL-stripping pattern as applied to the training text.
df['text'] = df['text'].replace(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)',"",regex=True)
tweet = df['text'].tolist()

# Tokenize only the first 10 test tweets and move the tensors to the device.
batch = tokenizer(tweet[:10], padding=True, truncation=True, return_tensors='pt')
batch = batch.to(device)

with torch.no_grad():
    outputs = model(**batch)
    # Softmax over the class dimension, then take the most probable class id.
    predictions = F.softmax(outputs.logits, dim=1)
    labels = torch.argmax(predictions, dim=1)
    print(labels)

tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0], device='cuda:0')
