### RoBERTa (PyTorch) for Disaster Tweets Classification

##### Data Source: https://www.kaggle.com/competitions/nlp-getting-started/data

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset
from torch import nn
import torch
from torch.optim import Adam
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

import numpy as np
import re
import nltk
import string

### Read data to pandas

In [18]:
# Directory holding the competition CSV files; the trailing separator is
# required because file names are appended with plain string concatenation.
BASE = 'D:\\ResearchDataGtx1060\\DisasterTweets\\'

# Read the training and test CSV files into DataFrames.
train_df, test_df = (pd.read_csv(BASE + file_name)
                     for file_name in ("train.csv", "test.csv"))

# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split reproducible.
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.2)

In [4]:
# train_df

In [21]:
# Drop the "keyword" and "location" columns from all three frames; the
# dataset class defined below only reads `text` and (when present) `target`.
train_df, val_df, test_df = (
    frame.drop(columns=["keyword", "location"])
    for frame in (train_df, val_df, test_df)
)

# Display the remaining training frame as the cell output.
train_df

Unnamed: 0,id,text,target
4996,7128,Courageous and honest analysis of need to use ...,1
3263,4688,@ZachZaidman @670TheScore wld b a shame if tha...,0
4907,6984,Tell @BarackObama to rescind medals of 'honor'...,1
2855,4103,Worried about how the CA drought might affect ...,1
4716,6706,@YoungHeroesID Lava Blast &amp; Power Red #Pan...,0
...,...,...,...
5226,7470,@Eganator2000 There aren't many Obliteration s...,0
5390,7691,just had a panic attack bc I don't have enough...,0
860,1242,Omron HEM-712C Automatic Blood Pressure Monito...,0
7603,10862,Officials say a quarantine is in place at an A...,1


### Data Loading and Preprocessing

We remove mentions, links, hash signs (`#`), punctuation, stop words, and other content that we deem unnecessary for the model to make correct predictions.

In [8]:
class TweetDataset(Dataset):
    """Dataset that cleans tweet texts and tokenizes them once, up front.

    Each item is a ``(encoding, label)`` pair. ``encoding`` is whatever the
    supplied tokenizer returns for one cleaned text (called with
    ``return_tensors="pt"``, padded/truncated to 150 tokens). ``label`` is
    the row's ``target`` value, or ``-1`` when the dataframe has no
    ``target`` column.

    Args:
        dataframe: pandas DataFrame with a ``text`` column and optionally a
            ``target`` column.
        tokenizer: callable tokenizer accepting the keyword arguments used
            in ``__init__`` (presumably a Hugging Face tokenizer - confirm
            against the caller).
    """

    # Lazily-filled cache of NLTK's English stop words, shared by all
    # instances so the corpus is read once instead of once per tweet.
    _stop_words = None

    # Translation table that deletes every ASCII punctuation character.
    _punctuation_table = str.maketrans('', '', string.punctuation)

    def __init__(self, dataframe, tokenizer):
        texts = [self._preprocess(text)
                 for text in dataframe.text.values.tolist()]

        self._print_random_samples(texts)

        self.texts = [tokenizer(text, padding='max_length',
                                max_length=150,
                                truncation=True,
                                return_tensors="pt")
                      for text in texts]

        # `labels` is only set for labelled data; __getitem__ checks for it.
        if 'target' in dataframe:
            self.labels = dataframe.target.values.tolist()

    def _print_random_samples(self, texts):
        """Print five pseudo-randomly chosen cleaned texts as a sanity check.

        A private ``RandomState(42)`` is used so that the same entries are
        shown on every run without reseeding (and thereby clobbering) the
        global NumPy random state. Nothing is printed for an empty list.
        """
        if not texts:
            return

        rng = np.random.RandomState(42)

        for i in rng.randint(0, len(texts), 5):
            print(f"Entry {i}: {texts[i]}")

        print()

    def _preprocess(self, text):
        """Return ``text`` cleaned of markup, punctuation and stop words."""
        text = self._remove_amp(text)
        text = self._remove_links(text)
        text = self._remove_hashes(text)
        text = self._remove_retweets(text)
        text = self._remove_mentions(text)
        text = self._remove_multiple_spaces(text)

        # Lower-casing, number removal and stemming are available as helper
        # methods below but are intentionally not applied here.
        text = self._remove_punctuation(text)

        text_tokens = self._tokenize(text)
        text_tokens = self._stopword_filtering(text_tokens)
        text = self._stitch_text_tokens_together(text_tokens)

        return text.strip()

    def _remove_amp(self, text):
        """Replace the HTML entity ``&amp;`` with a space."""
        return text.replace("&amp;", " ")

    def _remove_mentions(self, text):
        """Replace ``@...`` runs (up to the next whitespace) with a space.

        The match may also end at the end of the string, so a mention that
        is the last token of a tweet is removed as well.
        """
        return re.sub(r'@\S*(?:\s|$)', ' ', text)

    def _remove_multiple_spaces(self, text):
        """Collapse every run of whitespace into a single space."""
        return re.sub(r'\s+', ' ', text)

    def _remove_retweets(self, text):
        """Replace a leading ``RT`` marker (plus whitespace) with a space."""
        return re.sub(r'^RT[\s]+', ' ', text)

    def _remove_links(self, text):
        """Replace http/https URLs with a space."""
        return re.sub(r'https?:\/\/[^\s\n\r]+', ' ', text)

    def _remove_hashes(self, text):
        """Replace ``#`` characters with a space (the tag word is kept)."""
        return re.sub(r'#', ' ', text)

    def _stitch_text_tokens_together(self, text_tokens):
        """Join tokens back into one space-separated string."""
        return " ".join(text_tokens)

    def _tokenize(self, text):
        """Split ``text`` into word tokens with NLTK's English tokenizer."""
        return nltk.word_tokenize(text, language="english")

    def _stopword_filtering(self, text_tokens):
        """Drop tokens that exactly match an NLTK English stop word.

        Matching is case-sensitive, as tokens are not lower-cased first.
        """
        if TweetDataset._stop_words is None:
            # A frozenset gives O(1) membership tests.
            TweetDataset._stop_words = frozenset(
                nltk.corpus.stopwords.words('english'))
        stop_words = TweetDataset._stop_words

        return [token for token in text_tokens if token not in stop_words]

    def _stemming(self, text_tokens):
        """Return the Porter stem of every token (currently unused)."""
        porter = nltk.stem.porter.PorterStemmer()
        return [porter.stem(token) for token in text_tokens]

    def _remove_numbers(self, text):
        """Replace digit runs with a space (currently unused)."""
        return re.sub(r'\d+', ' ', text)

    def _lowercase(self, text):
        """Return ``text`` lower-cased (currently unused)."""
        return text.lower()

    def _remove_punctuation(self, text):
        """Delete every character listed in ``string.punctuation``."""
        return text.translate(TweetDataset._punctuation_table)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(encoding, label)``; label is -1 for unlabelled data."""
        text = self.texts[idx]

        label = -1
        if hasattr(self, 'labels'):
            label = self.labels[idx]

        return text, label

### Building the Classifier

In [10]:
class TweetClassifier(nn.Module):
    """Binary classifier: an encoder followed by a small two-layer head.

    Args:
        base_model: encoder module. When called with ``input_ids`` and
            ``attention_mask`` its first output must be indexable as
            ``[batch, position, feature]``. If it exposes
            ``config.hidden_size`` that value sizes the head's input;
            otherwise 768 is assumed.
    """

    def __init__(self, base_model):
        super().__init__()

        self.bert = base_model

        # Size the head from the encoder's config so encoders with a hidden
        # width other than 768 work too; 768 stays the fallback.
        hidden_size = getattr(getattr(base_model, "config", None),
                              "hidden_size", 768)
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, 1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return a ``(batch, 1)`` tensor of sigmoid outputs in [0, 1]."""
        # Use the feature vector at position 0 of the encoder's first output
        # as the representation of the whole sequence.
        encoder_out = self.bert(input_ids=input_ids,
                                attention_mask=attention_mask)[0]
        first_token = encoder_out[:, 0]

        hidden = self.relu(self.fc1(first_token))

        return self.sigmoid(self.fc2(hidden))

### Setting up the training loop

For each epoch we calculate the loss and the accuracy on both the training and the validation set. We apply early stopping and store the best model, as measured by the validation loss, for later use.

We use binary cross-entropy loss (`nn.BCELoss`) because we are dealing with a binary classification task: the sigmoid at the end of our model makes it output a probability between 0 (= no disaster) and 1 (= disaster).

In [12]:
def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader`` and return ``(loss_sum, n_correct)``.

    ``loss_sum`` is the sum of the per-batch losses and ``n_correct`` the
    number of samples whose thresholded (>= 0.5) output equals the label.
    Parameters are updated after every batch only when ``optimizer`` is
    given; the caller is responsible for train/eval mode and for disabling
    gradients during evaluation.
    """
    loss_sum = 0.0
    n_correct = 0

    for batch_input, batch_label in tqdm(dataloader):
        # The dataset stores one (1, seq_len) encoding per sample, so the
        # collated batch is (batch, 1, seq_len). Drop the singleton axis on
        # BOTH tensors so the model receives (batch, seq_len) inputs.
        attention_mask = batch_input['attention_mask'].squeeze(1).to(device)
        input_ids = batch_input['input_ids'].squeeze(1).to(device)

        # (batch,) -> (batch, 1) to line up with the model output.
        targets = batch_label.to(device).unsqueeze(1)

        output = model(input_ids, attention_mask)

        loss = criterion(output, targets.float())
        loss_sum += loss.item()
        n_correct += ((output >= 0.5).int() == targets).sum().item()

        if optimizer is not None:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    return loss_sum, n_correct


def train(model, train_dataloader, val_dataloader, learning_rate, epochs,
          patience=1, save_path="best_model.pt"):
    """Train ``model`` with Adam and BCE loss, with early stopping.

    After every epoch the train/validation loss and accuracy are printed.
    Whenever the validation loss improves, the whole model object is saved
    to ``save_path``; training stops once it has failed to improve for
    ``patience`` consecutive epochs.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns probabilities of shape ``(batch, 1)`` (required by
            ``nn.BCELoss``).
        train_dataloader: loader yielding ``(encoding, label)`` batches,
            where ``encoding`` has ``'input_ids'`` and ``'attention_mask'``.
        val_dataloader: loader with the same batch layout, for validation.
        learning_rate: learning rate passed to Adam.
        epochs: maximum number of epochs.
        patience: number of consecutive non-improving epochs tolerated
            before stopping (default 1, the original behavior).
        save_path: file the best model is written to
            (default ``"best_model.pt"``, the original behavior).
    """
    best_val_loss = float('inf')
    epochs_without_improvement = 0

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model first so the optimizer is built on the final parameters.
    model = model.to(device)
    criterion = nn.BCELoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        total_loss_train, total_acc_train = _run_epoch(
            model, train_dataloader, criterion, device, optimizer)

        model.eval()
        with torch.no_grad():
            total_loss_val, total_acc_val = _run_epoch(
                model, val_dataloader, criterion, device)

        # Loss is averaged per batch, accuracy per sample.
        print(f'Epochs: {epoch + 1} '
              f'| Train Loss: {total_loss_train / len(train_dataloader): .3f} '
              f'| Train Accuracy: {total_acc_train / (len(train_dataloader.dataset)): .3f} '
              f'| Val Loss: {total_loss_val / len(val_dataloader): .3f} '
              f'| Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}')

        if total_loss_val < best_val_loss:
            best_val_loss = total_loss_val
            # The full model object is pickled (not just the state dict)
            # because the prediction step loads it with torch.load().
            torch.save(model, save_path)
            print("Saved model")
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print("Early stopping")
            break

### Training our classifier

As the final step of the text classification, we put it all together: we initialize our training and validation datasets, instantiate our chosen RoBERTa model, and start the training loop.

In [14]:
# Seed NumPy and PyTorch so that runs are repeatable.
np.random.seed(0)
torch.manual_seed(0)

# Pretrained checkpoint used for both the tokenizer and the encoder.
BERT_MODEL = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
base_model = AutoModel.from_pretrained(BERT_MODEL)

# Build the datasets first (each prints a few cleaned samples), then wrap
# them in loaders; only the training loader is shuffled.
train_dataset = TweetDataset(train_df, tokenizer)
val_dataset = TweetDataset(val_df, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=8, num_workers=0)

# Put the classification head on top of the pretrained encoder.
model = TweetClassifier(base_model)

# Hyper-parameters for fine-tuning.
learning_rate = 1e-5
epochs = 5
train(model=model,
      train_dataloader=train_dataloader,
      val_dataloader=val_dataloader,
      learning_rate=learning_rate,
      epochs=epochs)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Entry 860: If NWS wth rotating storm w report HUGE MASSIVE VIOLENT tornado would
Entry 5390: Demolished My Personal Best
Entry 5226: sing tsunami Beginners computer tutorial Everyone Wants To Learn To Build A Pc Re
Entry 5191: Survival Kit Whistle Fire Starter Wire Saw Cree Torch Emergency Blanket S knife Full reÛ
Entry 3772: Buddha man time massive urbanisation social upheaval also challenged Brahmans dominance Genius Ancient World

Entry 1126: They didnt succeed two times either Bomb didnt detonate Shots missed
Entry 1459: Nobody remembers came second Charles Schulz
Entry 860: Near sand half sunk shattered visage lies
Entry 1294: understanding umntu wakho If trust partner OK u know wont fear anything
Entry 1130: My back sunburned



100%|████████████████████████████████████████████████████████████████████████████████| 762/762 [04:36<00:00,  2.76it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 191/191 [00:18<00:00, 10.09it/s]


Epochs: 1 | Train Loss:  0.457 | Train Accuracy:  0.796 | Val Loss:  0.384 | Val Accuracy:  0.842


  0%|                                                                                          | 0/762 [00:00<?, ?it/s]

Saved model


100%|████████████████████████████████████████████████████████████████████████████████| 762/762 [04:39<00:00,  2.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 191/191 [00:18<00:00, 10.06it/s]

Epochs: 2 | Train Loss:  0.361 | Train Accuracy:  0.853 | Val Loss:  0.391 | Val Accuracy:  0.836
Early stopping





### Predicting for the test data

Analogous to the training loop, we define a function that conveniently extracts the predictions made by our model. Because the model returns a probability between 0 and 1, we use a 50% threshold to decide the target class.

In [15]:
def get_text_predictions(model, loader, threshold=0.5):
    """Return the model's 0/1 predictions for every sample in ``loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns one probability per sample with shape ``(batch, 1)``.
        loader: loader yielding ``(encoding, label)`` batches, where
            ``encoding`` has ``'input_ids'`` and ``'attention_mask'``; the
            label is ignored.
        threshold: outputs greater than or equal to this value are
            classified as 1 (default 0.5, the same rule the training loop
            uses for its accuracy).

    Returns:
        Integer NumPy array of shape ``(num_samples, 1)`` in loader order
        (shape ``(0, 1)`` when the loader yields no batches).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    model.eval()

    batch_predictions = []
    with torch.no_grad():
        for data_input, _ in tqdm(loader):
            # Collated encodings are (batch, 1, seq_len); drop the singleton
            # axis on both tensors so the model gets (batch, seq_len).
            attention_mask = data_input['attention_mask'].squeeze(1).to(device)
            input_ids = data_input['input_ids'].squeeze(1).to(device)

            output = model(input_ids, attention_mask)

            # Move each batch to the CPU right away so predictions do not
            # pile up in GPU memory.
            batch_predictions.append((output >= threshold).int().cpu())

    if not batch_predictions:
        # torch.cat() rejects an empty list; return an empty result instead.
        return torch.empty((0, 1), dtype=torch.int32).numpy()

    return torch.cat(batch_predictions).numpy()

Next we load up the previously saved model and set up the test data loader.

To store our predictions we use the provided "sample_submission.csv", which already contains the test IDs and a placeholder target column that we overwrite with the predictions of our model.

In [20]:
# Load the best checkpoint written by train(). It is a fully pickled model
# object, so it must be unpickled with weights_only=False; recent PyTorch
# releases default to weights_only=True and would refuse to load it.
# SECURITY NOTE: unpickling executes arbitrary code - only load checkpoint
# files that you created yourself.
try:
    model = torch.load("best_model.pt", weights_only=False)
except TypeError:
    # Older PyTorch releases do not accept the `weights_only` keyword.
    model = torch.load("best_model.pt")

# Keep the test rows in file order (no shuffling) so that predictions line
# up with the rows of the submission file.
test_dataloader = DataLoader(TweetDataset(test_df, tokenizer),
                             batch_size=8, shuffle=False, num_workers=0)

sample_submission = pd.read_csv(BASE+"sample_submission.csv")

# Flatten the (N, 1) prediction array to 1-D before storing it as a column.
sample_submission["target"] = get_text_predictions(model, test_dataloader).ravel()

display(sample_submission.head(20))

sample_submission.to_csv("submission_1.csv", index=False)

Entry 3174: Rocky Fire cali SCFD wildfire LakeCounty
Entry 860: First time everything Coney Island Cyclone
Entry 1294: If told drowning I would lend hand
Entry 1130: HitchBot travels Europe greeted open arms Gets destroyed two weeks america Theres lesson learned
Entry 1095: Free Kindle Book Aug 37 Thriller Desolation Run



100%|████████████████████████████████████████████████████████████████████████████████| 408/408 [00:38<00:00, 10.73it/s]


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0


Reference: 

https://www.auroria.io/nlp-disaster-tweet-text-classification-roberta-pytorch/