# Natural Language Processing with Disaster Tweets

Twitter has become an important communication channel in times of emergency.
The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programatically monitoring Twitter (i.e. disaster relief organizations and news agencies).


# Load Data, Preprocessing

The dataset we have to feed consist of 'Tweets' according to the kaggle guide, which means that there sould be lots of 'cooloquial expressions','hashtags','links',etc.

Thus, we have to clean this data before we feed it to our model.

In [3]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./epoch:4_model.pt
./.DS_Store
./test.csv
./epoch:2_model.pt
./epoch:3_model.pt
./Gated_RNNs.pdf
./epoch:1_model.pt
./train.csv
./NLP_with_Disaster_Tweets.ipynb
./RNNs.pdf
./sample_submission.csv


  from pandas.core import (


In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_len = len(train) 
all_data = pd.concat([train,test]) 

We use the head()/describe()/info() functions to see what the dataset looks like.

In [5]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


# Cleaning data
    1. Match characters starting with '@' followed by any non-whitespace characters.
    2. `\S` matches any non-whitespace character (equivalent to `[^ \t\n\r\f\v]`).
    3. `+` means one or more repetitions of the preceding pattern.
    4. Match URLs such as http://..., https://..., or www....
    5. Match patterns like <...>, (...) or any characters within these brackets.
    6. List of punctuation characters: ['!','"','$','%','&',"'",'(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','{','|','}','~']

In [None]:
import re
import string

# Cleaning Functions
def remove_tag(text):
    tag = re.compile(r'@\S+')
    return tag.sub(r'',text)

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url,'',text)

def remove_html(text):# Match HTML tags or content inside parentheses.
    html = re.compile(r'<[^>]+>|\([^)]+\)')
    return html.sub(r'',text)

def remove_punct(text):# Remove punctuation characters.
    punctuations = list(string.punctuation)
    table = str.maketrans('', '', ''.join(punctuations))
    return text.translate(table)

In [None]:
#Find stop words and punctuation marks.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/songruiming/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/songruiming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
all_data['cleaned'] = all_data['text'].apply(lambda x:remove_tag(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: remove_URL(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: remove_html(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: remove_punct(x))
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: x.lower()) 
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: word_tokenize(x)) 
all_data['cleaned'] = all_data['cleaned'].apply(lambda x: ' '.join([word for word in x if word not in stop]))

In [11]:
all_data.head()

Unnamed: 0,id,keyword,location,text,target,cleaned
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0,deeds reason earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1.0,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,13000 people receive wildfires evacuation orde...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0,got sent photo ruby alaska smoke wildfires pou...


# Dataset, DataLoader

In [None]:
# we have combined train and test set into one all_data
train_data,test_data = all_data[:train_len],all_data[train_len:]

In [None]:
from torch.utils.data import Dataset
import torch

class TweetDataset(Dataset):
    def __init__(self,df,is_grad,tokenizer):
        self.df = df 
        self.is_grad = is_grad 
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df) 

    def __getitem__(self,idx):
        text = self.df.loc[idx]['text'] 

        encoded_dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=84, 
            return_tensors='pt', 
            return_attention_mask=True, 
        )

        if self.is_grad:
            labels = self.df.loc[idx]['target']
            return {'input_ids':encoded_dict['input_ids'].squeeze(),
                    'attention_mask':encoded_dict['attention_mask'].squeeze(),
                    'labels':torch.tensor(labels,dtype=torch.float).unsqueeze(dim=0)}
        else:
            return {'input_ids':encoded_dict['input_ids'].squeeze(),
                    'attention_mask':encoded_dict['attention_mask'].squeeze()}

In [None]:
from transformers import BertTokenizer
model_name = 'bert-large-uncased' 
tokenizer = BertTokenizer.from_pretrained(model_name)

train_dataset = TweetDataset(train_data,True,tokenizer)
test_dataset = TweetDataset(test_data,False,tokenizer)

BERT-base: BERT-base is the most basic BERT model, consisting of 12 Transformer encoder layers with a total of 110M parameters. The input embedding vector dimension for BERT-base is 768, and the hidden layer dimension is also 768.

BERT-large: BERT-large is a larger model compared to BERT-base. It consists of 24 Transformer encoder layers with a total of 340M parameters. Both the input embedding vector dimension and the hidden layer dimension for BERT-large are 1024.

BERT-cased: BERT-cased is a model trained while preserving case information in English text. This model is suitable for tasks that require case sensitivity.

In [15]:
from torch.utils.data import random_split

train_size = int(0.8 * len(train_dataset)) # train:valid = 8:2
valid_size = len(train_dataset) - train_size

train_dataset,valid_dataset = random_split(train_dataset,[train_size,valid_size])

print(f'{len(train_dataset)} train samples')
print(f'{len(valid_dataset)} valid samples')
print(f'{len(test_dataset)} test samples')

6090 train samples
1523 valid samples
3263 test samples


In [None]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(train_dataset,batch_size=32,shuffle=True,pin_memory=True)
valid_dataloader = DataLoader(valid_dataset,batch_size=32,shuffle=False,pin_memory=True)
test_dataloader = DataLoader(test_dataset,batch_size=1,shuffle=False)

# Creating model

In [None]:
configs = {
    'model_name':'bert-large-uncased',
    'num_labels':2,
    'batch_size':32,
    'epochs':4,
    'learning_rate':5e-6,
}

In [None]:
import numpy as np
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification

# Never Detach Tensor during forward
class TweetsModel(nn.Module):
    def __init__(self,model_name):
        super().__init__()
        self.model = BertForSequenceClassification.from_pretrained(model_name)

    def forward(self,input_ids,attention_mask):
        output = self.model(input_ids=input_ids,attention_mask=attention_mask)
        logits = output.logits
        return logits

  torch.utils._pytree._register_pytree_node(


In [None]:
if torch.cuda.is_available():
    device = 'cuda'
    print('GPU is running on..')
else: 
    device = 'cpu'
    print('CPU is running on..')
model = TweetsModel(configs['model_name']).to(device)

CPU is running on..


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tools

In [20]:
# loss function
# (y_pred,y_label)
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()

In [21]:
# optimizer
from transformers import AdamW

optimizer = AdamW(model.parameters(),
                lr=6e-6,
                eps=1e-8,
                no_deprecation_warning=True)

In [22]:
# metric for validation
# f1_score(y_label,y_pred)
from sklearn.metrics import f1_score

metric = f1_score

# Training the model

In [None]:
import gc,os
from tqdm.auto import tqdm # visualizing tool for progress

# They will be used to pick the best model.pt given to the valid loss
best_model_epoch, valid_loss_values = [],[] 
valid_loss_min = [1] # arbitrary loss set here
def train(model,device,train_dataloader,valid_dataloader,epochs,loss_fn,optimizer,metric):

    for epoch in range(epochs):
        gc.collect() 
        model.train()

        train_loss = 0
        train_step = 0
        pbar = tqdm(train_dataloader)

        for batch in pbar: 
            optimizer.zero_grad() # initialize
            train_step += 1

            train_input_ids = batch['input_ids'].to(device)
            train_attention_mask = batch['attention_mask'].to(device)
            train_labels = batch['labels'].squeeze().to(device).long()
            
            logits = model(train_input_ids, train_attention_mask).to(device)
            predictions = torch.argmax(logits, dim=1) # get an index from larger one
            detached_predictions = predictions.detach().cpu().numpy()
            
            loss = loss_fn(logits, train_labels)
            loss.backward() 
            optimizer.step()
            model.zero_grad()

            train_loss += loss.detach().cpu().numpy().item()

            pbar.set_postfix({'train_loss':train_loss/train_step})
        pbar.close()

        with torch.no_grad():
            model.eval()

            valid_loss = 0
            valid_step = 0
            total_valid_score = 0

            y_pred = [] # for getting f1_score that is a metric of the competition
            y_true = []

            pbar = tqdm(valid_dataloader)
            for batch in pbar:
                valid_step += 1

                valid_input_ids = batch['input_ids'].to(device)
                valid_attention_mask = batch['attention_mask'].to(device)
                valid_labels = batch['labels'].squeeze().to(device).long()

                logits = model(valid_input_ids, valid_attention_mask).to(device)
                predictions = torch.argmax(logits, dim=1)
                detached_predictions = predictions.detach().cpu().numpy()
                
                loss = loss_fn(logits, valid_labels)
                valid_loss += loss.detach().cpu().numpy().item()

                y_pred.extend(predictions.cpu().numpy())
                y_true.extend(valid_labels.cpu().numpy())

            valid_loss /= valid_step
            f1 = f1_score(y_true,y_pred)

            print(f'Epoch [{epoch+1}/{epochs}] Score: {f1}')
            print(f'Epoch [{epoch+1}/{epochs}] Valid_loss: {valid_loss}')

            if valid_loss < min(valid_loss_min):
                print('model improved!')
            else:
                print('model not improved')
    
            torch.save(model.state_dict(), f'epoch:{epoch+1}_model.pt')
            print('save checkpoint!')
            valid_loss_min.append(valid_loss)
            print(f'valid_loss_min:{min(valid_loss_min)}')

        best_model_epoch.append(f'epoch:{epoch+1}_model.pt')
        valid_loss_values.append(valid_loss)
        print('='*100)

    print('Train/Valid Completed!!')
    del train_dataloader, valid_dataloader # memory cleaning
    gc.collect()

In [24]:
if torch.cuda.is_available():
    print('GPU is running on...')
    device = 'cuda'
else:
    print('CPU is running on...')
    device = 'cpu'

CPU is running on...


In [26]:
print('Training Start!')
print('=' * 100)

train(model,
    device,
    train_dataloader,
    valid_dataloader,
    configs['epochs'],
    loss_fn,
    optimizer,
    metric)

del model,train_dataloader, valid_dataloader
gc.collect()

Training Start!


  0%|          | 0/191 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Inference

In [32]:
def inference(model,test_dataloader):
    all_preds = []
    model.eval()

    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids,attention_mask)
            logits = logits.detach().cpu().numpy()
            all_preds.append(logits)
    
    return all_preds

In [None]:
import torch
import numpy as np
from tqdm.auto import tqdm

def load_and_evaluate(model_paths, model, device, valid_dataloader, loss_fn):
    valid_loss_values = []

    for model_path in model_paths:
        print(f"Evaluating {model_path}...")
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()

        valid_loss = 0
        valid_step = 0

        with torch.no_grad():
            for batch in tqdm(valid_dataloader):
                valid_step += 1

                valid_input_ids = batch['input_ids'].to(device)
                valid_attention_mask = batch['attention_mask'].to(device)
                valid_labels = batch['labels'].squeeze().to(device).long()

                logits = model(valid_input_ids, valid_attention_mask).to(device)
                loss = loss_fn(logits, valid_labels)
                valid_loss += loss.detach().cpu().numpy().item()

            valid_loss /= valid_step
            valid_loss_values.append(valid_loss)
            print(f"{model_path} -> Valid Loss: {valid_loss}")

    best_model_idx = np.argmin(valid_loss_values)
    best_model_path = model_paths[best_model_idx]
    print(f"Best Model: {best_model_path} with Loss: {valid_loss_values[best_model_idx]}")

    return best_model_path


We have generated 4 model.pt, and then we compare each of them to find the best model for this case.

In [None]:
model_paths = [
    'epoch:1_model.pt',
    'epoch:2_model.pt',
    'epoch:3_model.pt',
    'epoch:4_model.pt'
]

best_model_path = load_and_evaluate(model_paths, model, device, valid_dataloader, loss_fn)

os.rename(best_model_path, 'best_model.pt')
print(f"Best model saved as 'best_model.pt'")


Evaluating epoch:1_model.pt...


  model.load_state_dict(torch.load(model_path, map_location=device))


  0%|          | 0/48 [00:00<?, ?it/s]

epoch:1_model.pt -> Valid Loss: 0.38174094694356125
Evaluating epoch:2_model.pt...


  0%|          | 0/48 [00:00<?, ?it/s]

epoch:2_model.pt -> Valid Loss: 0.32797790101418894
Evaluating epoch:3_model.pt...


  0%|          | 0/48 [00:00<?, ?it/s]

epoch:3_model.pt -> Valid Loss: 0.29382880482201773
Evaluating epoch:4_model.pt...


  0%|          | 0/48 [00:00<?, ?it/s]

epoch:4_model.pt -> Valid Loss: 0.25204828986898065
Best Model: epoch:4_model.pt with Loss: 0.25204828986898065
Best model saved as 'best_model.pt'


It turns out that the epoch:4_model.pt is the best model which we save it as "best_model.pt"

In [None]:
# Pick up the model.pt written with the best
# which has the lowest validation loss through all Epochs.

for filename in os.listdir():
    if 'best_model.pt' in filename: 
        best_pt = filename
print(f'Best model.pt: {best_pt}')
check_point = torch.load(best_pt)

model = TweetsModel(configs['model_name']).to(device)
model.to(device)
model.load_state_dict(check_point)

predictions = inference(model,test_dataloader)

Best model.pt: best_model.pt


  check_point = torch.load(best_pt)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3263 [00:00<?, ?it/s]

In [40]:
sample = pd.read_csv('sample_submission.csv')
sample

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [41]:
predictions = np.argmax(predictions,axis=2) # 0 or 1
sample['target'] = predictions
sample.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [42]:
sample.to_csv('submission.csv',index=False,header=True)