BERT implementation

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

train_csv_path = 'train.csv'
train_df = pd.read_csv(train_csv_path)

all_texts = train_df['text'].values.tolist()
all_labels = train_df['target'].values.tolist()

print("Tweets are loaded. Total # of tweets: {}.".format(len(all_texts)))
print("# of labels:")
print(train_df['target'].value_counts())

Tweets are loaded. Total # of tweets: 7613.
# of labels:
0    4342
1    3271
Name: target, dtype: int64


In [2]:
frequent_tweets = {}
for t, l in zip(all_texts, all_labels):
    if all_texts.count(t) > 2:
        frequent_tweets[t] = [l] if t not in frequent_tweets else frequent_tweets[t] + [l]
        
print("The number of tweeets which appear multiple times: {}"
      .format(len(frequent_tweets.keys())))     

print("Tweets which have inconsistent labeling:")
print()

for t, ls in frequent_tweets.items():
    if not all(element == ls[0] for element in ls):
        print(t)
        print(ls)

The number of tweeets which appear multiple times: 19
Tweets which have inconsistent labeling:

To fight bioterrorism sir.
[1, 0, 1, 0]
.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4
[1, 1, 0, 1]
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam
[0, 1, 1, 0, 0, 0]
Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her
 
#FARRAKHAN #QUOTE
[1, 0, 0]
#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption
[1, 1, 0]
The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'
[0, 0, 1, 0, 0, 1]
Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife
[0, 1, 0]
#Allah describes piling up #wealth thinking it would

In [3]:
should_be_real = [".POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4",
                 "#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption",
                 "CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring"]

should_not_be_real = ["He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam",
                     "Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her",
                      "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'",
                     "Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife",
                     "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect",
                     "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time",
                     "To fight bioterrorism sir."]
def fix_labels(tweets_to_fix, correct_label):
    for i, (tweet, label) in enumerate(zip(all_texts, all_labels)):
        if any(tweet.startswith(t) for t in tweets_to_fix):
            all_labels[i] = correct_label

        
fix_labels(should_be_real, 1)
fix_labels(should_not_be_real, 0)

print("Relabeled {} tweets in total".format(len(should_be_real) + len(should_not_be_real)))

Relabeled 10 tweets in total


In [4]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels,
    stratify = train_df['target']
)

print('Train data is read and split into training and validation sets.')
print('Size of train data (# of entries): {}'.format(len(train_texts)))
print('Size of validation data (# of entries): {}'.format(len(val_texts)))

Train data is read and split into training and validation sets.
Size of train data (# of entries): 5709
Size of validation data (# of entries): 1904


In [5]:
import re
import string


def remove_urls(tweet):
    return re.sub(r"http(s?)://[\S]+", '', tweet)

def remove_at_links(tweet):
    return re.sub(r"\B(@)\S+", '', tweet)

def remove_non_ascii_chars(tweet):
    ascii_chars = set(string.printable)
    for c in tweet:
        if c not in ascii_chars:
            tweet = tweet.replace(c,'')
    return tweet
def fix_ax_nots(tweet):
    tweet = tweet.replace(" dont ", " do not ")
    tweet = tweet.replace(" don't ", " do not ")
    tweet = tweet.replace(" doesnt ", " does not ")
    tweet = tweet.replace(" doesn't ", " does not ")
    tweet = tweet.replace(" wont ", " will not ")
    tweet = tweet.replace(" won't ", " will not ")
    tweet = tweet.replace(" cant ", " cannot ")
    tweet = tweet.replace(" can't ", " cannot ")
    tweet = tweet.replace(" couldnt ", " could not ")
    tweet = tweet.replace(" couldn't ", " could not ")
    tweet = tweet.replace(" shouldnt ", " should not ")
    tweet = tweet.replace(" shouldn't ", " should not ")
    tweet = tweet.replace(" wouldnt ", " would not ")
    tweet = tweet.replace(" wouldn't ", " would not ")
    tweet = tweet.replace(" mustnt ", " must not ")
    tweet = tweet.replace(" mustn't ", " must not ")
    
    return tweet
def fix_personal_pronouns_and_verb(tweet):
    tweet = tweet.replace(" im ", " i am ")
    tweet = tweet.replace(" youre ", " you are")
    tweet = tweet.replace(" hes ", " he is") # ? he's can be he has as well
    tweet = tweet.replace(" shes ", " she is")
    # we are -> we're -> were  ---- were is a valid word
    tweet = tweet.replace(" theyre ", " they are")
    
    tweet = tweet.replace(" ive ", " i have ")
    tweet = tweet.replace(" youve ", " you have ")
    tweet = tweet.replace(" weve ", " we have ")
    tweet = tweet.replace(" theyve ", " they have ")
    
    tweet = tweet.replace(" youll ", " you will ")
    tweet = tweet.replace(" theyll ", " they will ")
    
    return tweet
def fix_special_chars(tweet):
    tweet = tweet.replace("&amp;", " and ")
    # tweet = tweet.replace("--&gt;", "")
    return tweet
        

def clean_tweet(tweet):
    tweet = remove_urls(tweet)
    tweet = remove_at_links(tweet)
    tweet = remove_non_ascii_chars(tweet)
    tweet = fix_special_chars(tweet)
    tweet = fix_ax_nots(tweet)
    tweet = fix_personal_pronouns_and_verb(tweet)
        
    return tweet

In [6]:
cleaned_train_texts = [clean_tweet(tweet) for tweet in train_texts]
print("Train tweets cleaned.")
cleaned_val_texts = [clean_tweet(tweet) for tweet in val_texts]
print("Validation tweets cleaned.")

Train tweets cleaned.
Validation tweets cleaned.


In [7]:
from transformers import AutoTokenizer

model_name = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(cleaned_train_texts, truncation=True, padding=True)
val_encodings = tokenizer(cleaned_val_texts, truncation=True, padding=True)

print('Train & validation texts encoded')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Train & validation texts encoded


In [8]:
import torch
class TweetDataset(torch.utils.data.Dataset):
    """
    Class to store the tweet data as PyTorch Dataset
    """
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        # an encoding can have keys such as input_ids and attention_mask
        # item is a dictionary which has the same keys as the encoding has
        # and the values are the idxth value of the corresponding key (in PyTorch's tensor format)
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)
        

print(TweetDataset.__doc__)


    Class to store the tweet data as PyTorch Dataset
    


In [9]:
# device (turn on GPU acceleration for faster execution)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Device used: {}.".format(device))

Device used: cpu.


In [12]:
from torch import nn
from transformers import BertModel

in_features = 768 # it's 768 because that's the size of the output provided by the underlying BERT model
class BertWithCustomNNClassifier(nn.Module):
    """
    A pre-trained BERT model with a custom classifier.
    The classifier is a neural network implemented in this class.
    """
    
    def __init__(self, linear_size):
        super(BertWithCustomNNClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout1 = nn.Dropout()
        self.linear1 = nn.Linear(in_features=in_features, out_features=linear_size)
        self.batch_norm1 = nn.BatchNorm1d(num_features=linear_size)
        self.dropout2 = nn.Dropout(p=0.8)
        self.linear2 = nn.Linear(in_features=linear_size, out_features=1)
        self.batch_norm2 = nn.BatchNorm1d(num_features=1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, tokens, attention_mask):
        bert_output = self.bert(input_ids=tokens, attention_mask=attention_mask)
        x = self.dropout1(bert_output[1])
        x = self.linear1(x)
        x = self.dropout2(x)
        x = self.batch_norm1(x)
        x = self.linear2(x)
        x = self.batch_norm2(x)
        return self.sigmoid(x)
        
    def freeze_bert(self):
        """
        Freezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        only the wieghts of the custom classifier are modified.
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad=False

    def unfreeze_bert(self):
        """
        Unfreezes the parameters of BERT so when BertWithCustomNNClassifier is trained
        both the wieghts of the custom classifier and of the underlying BERT are modified.
        """
        for param in self.bert.named_parameters():
            param[1].requires_grad=True

            
print(BertWithCustomNNClassifier.__doc__)



    A pre-trained BERT model with a custom classifier.
    The classifier is a neural network implemented in this class.
    


In [13]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

def eval_prediction(y_batch_actual, y_batch_predicted):
    """Return batches of accuracy and f1 scores."""
    y_batch_actual_np = y_batch_actual.cpu().detach().numpy()
    y_batch_predicted_np = np.round(y_batch_predicted.cpu().detach().numpy())
    
    acc = accuracy_score(y_true=y_batch_actual_np, y_pred=y_batch_predicted_np)
    f1 = f1_score(y_true=y_batch_actual_np, y_pred=y_batch_predicted_np, average='weighted')
    
    return acc, f1

print(eval_prediction.__doc__)

Return batches of accuracy and f1 scores.


In [23]:
# parameters
num_of_epochs = 1
learning_rate = 27e-6
batch_size = 16
hidden_layers = 8

print("Epochs: {}".format(num_of_epochs))
print("Learning rate: {:.6f}".format(learning_rate))
print("Batch size: {}".format(batch_size))
print("The number of hidden layers in the custom head: {}".format(hidden_layers))

Epochs: 1
Learning rate: 0.000027
Batch size: 16
The number of hidden layers in the custom head: 8


In [15]:
model = BertWithCustomNNClassifier(linear_size=hidden_layers)
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertWithCustomNNClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [16]:
from transformers import AdamW

# optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)
print('Initialized optimizer.')

Initialized optimizer.




In [21]:
# loss function
loss_fn = nn.BCELoss()
print('Initialized loss function.')

Initialized loss function.


In [17]:
from torch.utils.data import DataLoader

# Dataset & dataloader
train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
print('Created train & val datasets.')

Created train & val datasets.


In [18]:
def training_step(dataloader, model, optimizer, loss_fn, if_freeze_bert):
    """Method to train the model"""
    
    model.train()
    model.freeze_bert() if if_freeze_bert else model.unfreeze_bert()
      
    epoch_loss = 0
    size = len(dataloader.dataset)
 
    for i, batch in enumerate(dataloader):        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        outputs = torch.flatten(model(tokens=input_ids, attention_mask=attention_mask))
                        
        optimizer.zero_grad()
        loss = loss_fn(outputs, labels.float())
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

            
print(training_step.__doc__)

Method to train the model


In [19]:
def validation_step(dataloader, model, loss_fn):
    """Method to test the model's accuracy and loss on the validation set"""
    
    model.eval()
    model.freeze_bert()
    
    size = len(dataloader)
    f1, acc = 0, 0
    
    with torch.no_grad():
        for batch in dataloader:
            X = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            y = batch['labels'].to(device)
                  
            pred = model(tokens=X, attention_mask=attention_mask)
            
            acc_batch, f1_batch = eval_prediction(y.float(), pred)                        
            acc += acc_batch
            f1 += f1_batch

        acc = acc/size
        f1 = f1/size
                
    return acc, f1
print(validation_step.__doc__)

Method to test the model's accuracy and loss on the validation set


In [24]:
from tqdm.auto import tqdm

tqdm.pandas()

best_acc, best_f1 = 0, 0
path = './best_model.pt'
if_freeze_bert = False

for i in tqdm(range(num_of_epochs)):
    print("Epoch: #{}".format(i+1))

    if i < 5:
        if_freeze_bert = False
        print("Bert is not freezed")
    else:
        if_freeze_bert = True
        print("Bert is freezed")
    
    training_step(train_loader, model,optimizer, loss_fn, if_freeze_bert)
    train_acc, train_f1 = validation_step(train_loader, model, loss_fn)
    val_acc, val_f1 = validation_step(val_loader, model, loss_fn)
    
    print("Training results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(train_acc, train_f1))

    print("Validation results: ")
    print("Acc: {:.3f}, f1: {:.3f}".format(val_acc, val_f1))
    
    if val_acc > best_acc:
        best_acc = val_acc    
        torch.save(model, path)

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch: #1
Bert is not freezed


In [None]:
test_data = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')


clean_test_texts = [clean_tweet(tweet) for tweet in test_data['text'].values.tolist()]
test_encodings = tokenizer(clean_test_texts,
                           truncation=True, padding=True,
                           return_tensors='pt').to(device)

print("Encodings are ready.")

In [None]:
model = torch.load(path)
model.eval()
with torch.no_grad():
    predictions = model(tokens=test_encodings['input_ids'], attention_mask=test_encodings['attention_mask'])
    
binary_predictions = np.round(predictions.cpu().detach().numpy()).astype(int).flatten()
    
print("Predictions are ready.")

In [None]:
sample_submission['target'] = binary_predictions
sample_submission.to_csv('submission.csv', index=False)
print('Predictions are saved to submission.csv.')