<a href="https://colab.research.google.com/github/michealman114/Natural-Language-Models-for-Hate-Speech-Classification/blob/main/ClassificationWithBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
#from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup


import torch
import torch.nn as nn 
import torch.utils.data as torch_data
import torch.optim as optim

from tqdm import tqdm


import numpy as np
import random
import json

In [3]:
from torch import cuda

seed = 4814

if cuda.is_available():
    device = 'cuda'
    torch.cuda.manual_seed_all(seed)
    print("running on GPU:", torch.cuda.get_device_name(0))
else:
    device = 'cpu'
    print("running on CPU")


random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

running on CPU


<torch._C.Generator at 0x7f8f6aa6f0b0>

In [4]:
def getCommentsTitlesLabels(file_lines):
    comment_list = []
    title_list = []
    labels = []
    for line in file_lines:
        content = json.loads(line)

        comment = content['text']
        comment_list.append(comment)

        title = content['title']
        title_list.append(title)

        labels.append(content['label'])
    
    return comment_list,title_list,labels

Pick one of the following models
- BERT for sequence classification
- DistilBERT for sequence classificaiton
- Customized DistilBERT for sequence classification

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 2)

In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

Back to normal stuff

In [7]:
original_lines = open("./Data/pruned-fox-news-comments.json", "r").readlines() # original 2015 data
original_comments, original_titles, original_labels = getCommentsTitlesLabels(original_lines)

num_samples = len(original_labels)

print(len(original_comments), len(original_titles), len(original_labels))

1525 1525 1525


In [8]:
clipped_comments = []
clipped_titles = []
clipped_labels = []

largest_size = 200

#clip unnecessarily long comments to improve training speed
for comment,title,label in zip(original_comments, original_titles, original_labels):
    comment_length = len(tokenizer.encode(comment))
    if comment_length > largest_size :
        continue
    clipped_comments.append(comment)
    clipped_titles.append(title)
    clipped_labels.append(label)

num_samples = len(clipped_labels)
print(len(clipped_comments), len(clipped_titles), len(clipped_labels))

1510 1510 1510


In [9]:
import random

zipped = list(zip(clipped_comments, clipped_titles, clipped_labels))
random.shuffle(zipped)
shuffled_comments, shuffled_titles, shuffled_labels = zip(*zipped)


print(len(shuffled_comments), len(shuffled_titles), len(shuffled_labels))
print(shuffled_comments[:5])

1510 1510 1510
('God is , as always, whatever you imagine it is.', 'you just now said it was', 'Not a huge fan - state by state issue.', 'So, you object to the 1st Amendment?', "I wish he'd publish his book already and stop spamming here.")


In [10]:
input_ids = []
attention_masks = []


encoded_dict = tokenizer(
                    shuffled_comments,
                    add_special_tokens = True,
                    max_length = 200,
                    padding = True,
                    truncation = True,
                    return_tensors = 'pt',
                )
    


tokenized_comments = encoded_dict['input_ids']
attention_masks = encoded_dict['attention_mask']
all_labels = torch.tensor(shuffled_labels)


print('Original: ', shuffled_comments[0])
print('Token IDs:', tokenized_comments[0][:12])

Original:  God is , as always, whatever you imagine it is.
Token IDs: tensor([ 101, 2643, 2003, 1010, 2004, 2467, 1010, 3649, 2017, 5674, 2009, 2003])


In [11]:
print(tokenized_comments.shape)
print(attention_masks.shape)
print(all_labels.shape)

torch.Size([1510, 200])
torch.Size([1510, 200])
torch.Size([1510])


In [12]:
class BERT_raw_Dataset(torch.utils.data.Dataset): # renamed to ProcessingDataset to avoid reuse of name
    def __init__(self, comments, attention_masks, labels):
        """
        comments/titles: (batch_size, max_length, embed_dim)
        labels: (batch_size,)
        """
        #Initialization
        self.comments = comments
        self.attention_masks = attention_masks
        self.labels = labels
        self.length = labels.shape[0]

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        # Load data and get label
        comment = self.comments[index]
        attention_mask = self.attention_masks[index]
        label = self.labels[index]

        return comment,attention_mask,label

In [13]:
# Create a 85-15 train-validation split.
max_train = int(0.85 * num_samples)

max_train = 400
max_val = 500
train_dataset = BERT_raw_Dataset(tokenized_comments[:max_train], attention_masks[:max_train], all_labels[:max_train])
val_dataset = BERT_raw_Dataset(tokenized_comments[max_train:max_val], attention_masks[max_train:max_val], all_labels[max_train:max_val])
#val_dataset = BERT_raw_Dataset(tokenized_comments[max_train:], attention_masks[max_train:], all_labels[max_train:])

In [15]:
train_loader = torch_data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch_data.DataLoader(val_dataset, batch_size=16, shuffle=False)

In [16]:
import sklearn
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

Training customized classifier built from scratch on top of DistilBERT below:

In [26]:
class BertClassifier(nn.Module):
  def __init__(self, bert = None):
    assert bert is not None
    super().__init__()
    self.dense1 = nn.Linear(768, 100)
    self.relu = nn.ReLU()
    self.dense2 = nn.Linear(100,1)
    self.sigmoid = nn.Sigmoid()
    self.model = bert
  
  def forward(self, batch):
    sent_output = self.model(**batch)
    CLS_hidden_state = sent_output.last_hidden_state[:,0,:] #(batch_size, embed_dim)

    output = self.dense1(CLS_hidden_state)
    output = self.relu(output)
    output = self.dense2(output)
    output = self.sigmoid(torch.squeeze(output))
    
    return output


In [27]:
distil_bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
distil_classifier = BertClassifier(bert = distil_bert)

num_epochs = 3

distil_optimizer = optim.AdamW(distil_classifier.parameters(), lr = 5e-5, eps = 1e-8)
distil_scheduler = get_linear_schedule_with_warmup(                
                optimizer = distil_optimizer,
                num_warmup_steps = 0,
                num_training_steps = num_epochs * len(train_loader)
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
training_stats = []
loss_fn = nn.BCELoss()

distil_classifier.to(device)

for epoch in tqdm(range(num_epochs)):    
    epoch_training_loss = 0

    
    distil_classifier.train()
    for tokenized_comment, mask, label in train_loader:
        tokenized_comment = tokenized_comment.to(device)
        mask = mask.to(device)
        label = label.to(device).type(torch.float32)


        distil_classifier.zero_grad()

        batch = {'input_ids': tokenized_comment, 'attention_mask':mask}
        preds = distil_classifier(batch)

        loss = loss_fn(preds, label)
        epoch_training_loss += loss.item()

        loss.backward()
        distil_optimizer.step()
        distil_scheduler.step()
        
        print(f"batch of size {label.shape[0]} finished")

    print(f"epoch training loss = {epoch_training_loss}")

    distil_classifier.eval()

    all_val_preds = []
    all_val_labels = []

    with torch.no_grad():
        for tokenized_comment, mask, label in val_loader:
            tokenized_comment = tokenized_comment.to(device)
            mask = mask.to(device)
            label = label.to(device).type(torch.float32)

            batch = {'input_ids': tokenized_comment, 'attention_mask':mask}
            preds = distil_classifier(batch, labels=label, return_dict=True)

            
            all_val_preds.append(preds.detach().cpu().numpy())
            all_val_labels.append(label.detach().cpu().numpy())

    

    all_val_preds = np.concatenate(all_val_preds)
    all_val_labels = np.concatenate(all_val_labels)

    print(f"EPOCH {epoch + 1} finished")
    print('Accuracy:', accuracy_score(label,preds))
    print('Precision, Recall, F1:',precision_recall_fscore_support(all_val_labels, all_val_preds, average='binary'))

  0%|          | 0/3 [00:00<?, ?it/s]

batch of size 16 finished
batch of size 16 finished
batch of size 16 finished


  0%|          | 0/3 [01:10<?, ?it/s]


KeyboardInterrupt: ignored

To train BERT/DistilBERT for sequence classification, the infrastructure is a little different

In [14]:
"""
Recommended parameters: lr = 5e-5, 3e-5, 2e-5
num_epochs = 2,3,4
https://arxiv.org/pdf/1810.04805.pdf

AdamW because it experimentally generalizes better: https://towardsdatascience.com/why-adamw-matters-736223f31b5d

standard lr scheduler from here: https://towardsdatascience.com/advanced-techniques-for-fine-tuning-transformers-82e4e61e16e
"""
optimizer = optim.AdamW(model.parameters(), lr = 5e-5, eps = 1e-8)


num_epochs = 3
scheduler = get_linear_schedule_with_warmup(                
                optimizer = optimizer,
                num_warmup_steps = 0,
                num_training_steps = num_epochs * len(train_loader)
)

In [None]:
training_stats = []
loss_fn = nn.BCELoss()

model.to(device)

for epoch in tqdm(range(num_epochs)):    
    epoch_training_loss = 0

    
    model.train()
    for tokenized_comment, mask, label in train_loader:
        tokenized_comment = tokenized_comment.to(device)
        mask = mask.to(device)
        label = label.to(device)


        model.zero_grad()        

        result = model(tokenized_comment, attention_mask=mask, labels=label, return_dict=True)

        loss = result.loss
        logits = result.logits


        epoch_training_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()
        
        print(f"batch of size {label.shape[0]} finished")

    print(f"epoch training loss = {epoch_training_loss}")


    model.eval()

    all_val_preds = []
    all_val_labels = []

    with torch.no_grad():
        for tokenized_comment, mask, label in val_loader:
            tokenized_comment = tokenized_comment.to(device)
            mask = mask.to(device)
            label = label.to(device)

            outputs = model(tokenized_comment, attention_mask=mask, labels=label, return_dict=True)

            logits = outputs.logits #output values tensor[16,2] = (batch_size, num_classes) of output values prior to softmaxing
            preds = np.argmax(logits, axis=1)

            
            all_val_preds.append(preds.detach().cpu().numpy())
            all_val_labels.append(label.detach().cpu().numpy())

    

    all_val_preds = np.concatenate(all_val_preds)
    all_val_labels = np.concatenate(all_val_labels)

    print(f"EPOCH {epoch + 1} finished")
    print('Accuracy:', accuracy_score(label,preds))
    print('Precision, Recall, F1:',precision_recall_fscore_support(all_val_labels, all_val_preds, average='binary'))


  0%|          | 0/3 [00:00<?, ?it/s]

batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
batch of size 16 finished
epoch training loss = 13.97217546403408


  _warn_prf(average, modifier, msg_start, len(result))
 33%|███▎      | 1/3 [15:06<30:12, 906.28s/it]

EPOCH 1 finished
Accuracy: 1.0
Precision, Recall, F1: (0.0, 0.0, 0.0, None)
batch of size 16 finished
