<a href="https://colab.research.google.com/github/michealman114/Natural-Language-Models-for-Hate-Speech-Classification/blob/main/Embeddings_from_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
TODO: Decrease embedding size from BERT, currently is at least 768
TODO: Further improve embedding choice - either: sum last 4 hidden layers OR concatenate last 4 hidden layers
TODO: Switch from DistilBert to Bert?
"""

'\nTODO: Decrease embedding size from BERT, currently is at least 768\nTODO: Further improve embedding choice - either: sum last 4 hidden layers OR concatenate last 4 hidden layers\nTODO: Switch from DistilBert to Bert?\n'

In [1]:
!pip install transformers



In [2]:
import json
import os

import numpy as np
import torch
import torch.nn as nn 
import torch.optim as optim
import torch.utils.data as torch_data

from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig

In [3]:
from torch import cuda

# Seed unconditionally so `seed` exists on every code path and the CPU
# generator is seeded as well as the CUDA ones.
seed = 4814
torch.manual_seed(seed)

# Pick the device that the model and the tokenized batches are moved to.
if cuda.is_available():
    device = 'cuda'
    torch.cuda.manual_seed_all(seed)
    print("running on GPU:", torch.cuda.get_device_name(0))
else:
    device = 'cpu'
    print("running on CPU")

running on GPU: Tesla T4


In [4]:
def getCommentsTitlesLabels(file_lines):
    """Parse JSON-lines records into parallel lists of comments, titles and labels.

    Args:
        file_lines: iterable of strings, each holding one JSON object with the
            keys 'text', 'title' and 'label'. Lines that are empty or contain
            only whitespace are skipped.

    Returns:
        A tuple (comment_list, title_list, labels) of three lists in which
        index i of each list comes from the same input record.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks 'text', 'title' or 'label'.
    """
    comment_list = []
    title_list = []
    labels = []
    for line in file_lines:
        # A trailing newline at the end of the file yields a blank line,
        # which json.loads would reject.
        if not line.strip():
            continue

        content = json.loads(line)

        comment_list.append(content['text'])
        title_list.append(content['title'])
        labels.append(content['label'])

    return comment_list, title_list, labels

In [5]:
# One checkpoint name shared by the config, the model weights and the tokenizer.
pretrained_name = 'distilbert-base-uncased'

# output_hidden_states=True makes the model return the hidden states as well
# as the final layer; the embedding loop reads them.
config = DistilBertConfig.from_pretrained(pretrained_name, output_hidden_states=True)
model = DistilBertModel.from_pretrained(pretrained_name, config=config).to(device)
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
#test_lines = open("./Data/modern_comments.json", "r").readlines() # modern data
#test_comments,test_titles,test_labels = getCommentsTitlesLabels(test_lines)
#test_labels = torch.tensor(test_labels)

#tokenized_test_comments = tokenizer(test_comments, padding = True, truncation = True, return_tensors="pt").to(device)
#tokenized_test_titles = tokenizer(test_titles, padding = True, truncation = True, return_tensors="pt").to(device)

In [6]:
# Read the file inside a context manager so the handle is closed, and pin the
# encoding so the result does not depend on the platform default.
with open("./Data/fox-news-comments.json", "r", encoding="utf-8") as train_file: # original 2015 data
    train_lines = train_file.readlines()

# One JSON record per line -> parallel lists of comments, titles and labels.
train_comments, train_titles, train_labels = getCommentsTitlesLabels(train_lines)
train_labels = torch.tensor(train_labels)

In [8]:
# Size of the first dimension of the label tensor, i.e. how many labels were loaded.
train_labels.shape[0]

1528

In [7]:
class ProcessingDataset(torch.utils.data.Dataset): # renamed to ProcessingDataset to avoid reuse of name
    """Index-aligned view over comments, titles and labels.

    Item i is the tuple (comments[i], titles[i], labels[i]). The dataset does
    no tokenization or embedding itself; it only pairs up the three inputs so
    a DataLoader can batch them together.
    """

    def __init__(self, comments, titles, labels):
        """
        comments/titles: indexable sequences with one entry per example
        labels: indexable sequence or tensor with one entry per example

        Raises ValueError if the three inputs do not have the same length.
        """
        n_comments, n_titles, n_labels = len(comments), len(titles), len(labels)
        if not (n_comments == n_titles == n_labels):
            raise ValueError(
                "comments, titles and labels must have the same length, got "
                f"{n_comments}, {n_titles} and {n_labels}"
            )

        #Initialization
        self.comments = comments
        self.titles = titles
        self.labels = labels
        # len() works for plain sequences as well as tensors.
        self.length = n_labels

    def __len__(self):
        """Return the number of examples."""
        return self.length

    def __getitem__(self, index):
        """Return the (comment, title, label) triple stored at `index`."""
        # Load data and get label
        comment = self.comments[index]
        title = self.titles[index]
        label = self.labels[index]

        return comment,title,label

In [8]:
# Pair each comment and title with its label so they are batched together.
train_data = ProcessingDataset(train_comments, train_titles, train_labels)
# NOTE(review): shuffle=True means batches are not drawn in file order. The
# embedding loop appends the labels batch by batch, so they stay aligned with
# the embeddings, but the stored tensors will not match the row order of the
# source file. Confirm this is intended.
train_loader = torch_data.DataLoader(train_data, batch_size=128, shuffle=True)

In [15]:
# Quick look at the first five raw comment strings.
print(train_comments[:5])

['Merkel would never say NO', 'Expect more and more women to be asking .. "why are men no longer interested in me"! We\'re not going touch you until you pull our pants down!', "Groping people in public wasn't already illegal? What's up with that, Deutschland?", 'Merkel, possible the only person in charge who is worse than what we have. Obama is trying his hardest though to get to Merkel\'s level. "A 21-year-old Iraqi man was convicted of sexual assault and given a one-year suspended sentence. A 26-year-old Algerian man was convicted of abetting a sexual assault and attempted assault, and given the same sentence." Sounds exactly like how Obama is trying to let all of the black people in this country get away with everything.', 'They know very well, no means NO! They need to pass a law making it legal to castrate those animals.']


In [9]:
# Put the model in eval mode to turn off dropout regularization etc.
model.eval()

# Per-batch results; a later cell concatenates each list into one tensor.
embedded_train_comments = []
comments_attention_masks = []
embedded_train_titles = []
titles_attention_masks = []
BERT_train_labels = []


def _embed_batch(texts):
    """Tokenize one batch of strings and return its embeddings and attention mask.

    Args:
        texts: the batch of raw strings that train_loader yields for one field
            (comments or titles).

    Returns:
        (embeddings, attention_mask), both moved to the CPU. `embeddings` is
        the second-to-last entry of the model's hidden-state tuple and
        `attention_mask` is the tokenizer's mask for the padded batch.
    """
    # Every batch is padded/truncated to exactly 512 tokens so the per-batch
    # tensors all share the same trailing dimensions when concatenated.
    tokenized = tokenizer(texts, padding = "max_length", max_length = 512, truncation = True, return_tensors="pt").to(device)

    outputs = model(**tokenized) # (last_hidden_state,hidden_states[optional], attentions[optional])
    hidden_states = outputs[1] #tuple of hidden states from each layer of DistilBERT

    # Move results off the accelerator right away: only one batch of
    # activations lives on the device at a time, and the tensors saved later
    # can be loaded on a machine without CUDA.
    return hidden_states[-2].cpu(), tokenized['attention_mask'].cpu()


#use torch.no_grad() to speed up the embedding process
with torch.no_grad():
    # comments: batch of num_comments strings
    # titles: batch of num_comments strings
    # labels: torch.tensor of size (num_comments)
    for i, (comments, titles, labels) in enumerate(train_loader, start=1):
        comment_embeddings, comment_mask = _embed_batch(comments)
        title_embeddings, title_mask = _embed_batch(titles)

        #print(comment_embeddings.shape) #(batch_size=128, length = 512, embed_dim = 768)

        embedded_train_comments.append(comment_embeddings)
        embedded_train_titles.append(title_embeddings)
        BERT_train_labels.append(labels)

        comments_attention_masks.append(comment_mask)
        titles_attention_masks.append(title_mask)

        print(f'iteration {i} completed')


iteration 1 completed
iteration 2 completed
iteration 3 completed
iteration 4 completed
iteration 5 completed
iteration 6 completed
iteration 7 completed
iteration 8 completed
iteration 9 completed
iteration 10 completed
iteration 11 completed
iteration 12 completed
torch.Size([1528, 512, 768])
torch.Size([1528, 512, 768])
torch.Size([1528, 512])
torch.Size([1528, 512])


In [18]:
# Join the per-batch pieces collected by the embedding loop into one tensor each.
BERT_train_comments_embeddings = torch.cat(embedded_train_comments)
BERT_train_titles_embeddings = torch.cat(embedded_train_titles)
BERT_train_comments_attention_masks = torch.cat(comments_attention_masks)
BERT_train_titles_attention_masks = torch.cat(titles_attention_masks)

# The embedding loop stores its list of label batches under the same name as
# the final tensor. Concatenate only while it is still that list, so running
# this cell a second time does not pass an already-joined tensor to torch.cat.
if isinstance(BERT_train_labels, list):
    BERT_train_labels = torch.cat(BERT_train_labels)


# Print every shape to check that all five tensors agree on the first dimension.
for joined in (
    BERT_train_comments_embeddings,
    BERT_train_titles_embeddings,
    BERT_train_comments_attention_masks,
    BERT_train_titles_attention_masks,
):
    print(joined.shape)

print(BERT_train_labels.shape)

torch.Size([1528])


In [11]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('./StoredTensors', exist_ok=True)

# Embeddings for comments and titles.
torch.save(BERT_train_comments_embeddings, './StoredTensors/BERT_train_comments_embeddings.pt')
torch.save(BERT_train_titles_embeddings, './StoredTensors/BERT_train_titles_embeddings.pt')

# Attention masks for the same padded batches.
torch.save(BERT_train_comments_attention_masks, './StoredTensors/BERT_train_comments_attention_masks.pt')
torch.save(BERT_train_titles_attention_masks, './StoredTensors/BERT_train_titles_attention_masks.pt')

# Labels, collected batch by batch alongside the embeddings.
torch.save(BERT_train_labels, './StoredTensors/BERT_train_labels.pt')

In [13]:
# Mount Google Drive at /content/drive so files can be copied to and read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Copy the saved tensors into the root of the mounted Drive.
# NOTE(review): the later torch.load cell reads from
# 'drive/MyDrive/Natural Language Models for Hate Speech Classification/StoredTensors/',
# not from the MyDrive root written here. Confirm the files are moved there by some other step.
!cp ./StoredTensors/BERT_train_titles_attention_masks.pt /content/drive/MyDrive
!cp ./StoredTensors/BERT_train_titles_embeddings.pt /content/drive/MyDrive
!cp ./StoredTensors/BERT_train_comments_attention_masks.pt /content/drive/MyDrive
!cp ./StoredTensors/BERT_train_comments_embeddings.pt /content/drive/MyDrive

!cp ./StoredTensors/BERT_train_labels.pt /content/drive/MyDrive

In [22]:
# Reload one stored tensor from Drive as a sanity check.
# NOTE(review): despite the variable name, the file loaded here is the comments
# *attention mask* tensor, not the comment embeddings.
# NOTE(review): the path is relative, so it presumably relies on the working directory being /content. Confirm.
train_comments_array = torch.load('drive/MyDrive/Natural Language Models for Hate Speech Classification/StoredTensors/BERT_train_comments_attention_masks.pt')

In [23]:
# Print the shape of the tensor that was just reloaded from Drive.
print(train_comments_array.shape)

torch.Size([1528, 512])
