In [5]:
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
import torch
import json

In [2]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
model = DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
train_lines = open("./Data/fox-news-comments.json", "r").readlines() # original 2015 data
test_lines = open("./Data/modern_comments.json", "r").readlines() # modern data

def getCommentsTitlesLabels(file_lines):
    comment_list = []
    title_list = []
    labels = []
    for line in file_lines:
        content = json.loads(line)

        comment = content['text']
        comment_list.append(comment)

        title = content['title']
        title_list.append(title)

        labels.append(content['label'])
    
    return comment_list,title_list,labels

train_comments, train_titles, train_labels = getCommentsTitlesLabels(train_lines)
test_comments,test_titles,test_labels = getCommentsTitlesLabels(test_lines)

tokenized_train_comments = tokenizer(train_comments[:5], padding = True, truncation = True, return_tensors="pt")
tokenized_train_titles = tokenizer(train_titles[:5], padding = True, truncation = True, return_tensors="pt")

tokenized_test_comments = tokenizer(train_comments[:5], padding = True, truncation = True, return_tensors="pt")
tokenized_test_titles = tokenizer(train_titles[:5], padding = True, truncation = True, return_tensors="pt")

In [7]:
print(tokenized_train_comments['input_ids'].shape)

torch.Size([5, 106])


In [15]:
# Put the model in eval mode to turn off dropout regularization etc.
model.eval()


#use torch.no_grad() to speed up the embedding process
with torch.no_grad():
    outputs = model(**tokenized_train_comments) # (last_hidden_state,hidden_states[optional], attentions[optional])

    final_comments_embeddings = outputs[0]
    comments_embeddings = outputs[1] 
    # tuple of hidden states at each layer of DistilBERT
    # (comments_embeddings[0] corresponds to first layer, comments_embeddings[6] corresponds to last hidden layer)

In [16]:
print(len(comments_embeddings))
print([comments_embeddings[i].shape for i in range(len(comments_embeddings))])

7
[torch.Size([5, 106, 768]), torch.Size([5, 106, 768]), torch.Size([5, 106, 768]), torch.Size([5, 106, 768]), torch.Size([5, 106, 768]), torch.Size([5, 106, 768]), torch.Size([5, 106, 768])]


In [20]:
print(final_embeddings[:1])
print(comments_embeddings[6][:1])

tensor([[[-0.1364, -0.0461,  0.0592,  ..., -0.0681,  0.2070,  0.2929],
         [ 0.2435, -0.3248,  0.4040,  ..., -0.3834,  0.6307, -0.0926],
         [-0.0554, -0.6182,  0.2613,  ..., -0.1236,  0.3009, -0.4368],
         ...,
         [ 0.3537,  0.1837,  0.1733,  ..., -0.1353,  0.1789, -0.1086],
         [ 0.1004,  0.0388,  0.1707,  ..., -0.0324,  0.0140,  0.0051],
         [ 0.1112, -0.0319,  0.1162,  ...,  0.1083,  0.0094,  0.0175]]])
tensor([[[-0.1364, -0.0461,  0.0592,  ..., -0.0681,  0.2070,  0.2929],
         [ 0.2435, -0.3248,  0.4040,  ..., -0.3834,  0.6307, -0.0926],
         [-0.0554, -0.6182,  0.2613,  ..., -0.1236,  0.3009, -0.4368],
         ...,
         [ 0.3537,  0.1837,  0.1733,  ..., -0.1353,  0.1789, -0.1086],
         [ 0.1004,  0.0388,  0.1707,  ..., -0.0324,  0.0140,  0.0051],
         [ 0.1112, -0.0319,  0.1162,  ...,  0.1083,  0.0094,  0.0175]]])


In [35]:
last_hidden_states = comments_embeddings.last_hidden_state 

In [36]:
print(last_hidden_states.shape)

torch.Size([100, 106, 768])


In [None]:
"""
Cleans the dataset and returns the 

@param file_lines: list of lines in the input file where each line contains all the information for a given comment (content + title + author title + etc.)

@returns [labels, comment_list, title_list, max_len, max_title_len]
    labels: file
    comment_list: list of all comments in the file
    title_list: list of all titles in the file
    max_comment_len: length of the longest comment in the dataset
    max_title_len: length of the longest title in the dataset
"""
def clean(file_lines):
    max_len = 0
    max_title_len = 0  
    comment_list = []
    title_list = []
    label = []
    for line in file_lines:
        comment = json.loads(line)
        
        t = comment['text']
        t = ' '.join([x for x in t.split() if x[0] != '@'])
        t = ' '.join(re.findall("[a-zA-Z,.]+",t))
        t = t.replace(',', ' ')
        t = t.replace('.', ' ')
        text = word_tokenize(t)
        text = [x for x in text if x.lower() not in stop_words]
        max_len = max(max_len, len(text))
        comment_list.append(text)
        
        title = comment['title']
        title = title.replace(',', '')
        title = title.replace('.', '')
        title = re.findall("[a-zA-Z,.]+",title)
        title_list.append(title)
        max_title_len = max(max_title_len, len(title))
        
        label.append(comment['label'])
    
    labels = np.array(label)
    return labels, comment_list, title_list, max_len, max_title_len


"""
Returns word2vec embeddings for an input word string

@param word : a string
@param embed : the embedding keyed vectors (in our case word2vec)
@returns : the (300,0) embedding for word
"""
def get_embed(word, embed):
    x = np.zeros((300,)) # default value should be 0
    corrected = spell.correction(word) # closest correction
    if word in embed: # base word
        x = embed[word]
    elif word.upper() in embed: # capitalized (edge case for acronyms like BLM) (for some reason blm doesn't exist but BLM does?)
        x = embed[word.upper()]
    elif word.lower() in embed: # opposite of capitalization
        x = embed[word.lower()]
    elif corrected in embed: # last case, check if closest correction exists (might be bad, some corrections are kinda ass)
        x = embed[corrected]
    
    return x

"""
Converts the lists for comments, titles into ndarrays

@params : straightforward
@returns: [comment_array,title_array] list of ndarrays for comments and titles
"""
def to_array(embed, comments, titles, max_comment_len, max_title_len):
    comment_array = np.zeros((len(comments), max_comment_len, 300))
    title_array = np.zeros((len(titles), max_title_len, 300))
    for ix1, sent in enumerate(comments):
        for ix2, word in enumerate(sent):
            comment_array[ix1,ix2] = get_embed(word,embed)
    for ix1, title in enumerate(titles):
        for ix2, word in enumerate(title):
            title_array[ix1,ix2] = get_embed(word,embed)
    
    return comment_array, title_array

"""
Randomly shuffles the outputs of to_array
"""
def custom_shuffle(comments,titles,labels):
    """
    comments/title is a (batch_size, max_comment/title_length,embedding size) ndarray, this means we need batch_first=true in nn.LSTM
    comment_array.shape = (batch_size, max_comment_len, 300) #300 is wor2vec embedding size
    labels.shape = (batch_size,)
    """
    num_samples, _ , _ = comments.shape
    shuffled_indices = np.random.permutation(num_samples) #return a permutation of the indices
    
    shuffled_comments = comments[shuffled_indices,:,:]
    shuffled_titles = titles[shuffled_indices,:,:]
    shuffled_labels = labels[shuffled_indices]

    return (shuffled_comments, shuffled_titles, shuffled_labels)


def process_data(embed):
    train_lines = open("./Data/fox-news-comments.json", "r").readlines() # original 2015 data
    test_lines = open("./Data/modern_comments.json", "r").readlines() # modern data

    train_labels, train_comments, train_titles, train_max_len, train_max_title_len = clean(train_lines)
    test_labels, test_comments, test_titles, test_max_len, test_max_title_len = clean(test_lines)

    train_comment_array, train_title_array = to_array(embed, train_comments, train_titles, train_max_len, train_max_title_len)
    test_comment_array, test_title_array = to_array(embed, test_comments, test_titles, test_max_len, test_max_title_len)

    train_comment_array, train_title_array, train_labels = custom_shuffle(train_comment_array,train_title_array,train_labels)
    test_comment_array, test_title_array, test_labels = custom_shuffle(test_comment_array, test_title_array, test_labels)

    train_comment_array = np.float32(train_comment_array)
    train_title_array = np.float32(train_title_array)

    test_comment_array = np.float32(test_comment_array)
    test_title_array = np.float32(test_title_array)

    return {'train' : [train_comment_array, train_title_array, train_labels], 'test' : [test_comment_array, test_title_array, test_labels]}