Heavily adapted from: 
https://github.com/tobyatgithub/bert_tutorial/blob/master/Bert_tutorial1_embeddings.ipynb

We will be using pretrained BERT model to go from raw words into latent embeddings
#### word -> tokens -> ids -> hidden states -> embeddings

In [None]:
!conda install pytorch torchvision -c pytorch
!pip install pytorch_pretrained_bert

In [1]:
import pandas as pd
import nlp
#import nltk
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dakaspar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
labels = nlp.load_labels('articles_db.db')
corpus = nlp.load_articles('articles_db.db')
df = pd.DataFrame(corpus, columns=['title', 'body'])
df.head(2)

Unnamed: 0,title,body
0,Brexit’s generational divide is a major fault ...,"PORTSMOUTH, England — Eddie Izzard, one of Bri..."
1,UK votes for Brexit: What just happened -- and...,"London (CNN)As dawn broke over the UK Friday, ..."


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [29]:
df['prepped_body'] = df['body'] + " [SEP]"
df.loc[0, 'prepped_body'] = "[CLS] " + df.loc[0, 'prepped_body']
df['tokens'] = None
len(df)

198

In [30]:
for i, text in enumerate(df['prepped_body']):
    df.loc[i, 'tokens'] = tokenizer.tokenize(text)
    df.loc[i, 'tokens'] = df.loc[i, 'tokens'][:512]

In [36]:
# word -> tokens -> ids -> hidden states -> embeddings

all_tokens = []
input_type_ids = []
# masks for segment, 0 for the first sentence, 1 for the second sentence.
# use 1 if there's only one sentence.

for i, tokens in enumerate(df['tokens']):
    for token in tokens:
        all_tokens.append(token)
        input_type_ids.append(i)
print(len(input_type_ids))
input_type_ids = input_type_ids[:511]
print(len(input_type_ids))
# print("tokens:", tokens)   
# print("type_ids:", input_type_ids)

84344
511


In [37]:
len(all_tokens)

84344

In [38]:
# We can only use 512 tokens with BERT
input_ids = tokenizer.convert_tokens_to_ids(all_tokens[:511])
for pair in zip(tokens[:25], input_ids[:25]):
    print(pair)
# notice the case ---> uncased

('things', 101)
('fall', 10913)
('apart', 1010)
('.', 2563)
('and', 1517)
('now', 5752)
(',', 1045)
('"', 20715)
('things', 4103)
('"', 1010)
('includes', 2028)
('the', 1997)
('european', 3725)
('union', 1521)
('.', 1055)
('british', 2087)
('voters', 3297)
('delivered', 25119)
('a', 1010)
('well', 2003)
('-', 2006)
('aimed', 1037)
('kick', 23624)
('at', 2278)
('the', 1010)


In [43]:
# padding
seq_length = 512 # max allowed length & padding length for each pair of sentences. 512
input_mask = [1] * len(input_ids)
print(input_ids[:20])
print(input_mask[:20])
print(input_type_ids[:20])
while len(input_ids) < seq_length:
    input_ids.append(0)
    input_mask.append(0)
    input_type_ids.append(0)
    
print()
print(input_ids[:20])
print(input_mask[:20])
print(input_type_ids[:20])

[101, 10913, 1010, 2563, 1517, 5752, 1045, 20715, 4103, 1010, 2028, 1997, 3725, 1521, 1055, 2087, 3297, 25119, 1010, 2003]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]])

[101, 10913, 1010, 2563, 1517, 5752, 1045, 20715, 4103, 1010, 2028, 1997, 3725, 1521, 1055, 2087, 3297, 25119, 1010, 2003]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
tensor([[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]])


In [40]:
print(len(input_ids))
print(len(input_mask))
print(len(input_type_ids))

512
512
512


In [41]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')
# model = model.cuda()
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval();

In [42]:
# Predict hidden states features for each layer
with torch.no_grad():
    # ids -> hidden state vectors
    input_tensor = torch.LongTensor(input_ids).view(-1,1)
    input_mask = torch.LongTensor(input_mask).view(-1,1)
    input_type_ids = torch.LongTensor(input_type_ids).view(-1,1)
    
    print(input_tensor.shape)
    print(input_mask.shape)  
    print(input_type_ids.shape)
    encoded_layers, _ = model(input_tensor, token_type_ids=input_type_ids, attention_mask=input_mask)

torch.Size([512, 1])
torch.Size([512, 1])
torch.Size([512, 1])


In [51]:
# to get the token embedding vector, we can sum the last four
#print(text_a, text_b)
sum_last_four = torch.sum(torch.stack(encoded_layers[-4:]), dim=0)
print('\n\n', sum_last_four.shape)



 torch.Size([512, 1, 768])


In [52]:
print(torch.cat(encoded_layers[-4:]).shape)

torch.Size([2048, 1, 768])


In [None]:
# -----------------------------EVERYTHING BELOW IS OLD MATERIAL ---- NOT USED ---------

In [None]:
nltk.download('stopwords')

In [None]:
def process_document(text):
	"""
	Processes a text document by coverting all words to lower case,
	tokenizing, removing all non-alphabetical characters,
	and stemming each word.
	Args:
		text: A string of the text of a single document.
	Returns:
		A list of processed words from the document.
	"""
	# Convert words to lower case
	text = text.lower()

	# Tokenize corpus and remove all non-alphabetical characters
	tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
	tokens = tokenizer.tokenize(text)

	# Remove stopwords
	stop_words = nltk.corpus.stopwords.words('english')
	set_stopwords = set(stop_words)
	stopwords_removed = [token for token in tokens if not token in set_stopwords]

	# Stem words
	stemmer = nltk.stem.SnowballStemmer('english')
	stemmed = [stemmer.stem(word) for word in stopwords_removed]

	# Return list of processed words
	return stemmed

In [None]:
df = pd.DataFrame(corpus, columns=['title', 'body'])

In [None]:
process_document(corpus[0][1])

In [None]:
corpus[0][1]