In [5]:
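# One-time setup: install dependencies and download the Cornell Movie-Dialogs corpus into ./datasets (uncomment to run)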

# !pip install transformers datasets tokenizers
# !wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
# !unzip -qq cornell_movie_dialogs_corpus.zip
# !rm cornell_movie_dialogs_corpus.zip
# !mkdir datasets
# !mv cornell\ movie-dialogs\ corpus/movie_conversations.txt ./datasets
# !mv cornell\ movie-dialogs\ corpus/movie_lines.txt ./datasets

In [15]:
import utils
import torch
import BERTDataset


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Data Prep 

In [7]:
import ast
import itertools
import math
import os
import random
import re
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F
import tqdm
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# Cap applied to every utterance below (in whitespace-separated words); the
# same constant is later passed as `seq_len` when the dataset is built.
MAX_LEN = 64

# Field delimiter used on every row of both corpus files.
FIELD_SEP = " +++$+++ "

### loading all data into memory
corpus_movie_conv = './datasets/movie_conversations.txt'
corpus_movie_lines = './datasets/movie_lines.txt'
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as conv_file:
    conv = conv_file.readlines()
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as lines_file:
    lines = lines_file.readlines()

### splitting text using special lines
# Map each row's first field (the line id) to its last field (the utterance).
lines_dic = {}
for line in lines:
    objects = line.split(FIELD_SEP)
    lines_dic[objects[0]] = objects[-1]

### generate question answer pairs
# Every pair of consecutive utterances in a conversation becomes one
# [first, second] sample; each side is truncated to MAX_LEN words.
pairs = []
for con in conv:
    if not con.strip():
        # Tolerate blank rows instead of failing while parsing them.
        continue

    # The last field is a Python list literal of line ids. Parse it with
    # ast.literal_eval rather than eval(): the text comes from a downloaded
    # file and must never be executed as code.
    ids = ast.literal_eval(con.split(FIELD_SEP)[-1])

    # zip(ids, ids[1:]) yields (ids[i], ids[i+1]) for every adjacent pair and
    # naturally stops before the last id, which has no successor.
    for first_id, second_id in zip(ids, ids[1:]):
        first = lines_dic[first_id].strip()
        second = lines_dic[second_id].strip()
        pairs.append([
            ' '.join(first.split()[:MAX_LEN]),
            ' '.join(second.split()[:MAX_LEN]),
        ])

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Total number of [first, second] utterance pairs collected in `pairs`.
len(pairs)

221616

In [11]:
# Inspect one sample: a two-element list [utterance, following utterance].
pairs[2]

['Not the hacking and gagging and spitting part. Please.',
 "Okay... then how 'bout we try out some French cuisine. Saturday? Night?"]

### Tokenizer 


In [14]:
# WordPiece tokenizer

### save data as txt file
# Dump the first utterance of every pair to ./data in files of 10K lines each;
# these files are the training corpus for the tokenizer below.
os.makedirs('./data', exist_ok=True)
text_data = []
file_count = 0

for sample in tqdm.tqdm([x[0] for x in pairs]):
    text_data.append(sample)

    # once we hit the 10K mark, save to file
    if len(text_data) == 10000:
        with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# Flush the final partial batch. Without this, every sample after the last
# multiple of 10K is silently dropped from tokenizer training (with the
# 221,616 pairs reported earlier in this notebook that is 1,616 samples).
if text_data:
    with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))
    text_data = []
    file_count += 1

# Sorted so the list of training files has the same order on every run.
# NOTE(review): the glob also picks up any .txt left in ./data by earlier
# runs; clear the directory first if the corpus has changed.
paths = sorted(str(x) for x in Path('./data').glob('**/*.txt'))

### training own tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train(
    files=paths,
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    # NOTE(review): the sample printed later in this notebook shows 0 used as
    # padding, 1 at the start and 2 as separator, which matches this ordering.
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )


os.makedirs('./bert-it-1', exist_ok=True)
# The vocabulary is saved under the 'bert-it' prefix and then reloaded from
# ./bert-it-1/bert-it-vocab.txt as a transformers BertTokenizer, replacing the
# trainer object bound to `tokenizer`.
tokenizer.save_model('./bert-it-1', 'bert-it')
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)

100%|██████████| 221616/221616 [00:00<00:00, 3549769.43it/s]









## Create a Dataset

In [18]:
from BERTDataset import BERTDataset

In [19]:
# Wrap the utterance pairs in the project's BERTDataset, using the trained
# tokenizer and MAX_LEN as the sequence length.
train_data = BERTDataset(pairs, seq_len=MAX_LEN, tokenizer=tokenizer)

# Shuffled mini-batches of 32 examples, collated into pinned host memory.
loader_options = dict(batch_size=32, shuffle=True, pin_memory=True)
train_loader = DataLoader(train_data, **loader_options)

# Pull a single batch from a fresh iterator to inspect what the loader yields.
batch_iterator = iter(train_loader)
sample_data = next(batch_iterator)

In [33]:
# Fetch the dataset item at index 1 and display it to see which fields
# a single example contains.
one_data = train_data[1]
one_data

{'bert_input': tensor([    1,   303,    15,     3,   515,   184,    11,    43,   672,   231,
         15148,   295,  7731,    93,   242,    15,   270,   173,    11,    58,
           459,   231,   146,    17,     2,   213,   150, 15709,   179,  6889,
          2986,   179, 17761,   610,    17,   630,    17,     2,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'bert_label': tensor([ 0,  0,  0, 48,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 'segment_label': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0

In [None]:
# Shape of the token-id tensor of the sample (the tensor printed above has
# 64 entries; this cell has not been executed, so no output is recorded).
one_data['bert_input'].shape

## Embeddings

- Token embedding 
- Segment embedding 
- Position embedding

In [34]:
from embeddings import PositionalEmbedding, BERTEmbedding

In [36]:
# 1-D tensor holding the integers 0 through 4.
x = torch.arange(0,5)
x 

tensor([0, 1, 2, 3, 4])

In [43]:
# Instantiate the project's PositionalEmbedding with small sizes
# (d_model=10, max_len=64) so its output is easy to inspect.
pos = PositionalEmbedding(d_model=10, max_len=64)

In [48]:
pos(3).shape

# so any call to pos just returns the entire positional embedding table,
# which has the following shape: (1, max_len, d_model)

torch.Size([1, 64, 10])

In [52]:
# here is how you can see the embedding for position index 4
# (i.e. the fifth position, since indexing is 0-based)
pos(3)[0,4,:]

tensor([-7.5680e-01,  8.0569e-01,  1.0031e-01,  9.9987e-01,  2.5238e-03,
         1.0000e+00,  6.3396e-05,  1.0000e+00,  1.5924e-06,  1.0000e+00])

In [54]:
# so no matter what argument pos is called with, the embedding table is the
# same: pos(5) returns exactly the row that pos(3) returned for index 4
pos(5)[0,4,:]

tensor([-7.5680e-01,  8.0569e-01,  1.0031e-01,  9.9987e-01,  2.5238e-03,
         1.0000e+00,  6.3396e-05,  1.0000e+00,  1.5924e-06,  1.0000e+00])

In [55]:
# Instantiate the project's BERTEmbedding with a toy vocabulary of 100 ids
# and 10-dimensional vectors.
# NOTE(review): presumably this sums the token, segment and position
# embeddings listed above; confirm in embeddings.py.
whole_embed = BERTEmbedding(vocab_size=100, embed_size=10)

In [59]:
# A one-element tensor holding the value 4, used as a token id in the next cell.
torch.tensor([4])

tensor([4])

In [61]:
# one token (i.e. id 4) with segment id 1 is embedded into this shape
whole_embed(torch.tensor([4]), torch.tensor([1])).shape

torch.Size([1, 64, 10])

In [66]:
# Reshape four token ids into a (4, 1) tensor: four rows of one id each.
torch.tensor([4,3,67,44]).view((4,1)).shape

torch.Size([4, 1])

In [67]:
# if I give a batch of 4 examples (one token id and one segment id each)
# to embed, the leading dimension of the result becomes 4
whole_embed(torch.tensor([4,3,67,44]).view(4,1), torch.tensor([[1,1,2,2]]).view(4,1)).shape

torch.Size([4, 64, 10])

In [68]:
# let's check the embedded values at batch index 1, position index 10
# NOTE(review): each example contains a single token, so the 64 positions
# presumably come from broadcasting against the positional table; confirm
# in embeddings.py.
whole_embed(torch.tensor([4,3,67,44]).view(4,1), torch.tensor([[1,1,2,2]]).view(4,1))[1,10,:]

tensor([-0.1945, -1.9649,  0.7537,  1.4467, -0.0335, -0.3770, -0.9488, -0.5267,
         0.8116,  3.4542], grad_fn=<SliceBackward0>)

In [70]:
# let's increase the dropout rate to 0.9

# as expected, roughly 90% of the entries are zeroed (9 of the 10 values in
# the row shown below).
# NOTE(review): the surviving value is large, presumably because dropout
# rescales kept entries by 1 / (1 - p); confirm in embeddings.py. No seed is
# set in the cells shown, so which entries survive will vary between runs.

high_dropout_embed = BERTEmbedding(vocab_size=100, embed_size=10, dropout=0.9)
high_dropout_embed(torch.tensor([4]), torch.tensor([1]))[0,0,:]

tensor([ -0.0000,  -0.0000,  -0.0000,   0.0000,   0.0000,   0.0000,  -0.0000,
        -15.6981,   0.0000,   0.0000], grad_fn=<SliceBackward0>)