### 1. Read Text File

In [10]:
# Load the entire text file into memory as a single string.
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

print('Number of characters: ', len(raw_text))
# Preview the first 99 characters as the cell output.
raw_text[:99]

Number of characters:  20479


'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no '

### 2. Tokenizer

#### 2.1 Word Based Tokenizer

#### 2.1.1 Word Based Tokenizer (Regex splitting trials)

In [7]:
import regex as re

text = 'Hello, world. This, is a test.'
result = re.split(r'(\s)', text)
result

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']

The result is a list of words (with punctuation still attached) and whitespace characters. This tokenization keeps punctuation together with the words, such as `Hello,`, but we might want to capture punctuation marks as separate tokens.

In [8]:
result = re.split(r'([,.]|\s)', text)
result

['Hello',
 ',',
 '',
 ' ',
 'world',
 '.',
 '',
 ' ',
 'This',
 ',',
 '',
 ' ',
 'is',
 ' ',
 'a',
 ' ',
 'test',
 '.',
 '']

In [9]:
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
result

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']

#### 2.1.2 Applying to main text

In [11]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [13]:
print('Total number of tokens obtained:', len(preprocessed))

Total number of tokens obtained: 4690


### 3. Creating Token IDs (with Word Based Tokenizer)

In [15]:
tokens = sorted(set(preprocessed))
vocab_size = len(tokens)
vocab_size

1130

In [16]:
vocab = {token: idx for idx, token in enumerate(tokens)}
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

#### 3.1 Create Tokenizer Class

In [46]:
class Tokenizer():
    """
    Word Based Tokenizer

    Splits text on whitespace, ``--`` and a fixed set of punctuation
    characters, and maps every resulting token to an integer id learned by
    ``train``. Two special tokens are always part of the vocabulary:
    ``<|unk|>`` (used by ``encode`` for tokens not seen during training) and
    ``<|endoftext|>``.

    Attributes:
        str_to_int: dict mapping token string -> id (``None`` until trained).
        int_to_str: dict mapping id -> token string (``None`` until trained).
        unknown_token: string used for out-of-vocabulary tokens.
        eot_token: end-of-text marker string.
    """
    def __init__(self):
        self.str_to_int = None
        self.int_to_str = None
        self.unknown_token = '<|unk|>'
        self.eot_token = '<|endoftext|>'

    def _require_trained(self):
        """Raise a clear error when the vocabulary has not been built yet."""
        if self.str_to_int is None or self.int_to_str is None:
            raise RuntimeError(
                'Tokenizer has no vocabulary yet; call train(text) first.'
            )

    def apply_regex_splitting(self, text):
        """Split ``text`` into word and punctuation tokens.

        The capturing group keeps the separators in the result; whitespace-only
        and empty pieces are then dropped.

        Returns:
            list[str]: the non-empty, stripped tokens in order of appearance.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        return [piece.strip() for piece in pieces if piece.strip()]

    def train(self, text):
        """Build the vocabulary from ``text``.

        Ids are assigned in sorted token order. The special tokens are
        appended after the regular tokens; if the training text already
        contains one of them, its existing id is kept so that ids stay
        contiguous and ``int_to_str`` stays a complete inverse of
        ``str_to_int``.
        """
        all_words = sorted(set(self.apply_regex_splitting(text)))

        str_to_int = {token: idx for idx, token in enumerate(all_words)}
        for special in (self.unknown_token, self.eot_token):
            # Do not overwrite an id that came from the training text itself:
            # that would leave a hole in the id range.
            if special not in str_to_int:
                str_to_int[special] = len(str_to_int)

        self.str_to_int = str_to_int
        self.int_to_str = {idx: token for token, idx in str_to_int.items()}

    def encode(self, text):
        """Convert ``text`` to a list of token ids.

        Tokens missing from the vocabulary are mapped to the id of
        ``unknown_token``.

        Raises:
            RuntimeError: if ``train`` has not been called.
        """
        self._require_trained()
        # Looked up once instead of once per token.
        unknown_id = self.str_to_int.get(self.unknown_token)
        return [self.str_to_int.get(token, unknown_id)
                for token in self.apply_regex_splitting(text)]

    def decode(self, token_ids):
        """Convert token ids back to a string.

        Tokens are joined with single spaces, then the whitespace in front of
        ``, . ? ! " ( ) '`` is removed. Other separators produced by the
        splitter (``:``, ``;``, ``_``, ``--``) keep their surrounding spaces.

        Raises:
            RuntimeError: if ``train`` has not been called.
            KeyError: if an id is not part of the vocabulary.
        """
        self._require_trained()
        text = ' '.join(self.int_to_str[token_id] for token_id in token_ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', text)

In [47]:
tokenizer = Tokenizer()
tokenizer.train(raw_text)

In [48]:
tokenizer.int_to_str[1130]

'<|unk|>'

In [49]:
text = """"It's the last he painted, you know,"
            Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)

for p1, p2 in zip(tokenizer.apply_regex_splitting(text), ids):
    print(p1, '--', p2)

" -- 1
It -- 56
' -- 2
s -- 850
the -- 988
last -- 602
he -- 533
painted -- 746
, -- 5
you -- 1126
know -- 596
, -- 5
" -- 1
Mrs -- 67
. -- 7
Gisburn -- 38
said -- 851
with -- 1108
pardonable -- 754
pride -- 793
. -- 7


In [50]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [53]:
text2 = 'Hello, world! <|endoftext|> some other text taken from internet.'
ids = tokenizer.encode(text2)
for p1, p2 in zip(tokenizer.apply_regex_splitting(text2), ids):
    print(p1, '--', p2)

Hello -- 1130
, -- 5
world -- 1130
! -- 0
<|endoftext|> -- 1131
some -- 910
other -- 735
text -- 1130
taken -- 973
from -- 477
internet -- 1130
. -- 7


In [54]:
tokenizer.decode(ids)

'<|unk|>, <|unk|>! <|endoftext|> some other <|unk|> taken from <|unk|>.'

### 4. Subword Based Tokenizer (BPE) & Creating Token IDs

In [56]:
# We'll use the pretrained GPT-2 BPE tokenizer from OpenAI (via `tiktoken`).
# A from-scratch implementation example is in the `02_tokenizer.ipynb` file.

import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

In [57]:
# NOTE: the two adjacent string literals below are concatenated with no
# separator, so the resulting text contains "terracesof" (no space).
text = (
    'Hello, do you like tea? <|endoftext|> In the sunlit terraces'
    'of someunknownPlace.'
)
# `allowed_special` is passed so that the <|endoftext|> marker in the text is accepted.
integers =  tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In contrast to the word-based tokenizer, the algorithm underlying BPE breaks down words that aren't in its predefined vocabulary into smaller subword units or even individual characters. This enables it to handle out-of-vocabulary words.

In [59]:
tokenizer.encode('someunknownPlace')

[11246, 34680, 27271]

In [58]:
tokenizer.decode(integers)

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

### 5. Dataset & DataLoader

In [62]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [67]:
class DatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer``. Every window of
    ``max_length`` token ids, taken every ``stride`` tokens, becomes one input
    sample; its target is the same window shifted one token to the right.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window still fits.
        starts = range(0, len(ids) - max_length, stride)

        self.inputs = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.outputs = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ``index``-th (input, target) pair of tensors."""
        return self.inputs[index], self.outputs[index]

def create_dataloader(txt, batch_size=4, max_length=256,
                      stride=128, shuffle=True,
                      drop_last=True, num_workers=0):
    """Build a ``DataLoader`` over sliding windows of ``txt``.

    The text is tokenized with tiktoken's ``'gpt2'`` encoding and wrapped in a
    ``DatasetV1`` using ``max_length`` and ``stride``; the remaining arguments
    are passed straight through to ``DataLoader``.

    Returns:
        DataLoader yielding ``(inputs, targets)`` batches.
    """
    gpt2_encoding = tiktoken.get_encoding('gpt2')
    windows = DatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [72]:
# Small windows (4 tokens, stride 1) without shuffling, so batches come out in text order.
dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
# NOTE(review): `ingputs` looks like a misspelling of `inputs`; the name is kept as-is here.
ingputs, targets = next(data_iter)
print(ingputs)
print('\n', targets)

tensor([[   40,   367,  2885,  1464],
        [  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]])

 tensor([[  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257],
        [10899,  2138,   257,  7026]])


In [81]:
# Fetch the next batch from the iterator and show it.
inputs, targets = next(data_iter)
# Print the batch that was just unpacked (previously printed the misspelled,
# stale `ingputs` variable instead of `inputs`).
print(inputs)
print('\n', targets)

tensor([[10899,  2138,   257,  7026],
        [ 2138,   257,  7026, 15632],
        [  257,  7026, 15632,   438],
        [ 7026, 15632,   438,  2016],
        [15632,   438,  2016,   257],
        [  438,  2016,   257,   922],
        [ 2016,   257,   922,  5891],
        [  257,   922,  5891,  1576]])

 tensor([[5891, 1576,  438,  568],
        [1576,  438,  568,  340],
        [ 438,  568,  340,  373],
        [ 568,  340,  373,  645],
        [ 340,  373,  645, 1049],
        [ 373,  645, 1049, 5975],
        [ 645, 1049, 5975,  284],
        [1049, 5975,  284,  502]])


### 6. Token Embeddings

#### 6.1 Small Demo

In [87]:
vocab_size = 5
output_dim = 3

token_embedding_layer = nn.Embedding(vocab_size, output_dim)
token_embedding_layer.weight

Parameter containing:
tensor([[-0.2433, -1.2907, -0.3056],
        [-1.5494,  0.6584,  0.8837],
        [-0.7560, -0.4269, -0.7097],
        [-1.0531, -1.5958,  0.6364],
        [ 1.2865,  1.4511,  1.3312]], requires_grad=True)

In [86]:
ids = torch.tensor([1, 2, 4])

In [88]:
token_embedding = token_embedding_layer(ids)
token_embedding

tensor([[-1.5494,  0.6584,  0.8837],
        [-0.7560, -0.4269, -0.7097],
        [ 1.2865,  1.4511,  1.3312]], grad_fn=<EmbeddingBackward0>)

#### 6.2 Token Embeddings with Original Dataset

In [89]:
vocab_size = tokenizer.n_vocab
output_dim = 256

token_embedding_layer = nn.Embedding(vocab_size, output_dim)

In [93]:
dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print('Token IDs: \n', inputs)
print('\nInputs Shape: \n', inputs.shape)

Token IDs: 
 tensor([[   40,   367,  2885,  1464],
        [  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]])

Inputs Shape: 
 torch.Size([8, 4])


In [100]:
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

### 7. Positional Embeddings

In [95]:
context_length = 4 # max_length
output_dim = 256 # must be same as given in the token embedding layer

pos_embedding_layer = nn.Embedding(context_length, output_dim)

In [97]:
posible_positions = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(posible_positions)

posible_positions.shape, pos_embeddings.shape

(torch.Size([4]), torch.Size([4, 256]))

### 8. Input Embeddings

Input Embeddings = Token Embeddings + Positional Embeddings

In [102]:
# pos_embeddings has no batch dimension; `+` broadcasts it across the leading
# (batch) dimension of token_embeddings, so the result keeps token_embeddings' shape.
inputs_embeddings = token_embeddings + pos_embeddings

# Show the three shapes side by side to confirm the broadcast.
token_embeddings.shape, pos_embeddings.shape, inputs_embeddings.shape

(torch.Size([8, 4, 256]), torch.Size([4, 256]), torch.Size([8, 4, 256]))