Tokenizing Text

In [100]:
import os
import urllib.request


In [101]:
with open("the_verdict_by_edith_wharton.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [8]:
raw_text

'I had always thought Jack Gisburn rather a cheap genius--though a\n\ngood fellow enough--so it was no great surprise to me to hear\n\nthat, in the height of his glory, he had dropped his painting,\n\nmarried a rich widow, and established himself in a villa on the\n\nRiviera. (Though I rather thought it would have been Rome or\n\nFlorence.)\n\n"The height of his glory"--that was what the women called it. I\n\ncan hear Mrs. Gideon Thwing--his last Chicago sitter--deploring\n\nhis unaccountable abdication. "Of course it\'s going to send the\n\nvalue of my picture \'way up; but I don\'t think of that, Mr.\n\nRickham--the loss to Arrt is all I think of." The word, on Mrs.\n\nThwing\'s lips, multiplied its RS as though they were reflected in\n\nan endless vista of mirrors. And it was not only the Mrs. Thwings\n\nwho mourned. Had not the exquisite Hermia Croft, at the last\n\nGrafton Gallery show, stopped me before Gisburn\'s "Moon-dancers"\n\nto say, with tears in her eyes: "We shall not lo

In [9]:
len(raw_text)

20758

In [10]:
import re

In [102]:
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [103]:
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [18]:
result = [item for item in result if item.strip()]

print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [104]:
text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
result = [item for item in result if item.strip()]

preprocessed = result
print(preprocessed)

['I', 'had', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

In [55]:
len(preprocessed)

4663

In [56]:
all_words = sorted(set(preprocessed))

len(all_words)

1145

In [57]:
all_words[:20]

['!',
 '"',
 "'",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 ';',
 '?',
 'A',
 'AM',
 'Ah',
 'Among',
 'And',
 'Are',
 'Arrt',
 'As',
 'At']

In [31]:
en = enumerate(all_words)
en

<enumerate at 0x10bb04090>

In [105]:
vocab = {token:integer for integer,token in enumerate(all_words)}

# vocab

## Simple Tokenizer

In [106]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        # split into words, punctuations etc.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # strip out whitespaces
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]

        # use vocabulary to convert words, punctiuations etc. to ids
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # replace spaces before punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [107]:
tokenizer = SimpleTokenizerV1(vocab)

In [108]:
text = "It was square and brown and leathery: no \"effects\"; no bric-a-brac, none of the air of posing for reproduction in a picture weekly--above all, no least sign"

In [109]:
ids = tokenizer.encode(text)

print(ids)

[61, 1092, 934, 170, 248, 170, 627, 8, 724, 1, 393, 1, 9, 724, 243, 5, 725, 737, 1003, 157, 737, 798, 470, 852, 583, 129, 784, 1099, 6, 133, 159, 5, 724, 626, 903]


In [45]:
tokenizer.decode(ids)

'It was square and brown and leathery : no" effects" ; no bric-a-brac, none of the air of posing for reproduction in a picture weekly -- above all, no least sign'

## Adding special tokens

In [110]:
# using a word that is not in the vocabulary "Hello"
text = "Hello, do you like tea. is this-- a test?"

In [111]:
tokenizer.encode(text)

KeyError: 'Hello'

In [112]:
# extend the tokenizer
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [113]:
len(vocab.items())

1147

In [114]:
# last 5 items of vicab dictionary
list(vocab.items())[-5:]

[('younger', 1142),
 ('your', 1143),
 ('yourself', 1144),
 ('<|endoftext|>', 1145),
 ('<|unk|>', 1146)]

## Improved Tokenizer with special tokens

In [120]:
import unicodedata

class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        # normalize the text if it has special utf chars
        text = unicodedata.normalize("NFKC", text)

        # split into words, punctuations etc.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # strip out whitespaces
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        preprocessed = [
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]
        # use vocabulary to convert words, punctiuations etc. to ids
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # replace spaces before punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [121]:
tokenizer = SimpleTokenizerV2(vocab)

In [122]:
tokenizer.encode(text)

[1146, 5, 369, 1141, 642, 990, 7, 599, 1014, 6, 129, 1146, 10]

In [123]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea. is this -- a <|unk|>?'

In [73]:
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'AM': 12,
 'Ah': 13,
 'Among': 14,
 'And': 15,
 'Are': 16,
 'Arrt': 17,
 'As': 18,
 'At': 19,
 'Be': 20,
 'Begin': 21,
 'Burlington': 22,
 'But': 23,
 'By': 24,
 'Carlo': 25,
 'Chicago': 26,
 'Claude': 27,
 'Come': 28,
 'Croft': 29,
 'Destroyed': 30,
 'Devonshire': 31,
 'Don': 32,
 'Dubarry': 33,
 'Emperors': 34,
 'End': 35,
 'FELT': 36,
 'Florence': 37,
 'For': 38,
 'Gallery': 39,
 'Gideon': 40,
 'Gisburn': 41,
 'Gisburns': 42,
 'Grafton': 43,
 'Greek': 44,
 'Grindle': 45,
 'Grindles': 46,
 'HAD': 47,
 'HAS': 48,
 'HAVE': 49,
 'Had': 50,
 'Hang': 51,
 'Has': 52,
 'He': 53,
 'Her': 54,
 'Hermia': 55,
 'His': 56,
 'How': 57,
 'I': 58,
 'If': 59,
 'In': 60,
 'It': 61,
 'Jack': 62,
 'Jove': 63,
 'Just': 64,
 'KNOWN': 65,
 'Lord': 66,
 'MINE': 67,
 'Made': 68,
 'Miss': 69,
 'Money': 70,
 'Monte': 71,
 'Moon-dancers': 72,
 'Mr': 73,
 'Mrs': 74,
 'My': 75,
 'NEVER': 76,
 'NOT': 77,

## Byte pair encoding  

In [137]:
import tiktoken

# tiktoken.encoding_for_model("gpt-4o")

tokenizer = tiktoken.get_encoding("gpt2")

tokenizer.n_vocab

tokens = tokenizer.encode("Hello, world!")

print(tokens)

tokenizer.decode(tokens)



[15496, 11, 995, 0]


'Hello, world!'

In [134]:
tiktoken.__version__

'0.12.0'

In [140]:
text = (
  "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
  "of someunknownPlace."
)

tokens = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(tokens)

tokenizer.decode(tokens)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

## Data sampling with a sliding window

In [141]:
with open("the_verdict_by_edith_wharton.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

len(raw_text)

20758

In [144]:
enc_text = tokenizer.encode(raw_text)

print(len(enc_text))

print(enc_text[:20])



5765
[40, 550, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 198, 198, 11274, 5891, 1576]


In [151]:
enc_sample = enc_text[50:]

print(enc_sample)

tokenizer.decode(enc_sample)




[30526, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 198, 198, 49, 452, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 198, 198, 7414, 382, 1198, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 198, 198, 5171, 3285, 9074, 13, 46606, 536, 5469, 438, 14363, 938, 4842, 1650, 353, 438, 2934, 489, 3255, 198, 198, 14363, 48422, 540, 450, 67, 3299, 13, 366, 5189, 1781, 340, 338, 1016, 284, 3758, 262, 198, 198, 8367, 286, 616, 4286, 705, 1014, 510, 26, 475, 314, 836, 470, 892, 286, 326, 11, 1770, 13, 198, 198, 33048, 2763, 438, 1169, 2994, 284, 943, 17034, 318, 477, 314, 892, 286, 526, 383, 1573, 11, 319, 9074, 13, 198, 198, 817, 5469, 338, 11914, 11, 33096, 663, 19340, 355, 996, 484, 547, 12548, 287, 198, 198, 272, 13079, 410, 12523, 286, 22353, 13, 843, 340, 373, 407, 691, 262, 9074, 13, 536, 48819, 198, 198, 8727, 25722, 276, 13, 11161, 407, 262, 40123, 18113, 544, 9325, 701, 11, 379, 262,

'married a rich widow, and established himself in a villa on the\n\nRiviera. (Though I rather thought it would have been Rome or\n\nFlorence.)\n\n"The height of his glory"--that was what the women called it. I\n\ncan hear Mrs. Gideon Thwing--his last Chicago sitter--deploring\n\nhis unaccountable abdication. "Of course it\'s going to send the\n\nvalue of my picture \'way up; but I don\'t think of that, Mr.\n\nRickham--the loss to Arrt is all I think of." The word, on Mrs.\n\nThwing\'s lips, multiplied its RS as though they were reflected in\n\nan endless vista of mirrors. And it was not only the Mrs. Thwings\n\nwho mourned. Had not the exquisite Hermia Croft, at the last\n\nGrafton Gallery show, stopped me before Gisburn\'s "Moon-dancers"\n\nto say, with tears in her eyes: "We shall not look upon\n\nits like again"?\n\nWell!--even through the prism of Hermia\'s tears I felt able to\n\nface the fact with equanimity. Poor Jack Gisburn! The women had\n\nmade him--it was fitting that they 

In [155]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:        {y}")



x: [30526, 257, 5527, 27075]
y:        [257, 5527, 27075, 11]


In [159]:
for i in range(1, context_size+1):
  context = enc_sample[:i]
  desired = enc_sample[i]

  # print(f"context: {context}")
  #print(f"target:  {desired}")

  print(tokenizer.decode(context), "--->", tokenizer.decode([desired]))


married --->  a
married a --->  rich
married a rich --->  widow
married a rich widow ---> ,


In [161]:
import torch

In [162]:
torch.__version__


'2.9.1'

In [163]:
from torch.utils.data import DataLoader, Dataset

In [None]:
class GPTDatasetV1(Dataset):
  def __init__(self, txt, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    # tokenize the entire text
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # use a sliding window to chunk the book into overlapping sequences of max_length
    for i in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i + 1:i + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))
      
  def __len__(self):
    return len(self.input_ids)
  
  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]




In [165]:
def create_dataloader(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):

  # Initialize the tokenizer
  tokenizer = tiktoken.get_encoding("gpt2")

  # Create the dataset
  dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

  # Create the dataloader
  dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers
  )

  return dataloader

In [171]:
with open("the_verdict_by_edith_wharton.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False, drop_last=True, num_workers=0)

data_iter = iter(dataloader)

input_ids, target_ids = next(data_iter)
first_batch = next(data_iter)
print(first_batch)


[tensor([[  198,  5562,    11,   287],
        [  262,  6001,   286,   465],
        [13476,    11,   339,   550],
        [ 5710,   465, 12036,    11],
        [  198,   198, 30526,   257],
        [ 5527, 27075,    11,   290],
        [ 4920,  2241,   287,   257],
        [ 4489,    64,   319,   262]]), tensor([[ 5562,    11,   287,   262],
        [ 6001,   286,   465, 13476],
        [   11,   339,   550,  5710],
        [  465, 12036,    11,   198],
        [  198, 30526,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262,   198]])]


In [172]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[  198,   198,    49,   452],
        [41976,    13,   357, 10915],
        [  314,  2138,  1807,   340],
        [  561,   423,   587, 10598],
        [  393,   198,   198,  7414],
        [  382,  1198,  2014,   198],
        [  198,     1,   464,  6001],
        [  286,   465, 13476,     1]]), tensor([[  198,    49,   452, 41976],
        [   13,   357, 10915,   314],
        [ 2138,  1807,   340,   561],
        [  423,   587, 10598,   393],
        [  198,   198,  7414,   382],
        [ 1198,  2014,   198,   198],
        [    1,   464,  6001,   286],
        [  465, 13476,     1,   438]])]


## Create Token Embeddings

In [181]:
# some sample inputs
input_ids = torch.tensor([  4, 1, 2, 5])


In [176]:
vocab_size = 6
output_dim = 3


embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


embedding_layer.weight



Parameter containing:
tensor([[-1.4834, -0.7829, -1.0509],
        [-0.7511,  2.6537, -0.5902],
        [-1.4377,  0.6608,  0.7411],
        [ 0.1504,  0.5441, -0.5396],
        [ 0.2280,  0.0703,  1.6889],
        [ 1.2675, -0.6175, -0.7259]], requires_grad=True)

In [178]:
embedding_layer(torch.tensor([3]))

tensor([[ 0.1504,  0.5441, -0.5396]], grad_fn=<EmbeddingBackward0>)

In [179]:
input_ids

tensor([4, 1, 2, 8])

In [182]:
embedding_layer(input_ids)

tensor([[ 0.2280,  0.0703,  1.6889],
        [-0.7511,  2.6537, -0.5902],
        [-1.4377,  0.6608,  0.7411],
        [ 1.2675, -0.6175, -0.7259]], grad_fn=<EmbeddingBackward0>)

## Encoding Word Postions

In [183]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [184]:
max_length = 4
dataloader = create_dataloader(
  raw_text, batch_size=8, max_length=max_length, 
  stride=max_length, shuffle=False, drop_last=True, num_workers=0
)

data_iter = iter(dataloader)

inputs, targets = next(data_iter)

print(inputs)
print(targets)






tensor([[   40,   550,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   198],
        [  198, 11274,  5891,  1576],
        [  438,   568,   340,   373],
        [  645,  1049,  5975,   284],
        [  502,   284,  3285,   198]])
tensor([[  550,  1464,  1807,  3619],
        [  402,   271, 10899,  2138],
        [  257,  7026, 15632,   438],
        [ 2016,   257,   198,   198],
        [11274,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   198,   198]])


In [187]:
token_embeddings = token_embedding_layer(inputs)

print(token_embeddings.shape)
# print(token_embeddings)

torch.Size([8, 4, 256])


In [188]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

pos_embedding_layer.weight

Parameter containing:
tensor([[ 0.8857, -0.5817,  1.1501,  ..., -0.4804,  0.2321,  1.1119],
        [ 0.3211,  0.1874,  0.4939,  ...,  0.7454, -0.0894,  1.5383],
        [-0.3486, -0.7884, -1.9957,  ...,  0.7999, -0.0103,  0.6461],
        [-2.5018, -0.3786, -0.6431,  ..., -0.0654, -0.9490,  0.4128]],
       requires_grad=True)

In [189]:
torch.arange(context_length)

pos_embedding = pos_embedding_layer(torch.arange(context_length))

pos_embedding.shape










torch.Size([4, 256])

In [190]:
input_embeddings = token_embeddings + pos_embedding
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [191]:
input_embeddings

tensor([[[ 1.2122e+00, -8.3866e-02,  6.3529e-01,  ..., -1.9621e+00,
          -1.4483e+00,  1.1541e+00],
         [-4.8449e-01, -3.6350e-01,  2.8062e+00,  ...,  5.3224e-01,
           6.5470e-01,  1.9782e+00],
         [-1.5299e+00,  7.1715e-01, -2.9867e+00,  ...,  1.8494e+00,
          -7.6761e-01, -5.5942e-01],
         [-2.3130e+00, -7.5287e-01,  2.7377e-01,  ..., -4.1636e-01,
           1.7123e+00,  3.3432e-01]],

        [[ 1.8728e-01, -3.5839e-01,  1.5275e+00,  ..., -2.3140e-01,
           8.9205e-01,  2.4623e+00],
         [ 1.4688e+00, -1.4767e+00,  6.8701e-01,  ..., -1.6858e+00,
          -5.0899e-01,  2.0349e+00],
         [-9.9928e-01,  6.3689e-02, -3.4178e+00,  ...,  1.8795e+00,
           5.3224e-01,  9.1744e-01],
         [-3.3621e+00, -2.4295e+00,  6.4705e-01,  ..., -1.0744e+00,
          -1.0677e+00,  5.6172e-01]],

        [[-5.8574e-01, -1.8432e+00,  1.5181e+00,  ..., -1.4711e+00,
           6.8737e-01, -7.5139e-01],
         [ 1.0637e+00, -1.9614e+00,  2.4300e+00,  .