# Setup

In [43]:
import re
import urllib.request
from pathlib import Path
import tiktoken

from tokenizer import REGEX_PATTERN, SimpleTokenizerV1, SimpleTokenizerV2

# Word tokenizers (preprocessing)

In [44]:
# Download the sample short story once and keep a local copy next to the
# notebook so later runs do not need network access.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt"
)
file_path = "the-verdict.txt"

if not Path(file_path).exists():
    urllib.request.urlretrieve(url, file_path)

# Decode explicitly as UTF-8 so the text (and its character count) does not
# depend on the platform's default locale encoding.
raw_text = Path(file_path).read_text(encoding="utf-8")

print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20480
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## Splitting text into tokens

In [45]:
# Split two sample sentences on the shared pattern, then discard the pieces
# that are empty or whitespace-only so that only real tokens are printed.
for some_text in (
    "Hello, world. This, is a test.",
    "Hello, world. Is this-- a test?",
):
    pieces = re.split(REGEX_PATTERN, some_text, flags=re.IGNORECASE)
    result = [piece for piece in pieces if piece and piece.strip()]
    print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']
['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


### Applying regex to short story

In [46]:
# Tokenize the whole story: split on the pattern first, then keep only the
# pieces that still contain something after stripping whitespace.
raw_pieces = re.split(REGEX_PATTERN, raw_text, flags=re.IGNORECASE)
pre_prcessed = [piece for piece in raw_pieces if piece and piece.strip()]
print(len(pre_prcessed))
print(pre_prcessed[:30])

4649
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## Assigning ids to tokens

In [47]:
# Unique tokens in sorted order; their count is the vocabulary size.
all_words = list(set(pre_prcessed))
all_words.sort()
vocab_size = len(all_words)
vocab_size

1159

In [48]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

max_tokens = 50
# Preview the first entries. The slice keeps indices 0 through max_tokens
# inclusive, i.e. max_tokens + 1 (token, id) pairs are printed.
for entry in list(vocab.items())[: max_tokens + 1]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindle:', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)


## Using simple tokenizer

### Encoding

In [49]:
# Sample passage (quotes, an apostrophe and a line break) to run through
# the tokenizer.
some_text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""

# Build the tokenizer from the vocabulary and convert the passage to ids.
tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode(some_text)
ids

[1,
 58,
 2,
 872,
 1013,
 615,
 541,
 763,
 5,
 1155,
 608,
 5,
 1,
 69,
 7,
 39,
 873,
 1136,
 773,
 812,
 7]

### Decoding

In [50]:
# Turn the ids from the encoding step back into text. The round trip is not
# exact: the output below shows extra spaces around the quote and apostrophe.
result = tokenizer.decode(ids)
result

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

### Applying it to never-before-seen text

In [51]:
some_text = "Hello, do you like tea?"
# Left commented out on purpose: the call raises because "Hello" is not part
# of the vocabulary.
# print(tokenizer.encode(some_text))

### Adding special tokens `<|unk|>` and `<|endoftext|>`

In [52]:
# Extend the sorted story vocabulary with the two special tokens, then
# number every token by its position in the resulting list.
all_tokens = [*sorted(set(pre_prcessed)), "<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)

1161

In [53]:
# Two unrelated snippets glued together with the end-of-text marker, the way
# independent documents are concatenated into one stream.
separator = " <|endoftext|> "
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = separator.join([text1, text2])
text

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

In [54]:
tokenizer = SimpleTokenizerV2(vocab)

# Encode once and reuse the ids for both the printout and the round trip
# instead of tokenizing the same text twice.
encoded_ids = tokenizer.encode(text)
print(encoded_ids)
print(tokenizer.decode(encoded_ids))

[1160, 5, 362, 1155, 642, 1000, 10, 1159, 57, 1013, 981, 1009, 738, 1013, 1160, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


# BPE Tokenizer


In [57]:
# Switch to the GPT-2 byte-pair encoding provided by tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."

# "<|endoftext|>" is explicitly allowed so that it is encoded as a special
# token (see the single id 50256 in the output below).
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print("Tokens and their text:")
for tid in token_ids:
    piece = tokenizer.decode([tid])
    print(f"Token ID: {tid}, Text: {piece}")

Tokens and their text:
Token ID: 15496, Text: Hello
Token ID: 11, Text: ,
Token ID: 466, Text:  do
Token ID: 345, Text:  you
Token ID: 588, Text:  like
Token ID: 8887, Text:  tea
Token ID: 30, Text: ?
Token ID: 220, Text:  
Token ID: 50256, Text: <|endoftext|>
Token ID: 554, Text:  In
Token ID: 262, Text:  the
Token ID: 4252, Text:  sun
Token ID: 18250, Text: lit
Token ID: 8812, Text:  terr
Token ID: 2114, Text: aces
Token ID: 286, Text:  of
Token ID: 617, Text:  some
Token ID: 34680, Text: unknown
Token ID: 27271, Text: Place
Token ID: 13, Text: .


In [None]:
# Round-trip the BPE ids produced by the encoding cell back into text.
# `token_ids` is the name that cell assigns; no `integers` variable exists in
# this notebook, so decoding it would fail on a fresh kernel.
strings = tokenizer.decode(token_ids)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


## BPE encoding for unknown words exercise

In [59]:
# Encode made-up words with the GPT-2 BPE tokenizer to see how they are
# broken into known sub-word pieces.
tokenizer = tiktoken.get_encoding("gpt2")
text = "Rouchi malaga proutokoul !! Whesh manakol!"
token_ids = tokenizer.encode(text)
print(token_ids)

print("Tokens and their text:")
for tid in token_ids:
    piece = tokenizer.decode([tid])
    print(f"Token ID: {tid}, Text: {piece}")

[49, 7673, 72, 6428, 8126, 778, 448, 482, 2852, 37867, 370, 956, 71, 582, 461, 349, 0]
Tokens and their text:
Token ID: 49, Text: R
Token ID: 7673, Text: ouch
Token ID: 72, Text: i
Token ID: 6428, Text:  mal
Token ID: 8126, Text: aga
Token ID: 778, Text:  pr
Token ID: 448, Text: out
Token ID: 482, Text: ok
Token ID: 2852, Text: oul
Token ID: 37867, Text:  !!
Token ID: 370, Text:  W
Token ID: 956, Text: hes
Token ID: 71, Text: h
Token ID: 582, Text:  man
Token ID: 461, Text: ak
Token ID: 349, Text: ol
Token ID: 0, Text: !
