In [1]:
import re
import pandas
import numpy

import urllib.request

In [2]:
# Remote location of the text file used throughout this notebook
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt"
)
# Local file name the download is saved under
file_path = "the-verdict.txt"

# urlretrieve returns a (local_path, headers) tuple; it is left as the last
# expression so the notebook displays it.
urllib.request.urlretrieve(url, filename=file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x28d798dd890>)

In [3]:
# Read the whole downloaded file into memory as one string
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

# Sanity check: overall length and the first 99 characters
print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [43]:
class SimpleTokenizerV1:
    """
    Minimal regex-based tokenizer backed by a fixed vocabulary.

    Text is cut into word and punctuation tokens, every token is looked up in
    the vocabulary to obtain its integer ID (encode), and sequences of IDs can
    be turned back into a text string (decode).

    Attributes
    ----------
    str_to_int : dict
        Vocabulary mapping token strings to integer token IDs.
    int_to_str : dict
        Inverse of ``str_to_int``: integer token IDs to token strings.
    """
    def __init__(self, vocab):
        """
        Store the vocabulary and build its inverse.

        Parameters
        ----------
        vocab : dict
            A mapping {token_string: integer_id}.
        """
        self.str_to_int = vocab
        # Inverted copy of the vocabulary, used by decode()
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """
        Turn a string into a list of token IDs.

        The text is split on punctuation characters, the double dash ``--``
        and whitespace; the delimiters themselves are kept as tokens.
        Whitespace-only and empty pieces are dropped, and each remaining
        token is looked up in the vocabulary.

        Parameters
        ----------
        text : str
            Input text to tokenize.

        Returns
        -------
        list of int
            Token IDs in the order the tokens occur in ``text``.

        Raises
        ------
        KeyError
            If a token is not present in the vocabulary.
        """
        # The capturing group makes re.split return the delimiters as well
        pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
        stripped = (piece.strip() for piece in pieces)
        # Empty strings (from whitespace or adjacent delimiters) are skipped
        return [self.str_to_int[token] for token in stripped if token]

    def decode(self, ids):
        """
        Turn a list of token IDs back into a text string.

        Tokens are joined with single spaces, then any whitespace directly
        in front of one of the characters ``, . ? ! " ( ) '`` is removed.

        Parameters
        ----------
        ids : list of int
            List of token IDs.

        Returns
        -------
        str
            Decoded text string.

        Raises
        ------
        KeyError
            If an ID is not present in the vocabulary.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Pull the listed punctuation characters back onto the preceding token
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
    


In [9]:
# Tokenize the whole story with the same pattern SimpleTokenizerV1.encode uses:
# split on punctuation, "--" and whitespace (keeping the delimiters), strip
# each piece and drop the empty ones.
# NOTE(review): SimpleTokenizerV2.encode additionally splits on ":" and ";",
# which this pattern does not - confirm that the difference is intended.
preprocessed = [
    token
    for token in map(str.strip, re.split(r'([,.?_!"()\']|--|\s)', raw_text))
    if token
]
print(len(preprocessed))

# The vocabulary entries are the distinct tokens in sorted order
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

4649
1159


In [10]:
# Give every distinct token its position in the sorted list as its integer ID
vocab = {word: index for index, word in enumerate(all_words)}

In [44]:
# Build a V1 tokenizer on top of whatever `vocab` is currently bound to
tokenizer = SimpleTokenizerV1(vocab)

In [45]:
# Sample sentence for the tokenizer. The literal is triple-quoted, so the
# fourth quote character is part of the text itself.
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""

# Encode the sample into vocabulary IDs and show them
ids = tokenizer.encode(text)
print(ids)

[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]


In [46]:
# Encode a single word; the resulting list is displayed as the cell output
tokenizer.encode('Ah')

[12]

In [27]:
# Sorted distinct tokens followed by the two special tokens; appending them
# last gives them the two highest IDs.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
# Rebinds `vocab` to the extended vocabulary
vocab = {token: integer for integer, token in enumerate(all_tokens)}
print(len(vocab))

1161


In [28]:
# Show the last five (token, id) pairs of the vocabulary. The index that
# enumerate() produced was never used, so the slice is iterated directly.
for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1156)
('your', 1157)
('yourself', 1158)
('<|endoftext|>', 1159)
('<|unk|>', 1160)


In [47]:
class SimpleTokenizerV2:
    """
    Regex-based tokenizer with a fallback for out-of-vocabulary tokens.

    Text is cut into word and punctuation tokens; tokens missing from the
    vocabulary are replaced by the ``"<|unk|>"`` token before being mapped to
    integer IDs (encode). Sequences of IDs can be turned back into a text
    string (decode).

    Attributes
    ----------
    str_to_int : dict
        Vocabulary mapping token strings to integer token IDs.
    int_to_str : dict
        Inverse of ``str_to_int``: integer token IDs to token strings.
    """
    def __init__(self, vocab):
        """
        Store the vocabulary and build its inverse.

        Parameters
        ----------
        vocab : dict
            A mapping {token_string: integer_id}. It needs a ``"<|unk|>"``
            entry for unknown tokens to be encodable.
        """
        self.str_to_int = vocab
        # Inverted copy of the vocabulary, used by decode()
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """
        Turn a string into a list of token IDs.

        The text is split on punctuation characters (including ``:`` and
        ``;``), the double dash ``--`` and whitespace; the delimiters
        themselves are kept as tokens. Whitespace-only and empty pieces are
        dropped. Tokens that are not in the vocabulary are replaced by
        ``"<|unk|>"`` before the lookup.

        Parameters
        ----------
        text : str
            Input text to tokenize.

        Returns
        -------
        list of int
            Token IDs in the order the tokens occur in ``text``.

        Raises
        ------
        KeyError
            If an unknown token is met and the vocabulary has no
            ``"<|unk|>"`` entry.
        """
        known = self.str_to_int
        ids = []
        # The capturing group makes re.split return the delimiters as well
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Whitespace or an empty piece between adjacent delimiters
                continue
            if token not in known:
                token = "<|unk|>"
            ids.append(known[token])
        return ids

    def decode(self, ids):
        """
        Turn a list of token IDs back into a text string.

        Tokens are joined with single spaces, then any whitespace directly
        in front of one of the characters ``, . : ; ? ! " ( ) '`` is removed.
        IDs of special tokens are rendered as their literal token strings.

        Parameters
        ----------
        ids : list of int
            List of token IDs.

        Returns
        -------
        str
            Decoded text string.

        Raises
        ------
        KeyError
            If an ID is not present in the vocabulary.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Pull the listed punctuation characters back onto the preceding token
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)
    


In [48]:
# Two independent sample sentences ...
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# ... glued together with the end-of-text marker between them
text = " <|endoftext|> ".join([text1, text2])
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [49]:
# Bare expression: the notebook displays the string's repr (quoted form)
text

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.'

In [50]:
# Rebind `tokenizer` to a V2 instance; words missing from `vocab` are encoded
# as the "<|unk|>" ID instead of raising.
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[1160, 5, 362, 1155, 642, 1000, 10, 1159, 57, 1013, 981, 1009, 738, 1013, 1160, 7]


In [1]:
# Report which tiktoken release is installed, so the recorded token IDs
# below can be tied to a specific version.
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.12.0


In [2]:
# Rebind `tokenizer` to tiktoken's 'gpt2' encoding
tokenizer = tiktoken.get_encoding('gpt2')

In [3]:
# NOTE(review): the two adjacent literals are concatenated with no separator,
# so the resulting text contains "terracesof" (no space) - confirm whether
# that is intended before relying on the recorded token IDs.
text = (
    "Hello, do you think tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)

# allowed_special names the special-token strings permitted to appear in the
# input text; here only "<|endoftext|>".
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

In [4]:
# Show the token IDs produced for the sample text
print(integers)

[15496, 11, 466, 345, 892, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [5]:
# Encode a made-up word; `integers` is rebound to the new list of IDs
integers = tokenizer.encode("Akwirw ier")
print(integers)

[33901, 86, 343, 86, 220, 959]


In [10]:
# Decode the IDs one at a time to see which piece of text each one stands for
for token_id in integers:
    print(tokenizer.decode([token_id]))

Ak
w
ir
w
 
ier


In [6]:
# Decode the full ID sequence in one call; the result is displayed as the cell output
tokenizer.decode(integers)

'Akwirw ier'