In [1]:
# Tokenizing text

import os  # Imports Python's built-in os module.
import urllib.request # Provides functions for making HTTP requests and handling URLs

#Checks if a file named "Story.txt" exists in a "Resources" directory
if not os.path.exists("Resources/Story.txt"):
    #Defines the URL where the story text file can be downloaded from. Points to a raw text file hosted on GitHub.
    URL = ("https://raw.githubusercontent.com/majidarasteh/Large-Language-Model_LLM/refs/heads/main/Resources/Story.txt")

    # Sets the local file path where the downloaded file will be saved
    file_path = "Story.txt"

    # In this case, it will save to "Story.txt" in the current directory
    urllib.request.urlretrieve(URL, file_path)

# Reads the entire file content as a single string
with open("Story.txt", "r", encoding="utf-8") as f:
    story_text = f.read()

In [2]:
print(story_text)

One dollar and eighty-seven cents. That was all. And sixty cents of it was in pennies. Pennies saved one and two at a time by bulldozing the grocer and the vegetable man and the butcher until one's cheeks burned with the silent imputation of parsimony that such close dealing implied. Three times Della counted it. One dollar and eighty-seven cents. And the next day would be Christmas.

There was clearly nothing left to do but flop down on the shabby little couch and howl. So Della did it. Which instigates the moral reflection that life is made up of sobs, sniffles, and smiles, with sniffles predominating.

While the mistress of the home is gradually subsiding from the first stage to the second, take a look at the home. A furnished flat at $8 per week. It did not exactly beggar description, but it certainly had that word on the look-out for the mendicancy squad.

In the vestibule below was a letter-box into which no letter would go, and an electric button from which no mortal finger coul

In [3]:
# Lenght text
len(story_text)

11247

In [4]:
"""
  Regular expressions are like super-powered search patterns for text.
  Like a Google Search...
  But instead of searching for exact words, you search for patterns like:
    Find me all words that start with 'c' and end with 't'"
    "Find all 10-digit phone numbers"
    Find dates in MM/DD/YYYY format"
    Find all email addresses.
"""

import re # Regulars expression

text = "Hello World, A text just for testing! -!-@-@$$"
result1 = re.split(r'(\s)',text)
result2 = re.split(r'([,.]|\s)',text)
result3 = re.split(r'([,.:;?_!"()\']|--|\s)',text)
print(result1)
print(result2)
print(result3)

['Hello', ' ', 'World,', ' ', 'A', ' ', 'text', ' ', 'just', ' ', 'for', ' ', 'testing!', ' ', '-!-@-@$$']
['Hello', ' ', 'World', ',', '', ' ', 'A', ' ', 'text', ' ', 'just', ' ', 'for', ' ', 'testing!', ' ', '-!-@-@$$']
['Hello', ' ', 'World', ',', '', ' ', 'A', ' ', 'text', ' ', 'just', ' ', 'for', ' ', 'testing', '!', '', ' ', '-', '!', '-@-@$$']


In [5]:
# Number of tokens
tokens = re.split(r'([,.:;?_!"()\']|--|\s)',story_text)
print(len(tokens))

4987


In [6]:
# Tokens
print(tokens[:100])

['One', ' ', 'dollar', ' ', 'and', ' ', 'eighty-seven', ' ', 'cents', '.', '', ' ', 'That', ' ', 'was', ' ', 'all', '.', '', ' ', 'And', ' ', 'sixty', ' ', 'cents', ' ', 'of', ' ', 'it', ' ', 'was', ' ', 'in', ' ', 'pennies', '.', '', ' ', 'Pennies', ' ', 'saved', ' ', 'one', ' ', 'and', ' ', 'two', ' ', 'at', ' ', 'a', ' ', 'time', ' ', 'by', ' ', 'bulldozing', ' ', 'the', ' ', 'grocer', ' ', 'and', ' ', 'the', ' ', 'vegetable', ' ', 'man', ' ', 'and', ' ', 'the', ' ', 'butcher', ' ', 'until', ' ', 'one', "'", 's', ' ', 'cheeks', ' ', 'burned', ' ', 'with', ' ', 'the', ' ', 'silent', ' ', 'imputation', ' ', 'of', ' ', 'parsimony', ' ', 'that', ' ']


In [7]:
# Unique tokens
all_words = sorted(set(tokens))
all_words_size = len(all_words)
print(all_words_size)

826


In [8]:
print(all_words[:100])

['', '\n', ' ', '!', '"', '$1', '$20', '$30', '$8', "'", ',', '--', '.', '7', '78', '87', ':', ';', '?', 'A', 'All', 'Also', 'And', 'As', 'At', 'Babe', 'Bat', 'Be', 'Beautiful', 'Being', 'Broadway', 'But', 'Christmas', 'Combs', 'Coney', 'Cut', 'D', 'Day', 'Dell', 'Della', 'Dillingham', 'Don', 'Down', 'Eight', 'Eve', 'Everywhere', 'Expenses', 'For', 'Forget', 'Give', 'God', 'Goods', 'Grand', 'Had', 'Hair', 'He', 'Her', 'His', 'I', 'If', 'In', 'Instead', 'Island', 'Isn', 'It', 'James', 'Jim', 'Kinds', 'King', 'Madame', 'Majesty', 'Many', 'Maybe', 'Merry', 'Mme', 'Mr', 'Mrs', 'My', 'Now', 'Of', 'Oh', 'On', 'Once', 'One', 'Only', 'Out', 'Pennies', 'Perhaps', 'Please', 'Poor', 'Queen', 'Quietness', 'Rapidly', 'Say', 'Shall', 'She', 'Sheba', 'So', 'Sofronie', 'Solomon']


In [9]:
# Mapping each word to an integer.
vocabulary = {token:integer for integer, token in enumerate(all_words)}
print(len(vocabulary))

826


In [10]:
print(vocabulary)

{'': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '$1': 5, '$20': 6, '$30': 7, '$8': 8, "'": 9, ',': 10, '--': 11, '.': 12, '7': 13, '78': 14, '87': 15, ':': 16, ';': 17, '?': 18, 'A': 19, 'All': 20, 'Also': 21, 'And': 22, 'As': 23, 'At': 24, 'Babe': 25, 'Bat': 26, 'Be': 27, 'Beautiful': 28, 'Being': 29, 'Broadway': 30, 'But': 31, 'Christmas': 32, 'Combs': 33, 'Coney': 34, 'Cut': 35, 'D': 36, 'Day': 37, 'Dell': 38, 'Della': 39, 'Dillingham': 40, 'Don': 41, 'Down': 42, 'Eight': 43, 'Eve': 44, 'Everywhere': 45, 'Expenses': 46, 'For': 47, 'Forget': 48, 'Give': 49, 'God': 50, 'Goods': 51, 'Grand': 52, 'Had': 53, 'Hair': 54, 'He': 55, 'Her': 56, 'His': 57, 'I': 58, 'If': 59, 'In': 60, 'Instead': 61, 'Island': 62, 'Isn': 63, 'It': 64, 'James': 65, 'Jim': 66, 'Kinds': 67, 'King': 68, 'Madame': 69, 'Majesty': 70, 'Many': 71, 'Maybe': 72, 'Merry': 73, 'Mme': 74, 'Mr': 75, 'Mrs': 76, 'My': 77, 'Now': 78, 'Of': 79, 'Oh': 80, 'On': 81, 'Once': 82, 'One': 83, 'Only': 84, 'Out': 85, 'Pennies': 86, 'Perhaps':

In [11]:
# Create a class to encoding (tokenization) and decoding a text.

class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab            #1
        self.int_to_str = {i:s for s,i in vocab.items()}        #2

    def encode(self, text):         #3
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        tokens = [
            item.strip() for item in tokens if item.strip() # This removes leading/trailing whitespace from each token
        ]
        ids = [self.str_to_int[s] for s in tokens]
        return ids

    def decode(self, ids):         #4
        text = ' '.join([self.int_to_str[i] for i in ids]) 

        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)    #5
        return text

In [12]:
# Make an object from the class SimpleTokenizerV1
tokenizer = SimpleTokenizerV1(vocabulary)

In [13]:
# Coding a sample text with our tokenizer object.
new_text = """ it is a new word!"""
ids = tokenizer.encode(new_text)
print(ids)

[428, 427, 125, 520, 812, 3]


In [14]:
# Decoding the tokenized text.
tokenizer.decode(ids)

'it is a new word!'

In [15]:
# Handling unknown words like: &*^&*
# We sill see an error for unknown wrods such as &*^&*!
new_text = """ it is a new word! &*^&* """
ids = tokenizer.encode(new_text)
print(ids)

KeyError: '&*^&*'

In [16]:
# Handling "unknown words" and "end of text" using '<|unk|>' and '<|endoftext|>'
# Extending the vocabulary by adding '<|unk|>' and '<|endoftext|>'

print(all_words_size)
tokens = sorted(list(all_words))
tokens.extend(['<|endoftext|>', '<|unk|>'])
vocabulary = {token:integer for integer,token in enumerate(tokens)}

826


In [17]:
print(len(vocabulary))

828


In [18]:
# Print the extended vocabulary
for i, item in enumerate(list(vocabulary.items())[-5:]):
    print(item)

('yet', 823)
('you', 824)
('your', 825)
('<|endoftext|>', 826)
('<|unk|>', 827)


In [19]:
# Create a class to encoding (tokenization) and decoding a text by handlig "unkown words" and "end of text"

class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):         #3
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        tokens = [
            item.strip() for item in tokens if item.strip() # This removes leading/trailing whitespace from each token
        ]
        tokens = [item if item in self.str_to_int
            else "<|unk|>" for item in tokens]
        ids = [self.str_to_int[s] for s in tokens]
        return ids

    def decode(self, ids):         #4
        text = ' '.join([self.int_to_str[i] for i in ids]) 

        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)    #5
        return text

In [20]:
new_text = """ it is a new word! &*^&* """
tokenizer = SimpleTokenizerV2(vocabulary)
print(tokenizer.encode(text))

[827, 827, 10, 19, 827, 436, 334, 827, 3, 827, 3, 827]


In [21]:
print(tokenizer.decode(tokenizer.encode(new_text)))

it is a new word! <|unk|>
