In [8]:
# Step 1: Read the dataset

In [20]:
# Load the whole text file into memory as one string.
with open("data-sets/the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

# Report the size of the text and show its first 99 characters.
print("Total number of character: ", len(raw_text))
print(raw_text[0:99])

Total number of character:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [22]:
# Example: use the re library to split text


In [24]:
import re

# Splitting on a *captured* whitespace group keeps every whitespace
# character in the result as its own list item.
text = "hello, this is a simple text :) . "
result_of_simple_tokenazation = re.compile(r'(\s)').split(text)
print(result_of_simple_tokenazation)

['hello,', ' ', 'this', ' ', 'is', ' ', 'a', ' ', 'simple', ' ', 'text', ' ', ':)', ' ', '.', ' ', '']


In [30]:
# Tip for tokenization: whether to keep or remove whitespace

"""
REMOVING WHITESPACES OR NOT

When developing a simple tokenizer, whether we should encode whitespaces as separate characters or just remove them depends on our application and its requirements. Removing whitespaces reduces the memory and computing requirements.
However, keeping whitespaces can be useful if we train models that are sensitive to the exact structure of the text (for example, Python code, which is sensitive to indentation and spacing). Here, we remove whitespaces for simplicity and brevity of the tokenized outputs. Later, we will switch to a tokenization scheme that includes whitespaces.

"""


'\nREMOVING WHITESPACES OR NOT\n\nWhen developing a simple tokenizer, whether we should encode whitespaces as separate characters or just remove them depends on our application and its requirements. Removing whitespaces reduces the memory and computing requirements.\nHowever, keeping whitespaces can be useful if we train models that are sensitive to the exact structure of the text (for example, Python code, which is sensitive to indentation and spacing). Here, we remove whitespaces for simplicity and brevity of the tokenized outputs. Later, we will switch to a tokenization scheme that includes whitespaces.\n\n'

In [32]:
# Step 2: Create tokens

In [36]:
# Split on punctuation, double dashes and whitespace (delimiters are kept
# because the pattern is a capturing group), then drop the pieces that are
# empty or whitespace-only.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[0:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [42]:
print(len(preprocessed)) # total number of tokens (duplicates included)

4690


In [50]:
# Step 3: Create token IDs (convert each token into a token ID)
# Each unique token is mapped to a unique integer called its token ID


In [56]:
# Deduplicate the tokens, then order them so ids are assigned deterministically.
all_words = list(set(preprocessed))
all_words.sort()
vocabulary_size = len(all_words)
print(vocabulary_size)

1130


In [62]:
# Map every unique token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [66]:
# Preview the first 51 (token, token id) pairs of the vocabulary.
for i, item in enumerate(vocab.items()):
    print(item)
    if i == 50:  # zero-based index: stop right after the 51st entry
        break
        

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [68]:
# Step 4: create simple Tokenizer
class SimpleTokenizerV1:
    """Minimal word/punctuation tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace; empty and
    whitespace-only pieces are dropped and every remaining token is looked
    up in the vocabulary.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse mapping.

        Args:
            vocab: Mapping of token string -> integer token id.
        """
        self.str_to_int = vocab  # token -> token id, needed by encode()
        # Reverse mapping (token id -> token), needed by decode().
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            List of integer token ids, one per token.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        # Split into individual tokens; the capturing group keeps delimiters.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Strip each piece and drop the ones that are empty or whitespace-only.
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The tokens joined by single spaces, with the whitespace in front
            of punctuation removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        # Map token ids back to tokens and join them with spaces.
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the whitespace before punctuation. ':' and ';' are included
        # because encode() emits them as separate tokens as well.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [70]:
# Build the V1 tokenizer from the vocabulary and encode a short passage.
# V1 looks every token up directly, so a token missing from `vocab` would
# raise KeyError here.
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [72]:
# Turn the token ids back into text. Only the whitespace *before*
# punctuation is removed, so the result is not identical to the input.
tokenizer.decode(ids)



'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [74]:
# Add two special context tokens to the vocabulary: an end-of-text token and
# an unknown token, so that a word that is not in the dataset does not
# cause an error.

all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]

vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)


1132

In [76]:
class SimpleTokenizerV2:
    """Vocabulary-based tokenizer that maps unknown tokens to "<|unk|>"."""

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict(
            (token_id, token) for token, token_id in vocab.items()
        )

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Tokens that are not in the vocabulary are replaced by "<|unk|>"
        before the lookup.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty or whitespace-only piece: not a token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces."""
        tokens = [self.int_to_str[token_id] for token_id in ids]
        spaced = " ".join(tokens)
        # Drop the whitespace that precedes punctuation marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', spaced)

In [78]:
# Switch to the V2 tokenizer, which handles the unknown and end-of-text tokens.
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Combine the two sample sentences with the end-of-text token between them.
text = " <|endoftext|> ".join([text1, text2])

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [80]:
# Encode the combined text; tokens absent from the vocabulary are mapped to
# the id of "<|unk|>" by the V2 tokenizer.
tokenizer.encode(text)


[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [82]:
# Round trip: encode, then decode. Tokens that were encoded as unknown come
# back as "<|unk|>" instead of the original word.
tokenizer.decode(tokenizer.encode(text))


'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'