## loading files

In [63]:
import re
# Read the whole text file into memory as one string; the context manager
# closes the file handle as soon as the read finishes.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw = f.read()

# Sanity check: report the character count and preview the first 150 characters.
print("total number of character : ", len(raw))
print(raw[:150])

total number of character :  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of


## converting into tokens

In [38]:
def get_token(text):
    """
    Split a string into a flat list of word and punctuation tokens.

    The text is cut at every whitespace character, at each of the
    characters , . ? _ ! " ( ) ' and at the two-character sequence "--".
    Because the pattern is wrapped in a capturing group, the separators
    themselves are returned as tokens; pieces that are empty or consist
    only of whitespace are dropped.

    Args:
        text: the string to tokenize.

    Returns:
        list of non-blank string tokens in their original order.
    """
    pieces = re.split(r'([,.?_!"()\']|--|\s)', text)

    tokens = []
    for piece in pieces:
        # strip() yields '' (falsy) for empty and whitespace-only pieces,
        # so those never reach the result.
        if piece.strip():
            tokens.append(piece)
    return tokens

# Quick check on a short sentence: punctuation and "--" should come out as
# separate tokens and no whitespace tokens should remain.
print(get_token("Hello, world. Is this-- a test?"))

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [39]:
# Tokenize the full raw text loaded earlier.
pre = get_token(raw)
# Total number of tokens (whitespace-only pieces are already filtered out by get_token).
print(len(pre))
# Preview the first 20 tokens.
print(pre[:20])

4649
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was']


## converting tokens into token IDs

In [57]:
def token_to_ids(pre_text):
    """
    Build a vocabulary mapping each unique token to an integer id.

    Duplicates are removed and the remaining tokens are sorted, so ids run
    from 0 to len(vocabulary) - 1 in sorted order and the same set of tokens
    always produces the same mapping.

    Args:
        pre_text: iterable of hashable, mutually comparable tokens
            (e.g. the list of strings returned by get_token).

    Returns:
        dict mapping token -> integer id.
    """
    # sorted() accepts any iterable, so the set needs no intermediate list.
    unique_tokens = sorted(set(pre_text))
    return {token: idx for idx, token in enumerate(unique_tokens)}

In [93]:
# Build the token -> id vocabulary from the tokenized corpus and report
# how many distinct tokens it contains.
vocabs = token_to_ids(pre)
vocab_size = len(vocabs)
print(vocab_size)

1159


0

In [52]:
# Show the first seven (token, id) pairs of the vocabulary
# (fewer if the vocabulary is smaller than that).
preview = list(vocabs.items())[:7]
for entry in preview:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)


## full tokenizer class

In [107]:
class Tokenizer_v1:
    """
    Simple word-level tokenizer backed by a vocabulary built from a text.

    Call make_vocab(text) first to build the vocabulary; encode() then turns
    text into a list of integer ids and decode() turns ids back into text.
    """

    def __init__(self):
        # Text the current vocabulary was built from (set by make_vocab).
        self.raw = ''
        # token -> id mapping, empty until make_vocab is called.
        self.str_to_int = dict()
        # id -> token mapping, the inverse of str_to_int.
        self.int_to_str = dict()

    def get_token(self, text):
        """
        Split a string into a flat list of word and punctuation tokens.

        The text is cut at whitespace, at each of , . ? _ ! " ( ) ' and at
        "--"; the separators are kept as tokens, while empty and
        whitespace-only pieces are dropped.
        """
        pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
        # strip() is '' (falsy) for empty / whitespace-only pieces.
        return [item for item in pieces if item.strip()]

    def token_to_ids(self, pre_text):
        """
        Map each unique token in pre_text to an integer id.

        Ids are assigned in sorted order of the unique tokens, starting at 0.
        """
        all_words = sorted(set(pre_text))
        return {token: integer for integer, token in enumerate(all_words)}

    def make_vocab(self, text):
        """
        Build the vocabulary from text and store both lookup directions.

        Replaces any previously built vocabulary and prints how many distinct
        tokens were found.
        """
        self.raw = text

        # Use this class's own methods so the tokenizer does not depend on
        # same-named module-level functions having been defined beforehand.
        tokens = self.get_token(text)
        vocabs = self.token_to_ids(tokens)

        self.str_to_int = vocabs
        self.int_to_str = {i: s for s, i in vocabs.items()}
        print("vocab successfully created with ",len(self.str_to_int)," words")

    def encode(self, text):
        """
        Convert text into a list of token ids.

        Raises:
            KeyError: if text contains a token that is not in the vocabulary.
        """
        tokens = self.get_token(text)
        return [self.str_to_int[s] for s in tokens]

    def decode(self, ids):
        """
        Convert a list of token ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        , . ? ! " ( ) ' is removed. Opening quotes and parentheses are treated
        the same way, so the result is not guaranteed to match the original
        spacing of the encoded text.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text



In [108]:
# Create a tokenizer and build its vocabulary from the raw text loaded earlier.
tokenizer = Tokenizer_v1()
tokenizer.make_vocab(raw)

vocab successfully created with  1159  words


In [109]:
# Round-trip a sample sentence through the tokenizer: text -> ids -> text.
text = """"It's the last he painted, you know," Mrs. Gisburn said with"""
ids = tokenizer.encode(text)
print(ids)

# NOTE: the round trip is not lossless. decode() joins tokens with spaces and
# only removes the space *before* punctuation, so the opening quote and the
# apostrophe come back as `" It' s ...` rather than `"It's ...`.
print(tokenizer.decode(ids))

[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136]
" It' s the last he painted, you know," Mrs. Gisburn said with
