In [90]:
import re
from typing import List

In [3]:
with open('verdict.txt',"r",encoding="utf-8") as f:
    txt=f.read()

In [4]:
#lets tokenize 
tokenized=re.split(r'([,.?_!"()\']|--|\s)',txt)

# print(len(tokenized))
tokenized=[item for item in tokenized if item.split()]
print(len(tokenized))


4649


# lets make vocublary

In [5]:
clean_token=sorted(list(set(tokenized)))

In [6]:
print(len(clean_token))

1159


In [7]:
#lets make the vocab
text_index={text:index for index,text in enumerate(clean_token)}
index_text={index:text for index,text in enumerate(clean_token)}

In [8]:
print(text_index['that'])

1012


In [9]:
#this is the encoder
def encoder(text):
    #lets tokenize it
    tokenizer=re.split(r'([,.?_!"()\']|--|\s)',text)  
    tokenizer=[item for item in tokenizer if item.strip()]
    
    #lets make token id
    token_id=[]
    for i in tokenizer:
        token_id.append(text_index[i])
    return token_id

#this is the decoder
def decoder(encoded):
    decoded_id=[]
    for i in encoded:
        decoded_id.append(index_text[i])
    return decoded_id

In [10]:
print(encoder('this is not good'))
print(decoder([1024, 595, 727, 508]))

[1024, 595, 727, 508]
['this', 'is', 'not', 'good']


# lets make ita a  class

In [11]:
class TokenizerVersion1():
    def __init__(self,vocab) -> None:
        self.vocab=vocab
    
    def encoder(self,text):
        #lets tokenize it
        tokenizer=re.split(r'([,.?_!"()\']|--|\s)',text)  
        tokenizer=[item for item in tokenizer if item.strip()]
        
        #lets make token id
        token_id=[]
        for i in tokenizer:
            token_id.append(self.vocab[i])
        return token_id
    
    def decoder(self,encoded):
         self.decoded_id=[]
         for i in encoded:
            self.decoded_id.append(index_text[i])
        
         text=" ".join(i for i in self.decoded_id)
         text= re.sub(r'\s+([,.?_!"()\'"])', r'\1', text)
         return text

In [12]:
#lets check out the class
text="""
    this is the whole word of that women said "her man?"
"""
toknizer=TokenizerVersion1(vocab=text_index)

encode=toknizer.encoder(text=text)
print(encode)

decoder=toknizer.decoder(encoded=encode)
print(decoder)

[1024, 595, 1013, 1125, 1145, 738, 1012, 1140, 873, 1, 547, 671, 10, 1]
this is the whole word of that women said" her man?"


# handle unknown tokens

In [21]:
#add new token into old
all_tokens=clean_token
#appending
all_tokens.extend(['<|endoftext|>',"<|unk|>"])
#make the vocab 
vocab={token:index for index,token in enumerate(all_tokens)}

In [67]:
if 'the' in vocab:
    print("present")
else:
    print('not present')

present


In [68]:
vocab['the']

1013

In [22]:
len(all_tokens),len(vocab)

(1163, 1161)

# include the 2 new tokens into the tokenizer

In [122]:
class TokenizerVersion2():
    def __init__(self,vocab) -> None:
        self.vocab=vocab  #text to num
        self.index_text={index:text for text,index in self.vocab.items()}  #num to text
    
    def encoder(self,text)->List:
        #lets tokenize it
        tokenized=re.split(r'([,.?_!"()\']|--|\s)',text)  
        tokenized=[item for item in tokenized if item.strip()]
        
        #lets make token id
        token_id=[item if item in self.vocab else "<|UNK|>" for item in tokenized]
        ids=[self.vocab[item] for item in token_id]
       
        return ids
    
    def decoder(self,encoded):
         print(encode)
         text = " ".join([self.index_text[i] for i in encoded])
         
         text= re.sub(r'\s+([,.?_!"()\'"])', r'\1', text)
         return text

In [123]:
#lets check out the class
text="""
    the is the whole word of that women said "her man?"
"""
toknizer=TokenizerVersion2(vocab)

encode=toknizer.encoder(text=text)
print("this is encoder")
print(encode)

decoder=toknizer.decoder(encode)
print("this is the decoder")
print(decoder)

this is encoder
[1013, 595, 1013, 1125, 1145, 738, 1012, 1140, 873, 1, 547, 671, 10, 1]
[1013, 595, 1013, 1125, 1145, 738, 1012, 1140, 873, 1, 547, 671, 10, 1]
this is the decoder
the is the whole word of that women said" her man?"


In [124]:
for i,(k,v) in enumerate(vocab.items()):
    print(k,v)
    if(i>10):
        break

! 0
" 1
' 2
( 3
) 4
, 5
-- 6
. 7
: 8
; 9
? 10
A 11
