## Tokenization
#### How text tokenization can be done in a basic way.
#### I've used Andrej Karpathy's video (https://www.youtube.com/watch?v=zduSFxRajkE&t=2576s&ab_channel=AndrejKarpathy) to do this exercise.

In [551]:
class MyTokenizer():
    """Minimal byte-level BPE (byte-pair encoding) tokenizer.

    Token ids 0..255 are the raw byte values.  ``train`` learns additional
    tokens (256 and up) by repeatedly merging the most frequent adjacent
    pair of tokens.

    Attributes set by ``__init__`` / ``train``:
        merges: dict mapping ``(left_id, right_id)`` -> merged token id,
            in the order the merges were learned.
        tokens: dict mapping token id -> the ``bytes`` it stands for.
    """

    def __init__(self):
        # Start from the plain byte vocabulary so encode/decode are usable
        # (as a byte-level pass-through) even before train() is called.
        self.merges = {}
        self.tokens = {token: bytes([token]) for token in range(256)}

    def stats(self, ids):
        """Count adjacent token pairs in *ids*.

        Returns:
            ``(counts, maxpair)`` where ``counts`` maps each adjacent pair to
            its number of occurrences and ``maxpair`` is the most frequent
            pair (the first one encountered wins ties).  ``maxpair`` is
            ``None`` when *ids* has fewer than two elements.
        """
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1

        # Pick the winner once, after counting, instead of on every step.
        maxpair = max(counts, key=counts.get) if counts else None
        return counts, maxpair

    def merge(self, ids, maxpair, newtok):
        """Return a copy of *ids* with every occurrence of *maxpair* replaced.

        The scan is left to right and non-overlapping: after a match both
        elements of the pair are consumed, so ``[a, a, a]`` merged on
        ``(a, a)`` becomes ``[newtok, a]``.
        """
        newids = []
        last = len(ids) - 1
        pos = 0
        while pos <= last:
            is_match = (
                pos < last
                and ids[pos] == maxpair[0]
                and ids[pos + 1] == maxpair[1]
            )
            if is_match:
                newids.append(newtok)
                pos += 2  # skip both halves of the merged pair
            else:
                newids.append(ids[pos])
                pos += 1
        return newids

    def train(self, text, vocabsize, verbose=False):
        """Learn up to ``vocabsize - 256`` merges from *text*.

        Args:
            text: training string; it is encoded to UTF-8 bytes first.
            vocabsize: desired final vocabulary size, must be >= 256.
            verbose: when true, print one line per learned merge.

        Returns:
            The dict of learned merges (also stored on ``self.merges``).
            Fewer merges than requested are learned if the text collapses
            to a single token before the budget is used up.
        """
        assert vocabsize>=256
        nummerges = vocabsize - 256
        # Token ids start out as the UTF-8 byte values of the text.
        ids = list(text.encode("utf-8"))
        savedmerges = {}
        vocab = {token: bytes([token]) for token in range(256)}

        for i in range(nummerges):
            counts, maxpair = self.stats(ids)
            if maxpair is None:
                # Fewer than two tokens left: nothing more can be merged.
                break

            newtok = 256 + i  # new ids are handed out sequentially
            ids = self.merge(ids, maxpair, newtok)

            savedmerges[maxpair] = newtok
            vocab[newtok] = vocab[maxpair[0]] + vocab[maxpair[1]]

            if verbose:
                print(
                    f"merge {i + 1}/{nummerges}: {maxpair} -> {newtok} "
                    f"({vocab[newtok]}) had {counts[maxpair]} occurrences"
                )

        self.merges = savedmerges
        self.tokens = vocab
        return savedmerges

    def encode(self, text):
        """Encode *text* (any string, not only the training text) to token ids."""
        ids = list(text.encode("utf-8"))
        while len(ids) >= 2:
            textpairs, _ = self.stats(ids)
            # Apply merges in the order they were learned: later merges may
            # be built from the results of earlier ones, so the pair with
            # the smallest merged id has to be handled first.
            pair = min(textpairs, key=lambda p: self.merges.get(p, float("inf")))

            if pair not in self.merges:
                # No remaining pair is mergeable.
                break

            ids = self.merge(ids, pair, self.merges[pair])
        return ids

    def decode(self, ids):
        """Decode token ids back to a string.

        Invalid UTF-8 sequences are replaced with U+FFFD rather than raising.
        """
        raw = b"".join(self.tokens[i] for i in ids)
        return raw.decode("utf-8", errors="replace")
        
        
        
        

In [552]:
# Build a tokenizer and learn 275 - 256 = 19 merges from the notebook's `text`.
mytok = MyTokenizer()
merges = mytok.train(text, vocabsize=275, verbose=False)

In [554]:
# Encode the same text with the learned merges.
b = mytok.encode(text)

