In [3]:
#for special tokens
class Handle_special():
    """Registers special tokens and substitutes them into byte-id sequences.

    Each string in ``allowed_special`` is UTF-8 encoded and mapped to one new
    id, numbered consecutively from ``vocab_length`` in list order.

    Parameters
    ----------
    allowed_special : list of str or None
        Special token strings. ``None`` is treated as "no special tokens".
    ids : list of int
        Byte ids of the text to process. ``add_tokens`` works on a copy, so
        this list is never modified.
    vocab_length : int
        First id handed out to a special token.
    """
    def __init__(self, allowed_special, ids, vocab_length):
        # None means "no special tokens"; iterating None would raise TypeError.
        self.allowed_special = [] if allowed_special is None else allowed_special
        self.ids = ids
        self.vocab_length = vocab_length
        self.special_dict = {}  # tuple of UTF-8 byte values -> special token id

    #replace sequence
    def replace_sequence(self, id_lst, target, replacement):
        """Replace every non-overlapping occurrence of ``target`` in ``id_lst``.

        The scan runs left to right; text inserted as a replacement is never
        re-examined for further matches.

        Parameters
        ----------
        id_lst : list
            Sequence to edit. It is modified in place when a match is found.
        target : list
            Sub-sequence to search for. An empty target matches nothing.
        replacement : list or single item
            What to insert instead; a non-list value is treated as one item.

        Returns
        -------
        list
            The same ``id_lst`` object, after replacement.
        """
        n = len(target)
        if n == 0:
            # An empty target would match at every position and grow the list
            # without bound, so treat it as "nothing to replace".
            return id_lst
        if not isinstance(replacement, list):
            replacement = [replacement]

        rebuilt = []
        changed = False
        i = 0
        last_start = len(id_lst) - n  # last index where a full match can begin
        while i <= last_start:
            if id_lst[i:i+n] == target:
                rebuilt.extend(replacement)
                i += n
                changed = True
            else:
                rebuilt.append(id_lst[i])
                i += 1

        if changed:
            rebuilt.extend(id_lst[i:])  # tail too short to contain a match
            id_lst[:] = rebuilt         # keep the in-place contract for callers
        return id_lst

    #small vocab for special tokens
    def special_tokens(self):
        """Populate ``special_dict`` with one id per entry of ``allowed_special``.

        Keys are tuples of the token's UTF-8 byte values; the entry at list
        position ``j`` receives the id ``vocab_length + j``.
        """
        for offset, token in enumerate(self.allowed_special):
            byte_key = tuple(token.encode('utf-8'))
            self.special_dict[byte_key] = self.vocab_length + offset

    #small
    def add_tokens(self):
        """Return a copy of ``ids`` with each special token collapsed to its id.

        Also (re)populates ``special_dict`` as a side effect.
        """
        newer = list(self.ids)
        self.special_tokens() #populate special_dict
        for byte_key, special_id in self.special_dict.items():
            newer = self.replace_sequence(newer, list(byte_key), special_id)
        return newer
        
        

In [4]:
# Usage:
#   encoder = MyTokenizer(corpus, vocab_size)
#   encoder.train()
# After training, use encoder.encode(text) and encoder.decode(ids).

class MyTokenizer():
    """Byte-level byte-pair-encoding (BPE) tokenizer with optional special tokens.

    Ids 0-255 are the raw byte values. ``train`` learns up to
    ``vocab_size - 256`` merges, numbered from 256 upwards, and gives each
    special token one id starting at ``max(vocab_size, 256)``.

    Parameters
    ----------
    corpus : str
        Text used to learn the merges.
    vocab_size : int, default 1000
        Target number of byte and merge ids.
    allowed_special : list of str or None, default None
        Special token strings that must map to a single id each. ``None``
        means no special tokens.
    """
    def __init__(self, corpus, vocab_size=1000, allowed_special=None):
        self.corpus = corpus
        self.vocab_size = vocab_size
        self.merges = {} # (int, int) -> int
        self._vocab = {idx: bytes([idx]) for idx in range(256)}
        # Store an empty list for None so train()/encode() can iterate safely.
        self.allowed_special = [] if allowed_special is None else allowed_special
        self.h = None  # Handle_special instance, created by train()

    def _get_stats(self, ids):
        """Count how often each adjacent pair of ids occurs in ``ids``."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge(self, ids, pair, idx):
        """Return a new list with every occurrence of ``pair`` replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            #if not at last position and pair matches, replace it
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def train(self):
        """Learn the merges from ``self.corpus`` and build the vocabulary.

        Prints progress every 100 merges and a summary with the compression
        ratio. Calling it again retrains from scratch.
        """
        tokens = list(self.corpus.encode("utf-8")) #raw bytes as ints

        # Start from a clean state so a second train() call cannot leave
        # stale merges or vocabulary entries behind.
        self.merges = {}
        self._vocab = {idx: bytes([idx]) for idx in range(256)}

        # Special ids must never collide with the 256 raw byte ids, even when
        # vocab_size is smaller than 256.
        special_base = max(self.vocab_size, 256)
        self.h = Handle_special(self.allowed_special, tokens, special_base)
        ids = self.h.add_tokens() # merge special tokens and return new tokens

        # add the special tokens to the vocab (key is a tuple of byte values)
        for byte_key, special_id in self.h.special_dict.items():
            self._vocab[special_id] = bytes(byte_key)

        num_merges = self.vocab_size - 256

        # NOTE(review): special-token ids stay in `ids` during training, so a
        # pair containing one can be chosen as a merge. Excluding them would
        # change the learned merges, so it is flagged here rather than changed.

        #for each run, get top pair and replace it
        for i in range(num_merges):
            stats = self._get_stats(ids)
            # Check if stats is empty
            if not stats:
                print(f"No more pairs to merge at iteration {i}.")
                print("You are overfitting, reduce the vocabulary size to an appropriate one")
                break
            pair = max(stats, key=stats.get)
            idx = 256 + i
            ids = self._merge(ids, pair, idx) #replace occurrences
            self.merges[pair] = idx
            # Both halves of the pair already exist in the vocabulary
            # (byte, special token or earlier merge).
            self._vocab[idx] = self._vocab[pair[0]] + self._vocab[pair[1]]
            if i % 100 == 0:
                print(f"Merging the {i}th pair")

        print("tokens length: ", len(tokens))
        print("ids length: ", len(ids))
        if ids:
            print(f"compression ratio: {len(tokens)/len(ids):.2f}X")
        else:
            # Empty corpus: the ratio is undefined, avoid dividing by zero.
            print("compression ratio: n/a (empty corpus)")

    def decode(self, ids): #enter list of tokens i.e [23,44,55]
        """Turn a list of token ids back into text.

        Byte sequences that are not valid UTF-8 decode to the replacement
        character. Raises ``KeyError`` for an id that is not in the vocabulary.
        """
        tokens = b"".join(self._vocab[idx] for idx in ids)
        text = tokens.decode("utf-8", errors="replace")
        return text

    def encode(self, text, verbose=True): #enter text
        """Encode ``text`` into a list of token ids.

        Special tokens are replaced first, then the learned merges are applied
        in the order they were learned.

        Parameters
        ----------
        text : str
            Text to encode.
        verbose : bool, default True
            Print the token counts after each stage.

        Raises
        ------
        RuntimeError
            If called before ``train()``.
        """
        if self.h is None:
            raise RuntimeError("MyTokenizer.encode() called before train(); call train() first")

        raw_tokens = list(text.encode('utf-8'))
        #first replace the special tokens if present
        tokens = list(raw_tokens)
        # Use the mapping built during training so the ids always match.
        for byte_key, special_id in self.h.special_dict.items():
            tokens = self.h.replace_sequence(tokens, list(byte_key), special_id)

        if verbose:
            print("Length of original tokens: ", len(raw_tokens))
            print("Length of tokens after handling special words: ", len(tokens))

        while len(tokens) >= 2:
            stats = self._get_stats(tokens)
            # Pick the pair that was learned earliest (lowest merge id);
            # pairs that were never learned rank as infinity.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break #nothing else to merge, break
            tokens = self._merge(tokens, pair, self.merges[pair])

        if verbose:
            print("Length of tokens after merging: ", len(tokens))
        return tokens

    def get_vocabulary(self):
        """Return the vocabulary as a dict of token id -> bytes, sorted by id."""
        return dict(sorted(self._vocab.items()))
    
    

    
        

In [5]:
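# A longer sample text used as the training corpus; the |<eos>| and |<endoftext>| markers in it are the special tokens.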
text = """Mikwano gyange bwoya gitera okumbuuza engeri amaaso gange agaali ag'empujjo gye 
gaateereeramu are ne bwe nakutuka ekiwalaata eky'embagirawo ekitaali mu lulyo ate 
n'ennyindo n'ensongola. Okubannyonnyola bino nga bwe byajja nali nteekwa okubawayiza 
ebyantuukako mu sematalo on owa jjo lya balamu bye nali nnyimye amazzi. Naye no bwe 
nabinyumizaako kagafumba ne nfabulago, bo kwe kuntayirira mbinyumizeeko n'abalala era 
baludde ddaaki ne bankuulamu omwasi.|<eos>| 
Be twasomanga nabo bammanyi nga Kakundugulu, erinnya lye bampaatiikako ne lisimba 
emmizi, ate bo bwe twazirwanako bammanyi nga Kasiribiti. Eby'e Kololo nga Sargent  ŋŋoŋo 
simusudde, Tororo, Giligiri, Mbagasi ne Gonda ebyo byo mbirekedde bannange abalala 
babirombojjeko naye ebyange nja kubitandikira mu nsi z’Abawalabu.|<eos>| 
Nga tumaze okukekeza ennyago mu nsi nnyingi, twatuuka mu nsi z’Abawarabu gye 
nnyingirira mu kitongole ekikessi. Bwe twamala okutendekebwa, okubangulwa era 
nokukenkusibwa mu katoola w’ebintu ne tubuuzibwa ebigezo ebyalumya buli omu 
ogwengulu. Bwe kityo nga abamu tumaze okubikuba oluku mu mutwe n'abalala nga 
bibatudde ku nfeete, Sargent Kwi Tamutaamu omuyugoyugo era eyali omugabe waffe Kopolo 
Monyo Odeku, Figo Mukombankuyege Kataayi, Palalemo Byomera, Ludyeku era nange 
kamwakoogera, twalondebwa mu gubinja gwa bantu nga kinaana tugende tukette mu bizinga 
bye Yugoyugo ebyetondese mu bwengula bweriyanja li Pasifika (Pacific Ocean) ebyali 
biwambiddwa aba Japan era kwe basinziiranga okutagenya amaato ga b'amawanga 
amagatte. Okuggyako nze, bannange bonna baali njasabiggu za basajja abatayisikamu maaso 
wadde okuwetwamu ennoga.|<eos>| 
Buli omu ku ffe yalina kyatwalirirwa okuketta. Nze nnabinikibwako kuketta gusaawe 
gw’ennyonyi ogwaliyo ngukube ebifaananyi era nekenneenye era nkube ezigubuukako 
ebifaananyi  ate era nekkaanye embeera n’enkyukakyuka z’obudde mu bitundu ebyo. 
Naweebwa emyezi ebiri gyokka okutuukiririzaamu bino byonna. Nali Oluyugoyugo ndukuba 
budinda era nga n’Olujapaani nkusuwazizaamu.|<eos>| 
Mu matulutulu g’olunaku olumu, awatali na kusooka kubbirwako, twawawamulwa mu tulo 
okunyanyagira kw’eŋŋombe era okugenda okwesimba mu luggya lw’ensiisira zaffe nga tulaba 
gamotoka galwanyi ge gakungubadde. Twayambazibwa kipaku mu byambalo by’ekiwarabu. 
Amakanzu gwatwambikiddwa, ebimyu bimyuumyuddwa, eminagiro gisuuliddwa, agalemba 
galinnye ku mitwe, balikkupu mususaane zinaanikiddwa, amasulubu n’amalevu 
amajingejinge gatujenjekeddwaako era n’amannya g’ekiwarabu gatupaatikiddwaako. Nze 
munnowo nga mpitibwa Abdulla Hussein. Mu ndabika yonna nga otulengedde wamma ggwe 
nga ffe Bawarabu ba nnakabala ggeregere awatali kuwunnaanya.
Okwesiiga gerenge kwatutwalira akaseera akatabandaaza na mbooge. Bwe kutyo nga 
kuwedde, twesolossa mu gamotoka agalwanyi. Olwali okuggweeramu tuti nga tweggyawo. 
Twagawuliranga nga gagenda gawenyuka, gagenda gawuluguma, gatokota, gaguluba, 
gayuuga era gebbinkiza, gasituka naye okukugamba nti twalengerako gyetuva oba gyetulaga, 
oba wadde okulaba akamunyeenya obumunyeenyi, olwo ekkubo eryange liba ku lusebenju 
nga lye bagamba ery’omulimba. Munda mwe twali nga mukutte be kwatekwate, mufunda, 
twezinze bugongolo era nga n’entuuyo zituttulukuka nga abatudde ku kabiga.
Mu ttuntu wakati twagguka ku kaalo keetikkiddwa enfuufu olwa kikunta eyali yekaaliisa era 
ke nateebereza okubeera ku nnyanja emmyuufu. Abantu abaakalimu baali ba munaganwa 
ngoyinza okubabalira ku ngalo. Saabetegera nakatono kubanga olwali okwesowola mu 
gumotoka nga bampalampya bukwakku mu kaato akaali ake kika ekyezi ky’emitego era ne 
bannange bwebatyo bwe baabakola. Ffenna okugenda okuggweerako nga nga kaatandise dda 
kusiikuula nkasi .|<eos>| 
Bwe twatuula mu nnyanja ye Buyindi (Indian Ocean) ne tumaamulwako ebyambalo 
by’eKiwarabu byonna era manalevu gaffe ge twali tutandise okunyumirwa ne galugenda. 
Twayambazibwa ebyambalo byekirunnyanja ne tuzzibwa ne mulyato eddala ery’ekika 
ekirawunyi. Mu kutyekula emisinde nga lino eryato lye lisinga amalala gonna agaali awo. 
Omuyaga oba engezi bwe byabanga tebiriboyaanya oba okulisunda oba okulizinyisa oluguje, 
nga tukkirizibwa okudaaladaalako waggulu ku lyo naye naye nga tetukkirizibwa kulasa 
mboozi wadde okuwaya n’omulunnyanja yenna. Kye siyinza kwerabira gwe muyaga oguli ku 
nnyanja eno anti buli kiseera nga gwesooza bwesooza.|<eos>| 
Nga tumaze ennaku nga tupekukira ku nnyanja eno, twayingirira eriyanja li kiri mulaala li 
Pasifika ate netuzzibwa ne mu gwato ogubuukirwako ennyonyi ogw’Abamerika. Olwo nno 
nga twenwanwagirizza ne mu byambalo by’Amerika. Ebyekijaasi bye bye baatunaanika bwe 
baatujjaako ebyekirunnyanja. Lumu ennyonyi za Japan nga ziwagirwa obwato bu lubbira 
zaalumba ne zitigomya ogwato ogwo okuva enkya okusuulira ddala enjuba, kyokka 
olwokutuntumula agazinga gaagwo obutasalako n’olwokuteregenya ennyo nga 
guwunjawunja ate nga nennyozi zaagwo engabo zigirumizza mannyo era nga zittunsa 
agakoomi gomukka okugusiikiriza, gwasobola okwerwanako okutuusa ekiyamba mumizi lwe 
kyagudduukirira ne guzeemululako ne gubula. Naye nga ddala kwali kuyita mu mannyo ga 
ntaggya. Ekiseera kyonna enkalu we zakalambirira nga ffe buli omu afuuyirira kanwe nti sso 
anaabuzzaako ddi eddiba ne twesoobolola ku lukokobe era ddala ku lukokere lwomulabe. 
Buli omu nno nga yezingiridde ekikoba ekiseeyeeyesa omuntu nga abadde agudde mu mazzi 
aleme okumira naye ate okusinga byonna aleme okusaanawo.|<eos>| 
Nga ennaku bbiri zeklungudde okuva ku olwo, enkoko yakwatwa mumwa ne tutwalibwa 
kinoomu mu maaso g’ofiisa eyali akkalidde ku mmeeza okwali entuutuli y’empapula. Nga 
akutunuulidde mu munye enkakaba, yakukemesanga olufubenje lwebibuuzo ebyajjanga 
bisindikagana era nga osuubirwa okubiddamu byonna nga bwe byakuyigirizibwa. 
Byatandikanga bwebiti: “Erinnyalyo ggwe ani? (Eryange nno eryekiyugoyugo lyali Pwi 
Zaalizaali) “Wazaalibwa wa era ddi? Kitaawo ne nyoko be bani?webalu oba baafa? “ Ku ebyo 
ngobuulizibwako kajojijoji webirala lutottoebikwata ku mpisa, obuwangwa, obulombolombo 
n’ennono z’ebika byabayugoyugo. Wano munnaffe Ludyeku akakusu we kaamulijjira enkalu 
ne kamutandaggira ennume yekigwo n’awambira era bwatyo teyeeyongera kulamaga naffe 
okuva wano.|<eos>| 
Enjuba nga egolooba, enviiri zaffe zaababirwa ne zisikibwa ziwanvuwe nga eza Bayugoyugo. 
Ekyo bwe kyaggwa ne twambala ebyambalo ebyekinnansi era ne balikkupu okwali amalobo 
n’amasanda bituziyize okuseerera. Bwe twamala okuwenjebwa balabe oba nga tetuliiko 
kayinza kutuloopa konna singa nga tugwa mu mikono gya ba kawenkene, twassibwa mu 
kaato akomuliro ne katandika okuwenyuka. Nga obudde bukunukkiriza akawozamasiga, 
akaato kaggyibwaamu omuliro ne katandika kuseeyeeyeza ku maanyi ga masannyalaze 
kasirise nnyo era kasoobo. Mu kiseera kyekimu twawulira okubwatuka kw'emizinga gyerimu 
ku maato gaffe nga gibunduggula amasasi ku mwalo ogwatuli ku ddyo mailo nga musanvu 
okuva we twaali, ate nga ku kkono waffe ennyonyi ziwandagaza ku lusiisira lwabajaasi 
b’omulabe.|<eos>| 
ESSUULA EYOKUBIRI: Nsimba Ekigere mu Yugoyugo
Nga tumulisibwa okumyansa kw’emizinga n’okutulika kwa bbomu, twagenda nga 
twewagaanya mu lukono lw’ennyanja olubugiddwa agasenge g’enjazi ennangaavu ezeesimbye 
obulanga. Bwe twatuuka e kkomekkome w’olugomo olwo, munnaffe Taamutaamu yatagalala 
ku kaato akatengeetera, era nga akozesa obukugu obusukkirivu yakasuka omuguwa okwali 
empuluttulizo n’aloba ensoomi yomutwe ogwali ku lukolekole lw’ejjinja. Kino lya tulaga nti 
munnaffe yali lugo olumanyi embuzi emponge mu bitundu bino.   Ku muguwa guno kwe 
twawalampira kinoomu nga tuyambibwa balikkupu zaffe okutuusa ffenna lwe twaggwaayo. 
Bwe twatuuka waggulu buli omu nanaanulamu balikkupu ze ne tuzisuula mu kaato wansi 
ebifuba nga kkumi na bitaano okuva we twaali. Twanaanika mu bigere engatto zi-nkya ne 
tutambula mu lwakasota nga tugenda twewagaanya mu mpago zenjazi awamu nga tutandira 
nokulandira ku zo ate nga awalala nga tuyuuguumira tunyegerera ku mkiribi 
n’eŋŋongogongo zaago.|<eos>| 
Mu kawozamasiga twagguka ku liyumba eritimbaganye ebimli nga biriboyedde lyonna era 
nga nebimu bireebeetera mu kisasi. Mwali mwakamu etaala ensiikirize, kimpoowooze. 
Twaatambula kasoobo (twerinnyako) nga tugenda tulikiiba naye nga tuliyita kumpi ddala 
emabega wakakomera akaali kajjudde enkanaga ne kyukompoleze. Nga tweekiise eriyumba 
eriyumba eryo mu bwanga, twalengera abajaasi abaJapan babiri nga boogera biwanvuwanvu, 
omu ku bon go officer. Mu bye nasobola okunojjola mu lukunkumuli lwebigambo bye baal 
bafukumula, kyawulikika nga baali basowaganira naŋŋanda eyali afaabina nga bwasobola 
okubatawulula naye ng’afuuwa mukka mu kisero. |<eos>| 
Twayimirira mu kakuukuulu nga mpaawo anyega ne tugabeegeka. Omujaasi ataali Offisa 
yasowola ekitala nakigalula nga alinga agenda okusanjaga Ofiisa, naye nga omukonogwe 
gukyakongobadde waggulu, Ofiisa yasika basitoola ku lukugunyu n’amubabika essasi 
n’ekitala nekimansuka eri. |<eos>| 
Omukazi yabikka engalo ze ku mumwa n’abaaluuka nnyo nga n’amaaso mu kiwanga 
tasigazzaamu. Nga afunyeeko are nga awambaatidde oluveeralwekiteeteeyi kye yatyagira ne 
yevumba ennyumba. Omukube yadduka nga akutte omukonogwe mu kifuba, nga 
abendabenda era akotyonka, n’agezaako alinnyelinnye amadaala ayingire mu nnyumba. 
Ettaala zo ku lubalaza zaatandika okumemuka.  Nga amalako amadaala, yalemererwa nagwa 
nga yevuunise. Offiisa yasooka okumugoberezesa amaaso noluvannyuma ye yennyini 
yamulumba. Bwe yamutuukako namutunuulira ng’akimba era nga yenyinyimbwa. 
Yamusindisa ekigere era omukube eyalabika ng’afudde yayiringitira ku madaala okutuusa 
lwe yawagamira mu kisasi awatonnya amazzi. Ofiisa nga abiina ensige nga nemikono gimuli 
mu nsawo yakyuka ayingire era twawulira amaloboozi agavuuvuuma naga gava mu nju gajja 
gasembera era ne tulengera nabantu nga bewungulawungula. Twaleka biri bwebityonga 
tugumbulukuka kukunta ku zaffe. |<eos>| 
Mu kawansazi (ttumbi)twagguka  mu kikubo ekyanfaananira ekikuute ky’ente. Taamutaamu 
yasindogoma nga ekiwuugulu. Eddoboozi ne limwanukula mu ngeri y’emu nga liyima mu 
gayinja agaali ku mabbali g’ekikubo. Eddoboozi lyelimu lyakaaba ng’ettutuma. Taamutaamu 
n’alyanukula nga akaaba ng’olubugabuga. Nga wayiseewo akaseera nalengera ekintu 
ekizijjagavu nga kyekkata mu kkubo okuva mu mayinja eddoboozi gye lyafulumanga.   
ŋŋenda okukyetegereza nga nga ndaba muntu asakaatidde mu byambalo.
Taamutaamu yadduka okumusisinkana era nga bamaze okugwaŋŋana mu bifuba, oli 
yagendako mu maaso katono nakukunula mu kasaka ekigaali ekyali kiwalulwa ensolo bbiri. 
Taamutaamu yasembera natuwenya tugende tulinye mu kigaali ekyaali kifaananako 
“kasimby’obwaala”. Nga tumaze okukkalira mu kyo, kyaatandika okutukunguzza nga 
bwekigenda kiguluba. Nali ntandise okutema ebisiki ne nsisimulwa okweggunda n’okuyuuga 
eby’ekitalo.|<eos>| 
Nga nyiimudde amaaso, nalaba ogusolo emabega waffe nga gutumezeeko. Gwabuuka 
enfunda ssatu zokka nga gwerippye dda ku lubugirizo lw’ekigaali. Gwansikulako Kataayi ku 
lusegere era nze okugenda okugezaako okumubakirira nga gwakuunye dda naye. Ekigaali 
tekyayimirira wadde okukendeeza ku misinde gyakyo era engeri gye kyesulikangamu nga 
kiweta ensonda, nga ekutabangula n’ebyenda are nga ddala tolowooza nti kinaddayo 
okutereera.  Naye nga tumazeeko obubirabira, omugoba yassa ekikkowe era nekisala ku 
mbiro zaakyo. Nga bwanyeenya omutwe yagamba nti, “Kino kitalo. Oyo munnaffe alugenze, 
jjagwa emulidde. Owa, obulamu buno buzibu!” Ensolo eno jjagwa etiibwa nnyo mu bitundu 
bino. Mu nkula nenfaanana eri wakati wa genge n’engo naye zombie ezireebeeseza wala mu 
maanyi, mu bukalabakalaba ne mu bukambwe. Abantu abali wakati w’enkumi ennya 
nenkumi ettaano be babweebweenebwa ensolo eno buli mwakaate abalala nga enkumi bbiri 
ne batirimbulwa ekisota ki tipitipi. Ekisot kino kikunguyivu nnyo era kifaananamu kaamuje. 
Kirina erinnyo liringa ejjindu lya sseggwanga are kya busagwa buyitirivu. Bwe kibojja 
omuntu afiirawo mbulaga ate bakibusalako busazi. Ekisingako ggwe kwe kukyesalako nga 
kyakakubojja sso notopaapaala bupaapaazi. Lugaba bwaba ng’akukwatiddeko oyinza 
okusumattuka  amagombe.|<eos>| 
Obudde buba busaasaana ne tuva ku kigaali ne tukwata akakubo akamawunjuwunju. 
Oluvannyuma lw’akatemerero, twaakavaamu ne tubandira mu nsiko. Obudde nga 
butangadde, twagguka mu kiwonvu ekyaali kumpi kyetooloddwa ebisozisozi kyonna, nga 
kirimu n’ogukonko omwali omugga ogutokota n’okusaala mu ngeri eyekitalo. Gwaali 
gwesengeseko agayinja n’enjazi ku mbangabanga zaagwo. Nalagibwa oluwokowoko mu jjinja 
okwali linnaalyo nga liryetengereddeko ng’erigenda okunegukako, ne ndagirwa nekukume 
omwo era nkozese ne byonna ebyalimu.|<eos>| 
Nze wano natandika okutya nga ndowooza nti oba nga abaJapan bawulidde omusinde 
gw’emmundu era nga bajja baguwondera, ddala nali mu katyabaga. Ate nendowooza nti 
omuwala bwanaddayo tajja kulema kulaalaasa bibaddewo era n’abantu abalala okubisansa ne 
bituuka ne ku ba kafulu. |<eos>| 
Nga nkyali mu birowoozo ebinnyogovu ng’ebyo ntongeza, Taamutaamu nagoba. Ddala 
yansala ku gwa kabugu. Yalabika nga munyiikaavu mpozzi olw’okunsanga n’omuwala. Naye 
sikutendera ngeri gye yatunulamu ng’amaasoge gagudde ku mutulumbi gwa jjagwa. 
Ng’ayasaamiridde, ensaya atadde, amaaso gakoonose mu kiwanga, yatunuulira jjagwa 
ng’anaagimira. Bwagigasimbulako nagazza ku ffe ate n’agigazzaako ate n’agazza. Ng’ali mu 
mbeera bwetyo, omuwala yevaamu namutegeeza byonna ebibaddewo. Yampomeka eriiso nga 
bwafuuwa ekiwa ng’akizza munda, ng’anyeenya n’omutwe, yagamba nti, “Oli musajja 
musajja wattu.” Yazza omuwala kubbali n’amukuba akaama. Oluvannyuma yajja nantegeeza 
nga bweyali agenda okuwerekera ki Wi (eryo lye lyali erinnya ly’omuwala) amuzzeeyo 
ewaabwe.|<eos>| |<endoftext>|]"""

In [6]:
# Build a tokenizer over `text`: vocab_size=400 allows up to 400 - 256 = 144 merges,
# and the two marker strings are registered as special tokens.
tokenizer = MyTokenizer(text, vocab_size=400, allowed_special=["|<endoftext>|", "|<eos>|"])

In [7]:
# Train on the corpus passed to the constructor; train() prints its progress and the compression ratio.
tokenizer.train()

Merging the 0th pair
Merging the 100th pair
tokens length:  13908
ids length:  6886
compression ratio: 2.02X


In [8]:
# Display the vocabulary as a dict of token id -> bytes, sorted by id.
tokenizer.get_vocabulary()

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [9]:
# Encode text that contains no special tokens; the bare `id_s` on the last line displays the ids.
id_s = tokenizer.encode("Hello world!!!")
id_s

Length of original tokens:  14
Length of tokens after handling special words:  14
Length of tokens after merging:  12


[72, 101, 108, 108, 260, 366, 114, 108, 100, 33, 33, 33]

In [10]:
# Decode into a new name so the training corpus held in `text` is not overwritten
# (re-running the tokenizer-construction cell would otherwise train on this short string).
decoded_text = tokenizer.decode(id_s)
print(decoded_text)

Hello world!!!


In [11]:
# Encode text containing the special tokens registered through allowed_special.
id_s2 = tokenizer.encode("Hey there |<eos>| Gyebaleko |<endoftext>|")
print(id_s2)

Length of original tokens:  41
Length of tokens after handling special words:  23
Length of tokens after merging:  17
[72, 101, 121, 32, 116, 104, 265, 257, 401, 32, 71, 305, 98, 261, 101, 299, 400]


In [12]:
# Decode into a new name so the training corpus held in `text` is not overwritten.
decoded_special = tokenizer.decode(id_s2)
print(decoded_special)

Hey there |<eos>| Gyebaleko |<endoftext>|


# Usage
1. Prepare the corpus (text data), including any special tokens you need.
2. Create the tokenizer, passing the text, `vocab_size` and the list of `allowed_special` tokens,
    e.g. `tokenizer = MyTokenizer(text, vocab_size=500, allowed_special=["|<endoftext>|", "|<eos>|"])`
3. Train the tokenizer,
    e.g. `tokenizer.train()`
4. Encode any new text to get its token ids,
    e.g. `ids = tokenizer.encode("Hello world")`
5. Decode a list of token ids back into text,
    e.g. `tokenizer.decode(ids)`
6. To inspect the vocabulary, use the `get_vocabulary` method,
   e.g. `tokenizer.get_vocabulary()`