In [None]:
# Load model directly
from transformers import GemmaTokenizerFast

gemma2_tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-2-2b-it")
gemma2_tokenizer.is_fast

True

In [3]:
from transformers import AutoTokenizer

tr_tokenizer = AutoTokenizer.from_pretrained("alibayram/tr_tokenizer")
tr_tokenizer.is_fast

True

In [7]:
gemma2_tokenizer.vocab

{'ebilir': 178884,
 '▁nuevamente': 83881,
 '▁반': 78821,
 'ether': 70073,
 '▁nanti': 84902,
 'ヨーロッパ': 139068,
 '📁': 247737,
 'culated': 227694,
 'Bone': 54234,
 'report': 12960,
 '▁Pelham': 206977,
 'nie': 4188,
 'Rik': 133466,
 'Lanz': 195309,
 '▁volando': 212958,
 '⢄': 252483,
 '▁trasero': 75435,
 '▁Oester': 229197,
 '作业': 97241,
 'Ѕ': 245473,
 'Doge': 151190,
 '褐': 241819,
 '▁llevará': 135680,
 '憷': 252489,
 '▁Shen': 34493,
 '▁reviving': 168890,
 '▁gamle': 105984,
 'PARENT': 84901,
 'lüğ': 124366,
 '▁памят': 107892,
 'ݿ': 254481,
 'daten': 45759,
 'änger': 30064,
 '▁Communauté': 173573,
 '(!(': 94799,
 '▁comfortably': 54957,
 'perturb': 218088,
 'зай': 165370,
 '▁Decision': 33355,
 '▁fédé': 158256,
 '障碍': 129670,
 '▁Gn': 100069,
 '▁模拟': 209273,
 'IMAGE': 10503,
 'horizontalLayout': 162141,
 '▁Disco': 42888,
 '▁celé': 169768,
 '▁recognized': 14712,
 '诶': 242160,
 '▁couvrir': 191342,
 '▁dummies': 180365,
 '왠': 255108,
 '▁procedure': 10286,
 'astéroïdes': 218549,
 'вяз': 208525,
 'ทะ': 

In [6]:
tr_tokenizer.vocab['a'], gemma2_tokenizer.vocab['a']

(70, 235250)

In [11]:
# create a dataframe with 3 columns tr_token, tr_token_id, gemma2_token_id for each token
import pandas as pd

df = pd.DataFrame(columns=["tr_token", "tr_token_id", "gemma2_token_id"])
counter = 0
for tr_token in tr_tokenizer.vocab:
    tr_token_id = tr_tokenizer.vocab[tr_token]
    try:
      gemma2_token_id = gemma2_tokenizer.vocab[tr_token]
    except Exception as e1:
       print("Token not found in Gemma2 vocab directly: ", tr_token, e1)
       try:
         gemma2_token_id = gemma2_tokenizer.vocab["▁" + tr_token]
       except Exception as e:
          print("Token not found in Gemma2 vocab with '▁' prefix: ", tr_token, e)
          gemma2_token_id = None          
    #      AttributeError: 'DataFrame' object has no attribute 'append'
    """ df = df.append(
        {"tr_token": tr_token, "tr_token_id": tr_token_id, "gemma2_token_id": gemma2_token_id},
        ignore_index=True,
    ) """
    counter += 1
    print(counter, tr_token, tr_token_id, gemma2_token_id)
    df = pd.concat([df, pd.DataFrame([{"tr_token": tr_token, "tr_token_id": tr_token_id, "gemma2_token_id": gemma2_token_id}])])

df

Token not found in Gemma2 vocab directly:  salavat 'salavat'
Token not found in Gemma2 vocab with '▁' prefix:  salavat '▁salavat'
1 salavat 21105 None
Token not found in Gemma2 vocab directly:  iyel 'iyel'
Token not found in Gemma2 vocab with '▁' prefix:  iyel '▁iyel'
2 iyel 6395 None
Token not found in Gemma2 vocab directly:  muharebe 'muharebe'
Token not found in Gemma2 vocab with '▁' prefix:  muharebe '▁muharebe'
3 muharebe 24267 None
Token not found in Gemma2 vocab directly:  bumerang 'bumerang'
Token not found in Gemma2 vocab with '▁' prefix:  bumerang '▁bumerang'
4 bumerang 27137 None
5 yat 1550 26558
Token not found in Gemma2 vocab directly:  kalsit 'kalsit'
Token not found in Gemma2 vocab with '▁' prefix:  kalsit '▁kalsit'
6 kalsit 16853 None
Token not found in Gemma2 vocab directly:  dönük 'dönük'
Token not found in Gemma2 vocab with '▁' prefix:  dönük '▁dönük'
7 dönük 26790 None
Token not found in Gemma2 vocab directly:  Mızrak 'Mızrak'
Token not found in Gemma2 vocab with '▁

KeyboardInterrupt: 

In [14]:
# count the number of tokens that are not found in Gemma2 vocab
df.count, df[df["gemma2_token_id"].isnull()].shape

(<bound method DataFrame.count of      tr_token tr_token_id gemma2_token_id
 0     salavat       21105            None
 0        iyel        6395            None
 0    muharebe       24267            None
 0    bumerang       27137            None
 0         yat        1550           26558
 ..        ...         ...             ...
 0     fahriye       17991            None
 0    hiperbol       28451            None
 0   Düzenleme       10313            None
 0       Kopya        8324            None
 0      haczet       22344            None
 
 [3283 rows x 3 columns]>,
 (2525, 3))

In [15]:
import sys
from multiprocessing import Pool
from multiprocessing.reduction import ForkingPickler
from types import FunctionType
import cloudpickle

assert sys.version_info >= (3, 8), 'python3.8 or greater required to use reducer_override'

def reducer_override(obj):
    if type(obj) is FunctionType:
        return (cloudpickle.loads, (cloudpickle.dumps(obj),))
    else:
        return NotImplemented

# Monkeypatch our function reducer into the pickler for multiprocessing.
# Without this line, the main block will not work on windows or macOS.
# Alterntively, moving the defintionn of foo outside of the if statement
# would make the main block work on windows or macOS (when run from
# the command line).
ForkingPickler.reducer_override = staticmethod(reducer_override)



In [20]:
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
llama_tokenizer.is_fast

True

In [21]:
llama_tokenizer.vocab

{'ĠFedEx': 86335,
 'Ġpul': 7893,
 'Trade': 40273,
 'è¿ª': 108620,
 'ZD': 127829,
 'ØµØ§Øª': 118216,
 '}>': 13815,
 'Ġbac': 80980,
 'Ġì¦ī': 116786,
 'Ð¾Ð±Ð°Ð²': 36960,
 '@Json': 83141,
 'à¹ĩà¸ķà¸²à¸¡': 127003,
 'ubuntu': 39079,
 'ĠÐ¿Ð¾Ð²': 96034,
 'azer': 23697,
 'ucu': 106538,
 'Iran': 62819,
 'etto': 53979,
 'bots': 63005,
 'ç±³': 73361,
 'ãĤ¹ãĥĨ': 106826,
 'ĠWeather': 23454,
 'Ġcomic': 20303,
 'Ġ...(': 86498,
 'Ġà¤īà¤¦': 106441,
 'ViewItem': 98313,
 'ÐºÐ¾Ð²Ð¸Ð¹': 119249,
 'SEARCH': 44645,
 '.Green': 65110,
 'Ġrefr': 19914,
 'ĠConcept': 35455,
 'ĠÚ©ÙĦ': 103805,
 'GNUC': 66475,
 'ĠAdolf': 90585,
 'fone': 32441,
 'ĠJin': 39611,
 'Ġabolition': 76445,
 'Î®Î»': 125124,
 'ĠArizona': 17368,
 'ìĽĲìĿĦ': 125160,
 'Ġskeptical': 44929,
 'Ð½Ð¸ÑĨÑĤÐ²Ð¾': 114049,
 'Ġserum': 41529,
 ')`Ċ': 50337,
 'Mul': 60255,
 'quipe': 61673,
 '.general': 56890,
 'ĠHol': 16071,
 'Ġhern': 72909,
 'ĠfrÃ¦kke': 99503,
 'ãĤģãģŁ': 109013,
 "Ġ[{'": 62208,
 'ĠFerm': 99362,
 'Set': 1681,
 'ĠjsonString': 80789,
 'HandlerCont

In [29]:
chunksize = 1000

chunks = [list(tr_tokenizer.vocab.keys())[i:i + chunksize] for i in range(0, len(tr_tokenizer.vocab), chunksize)]
len(chunks), chunks[0][:5]

(31, ['salavat', 'iyel', 'muharebe', 'bumerang', 'yat'])

In [47]:
gemma2_tokenizer.encode("yat")

[2, 26558]

In [48]:
counter = 0
if __name__ == '__main__':
  def find_tokens_map(tr_tokens):
    global counter
    token_maps = []
    for tr_token in tr_tokens:
        tr_token_id = tr_tokenizer.vocab[tr_token]
        llama_token_ids = llama_tokenizer.encode(tr_token)
        gemma2_token_ids = gemma2_tokenizer.encode(tr_token)

        token_maps.append({
          "tr_token": tr_token,
          "tr_token_id": tr_token_id,
          "llama_token_ids": llama_token_ids,
          "gemma2_token_ids": gemma2_token_ids
          })
        counter += 1
        print(counter, tr_token, tr_token_id, len(gemma2_token_ids), len(llama_token_ids))
    return token_maps
  
  with Pool(100) as p:
    token_maps_list = p.map(find_tokens_map, chunks)
    

87 yapılmıştır 4712 3 5
65 hoşlan 11662 4 4
88 Divane 14286 3 3
32 Sayfa 10306 3 3
27 1999 2682 5 3
884 Pepsin 14331 3 4
885 üy 1383 2 2
243 çalık 26414 3 3
28 Ufak 8786 3 3
33 Girift 22259 3 4
886 kooperatif 7982 4 4
244 musakka 17562 3 4
89 Uğraş 7626 5 5
66 20 1250 3 2
887 Sifilis 19071 3 4
29 Doğaç 26529 4 4
67 periton 11240 3 3
90 Titrek 18569 3 3
30 Kroşe 19205 3 5
31 Golf 8199 2 3
168 menfez 18608 3 4
34 Dalkavuk 10026 5 5
245 Telepatik 19161 4 4
91 Ikametgah 19051 4 5
35 Kitre 22017 3 3
246 hilat 16578 3 3
92 bohça 17503 3 4
68 yel 22231 2 3
169 Lobi 30032 3 3
93 kırç 7163 4 3
1 teokrasi 14559 4 5
94 imgele 23616 3 4
69 alçak 28672 3 3
36 yaygara 19145 3 5
888 Iguana 13065 4 4
95 teizm 10010 3 3
96 hovarda 18477 4 4
2 Kolektivizm 23447 4 6889 argüman 9017 3 4
32 Sök 27811 3 3

170 gerilim 21355 3 3
890 pey 8986 2 3
247 Bakara 12710 3 4
33 Coş 23312 3 3
9737 hükmet 18142 4  ron 2499 2 2
4
70 istifle 7253 4 3
34 Çatlak 23984 4 4
3 Göbek 20923 3 4
171 น 634 2 2
38 Heybe 29986 3 3


In [49]:
token_list_for_df = []
for token_maps in token_maps_list:
  for token_map in token_maps:
    token_list_for_df.append(token_map)

df = pd.DataFrame(token_list_for_df)
df

Unnamed: 0,tr_token,tr_token_id,llama_token_ids,gemma2_token_ids
0,salavat,21105,"[128000, 19776, 402, 266]","[2, 7871, 58714]"
1,iyel,6395,"[128000, 116710]","[2, 24179, 521]"
2,muharebe,24267,"[128000, 76, 12825, 548, 1395]","[2, 2704, 34471, 555]"
3,bumerang,27137,"[128000, 65, 3471, 526]","[2, 235268, 1198, 754]"
4,yat,1550,"[128000, 99715]","[2, 26558]"
...,...,...,...,...
30153,yusufçuk,9124,"[128000, 88, 355, 1739, 121762]","[2, 235267, 159529, 232370]"
30154,dığı,2232,"[128000, 103711]","[2, 235258, 33624]"
30155,Feyiz,15148,"[128000, 37, 1216, 450]","[2, 235311, 1221, 716]"
30156,Prosedür,14794,"[128000, 1360, 32424, 5297]","[2, 1087, 800, 1943]"


In [52]:
# remove first token from llama_token_ids and gemma2_token_ids
df["llama_token_ids"] = df["llama_token_ids"].apply(lambda x: x[1:])
df["gemma2_token_ids"] = df["gemma2_token_ids"].apply(lambda x: x[1:])
df

Unnamed: 0,tr_token,tr_token_id,llama_token_ids,gemma2_token_ids
0,salavat,21105,"[19776, 402, 266]","[7871, 58714]"
1,iyel,6395,[116710],"[24179, 521]"
2,muharebe,24267,"[76, 12825, 548, 1395]","[2704, 34471, 555]"
3,bumerang,27137,"[65, 3471, 526]","[235268, 1198, 754]"
4,yat,1550,[99715],[26558]
...,...,...,...,...
30153,yusufçuk,9124,"[88, 355, 1739, 121762]","[235267, 159529, 232370]"
30154,dığı,2232,[103711],"[235258, 33624]"
30155,Feyiz,15148,"[37, 1216, 450]","[235311, 1221, 716]"
30156,Prosedür,14794,"[1360, 32424, 5297]","[1087, 800, 1943]"


In [56]:
df.to_json("tr_token_mapping.json",orient="records")