In [1]:
import pandas as pd
import time
from transformers import AutoTokenizer
from is_turkish import get_is_turkish
from is_pure import get_is_pure

# Load the dataset and keep only the question ("soru") and answer-options
# ("secenekler") columns, which are the only ones used to build the corpus.
df = pd.read_parquet("hf://datasets/alibayram/yapay_zeka_turkce_mmlu_model_cevaplari/data/train-00000-of-00001.parquet")
df = df[["soru", "secenekler"]]

# Collect every question followed by its options, in row order, then join once.
# This avoids building a Series per row (iterrows) and growing one big string
# piece by piece; the resulting text is the same: each item followed by "\n".
corpus_lines = []
for soru, secenekler in zip(df["soru"], df["secenekler"]):
    corpus_lines.append(soru)
    corpus_lines.extend(secenekler)
text = "".join(line + "\n" for line in corpus_lines)

len(text)



1605376

In [2]:
# Count the whitespace-separated words in the corpus.
len(text.split())

198193

In [54]:
def score_tokenizer(text: str, tokenizer_score_map: dict) -> dict:
  """Benchmark one Hugging Face tokenizer on ``text`` and record the results.

  Args:
    text: Corpus to tokenize.
    tokenizer_score_map: Dict whose ``"tokenizer"`` entry is the identifier
      passed to ``AutoTokenizer.from_pretrained``. It is updated in place.

  Returns:
    The same ``tokenizer_score_map`` object with these keys added:
    ``vocab-size``, ``tokens-count``, ``time`` (seconds spent encoding the text
    and decoding every token id one by one, rounded to 4 places),
    ``unique-token-count``, ``turkish-token-count``, ``turkish-token-percent``,
    ``pure-token-count`` and ``pure-token-percent``. The two "percent" values
    are fractions of ``unique-token-count`` rounded to 4 places, or 0.0 when
    there are no unique tokens.
  """
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_score_map["tokenizer"])
  tokenizer_score_map["vocab-size"] = len(tokenizer.get_vocab())

  # perf_counter is monotonic, so the measured interval cannot be skewed by
  # wall-clock adjustments the way time.time() can.
  time_start = time.perf_counter()

  token_ids = tokenizer.encode(text)
  tokenizer_score_map["tokens-count"] = len(token_ids)

  tokens = []
  for token_id in token_ids:
    try:
      tokens.append(tokenizer.decode(token_id))
    except Exception as error:
      # Best effort: skip ids that fail to decode, but report the cause.
      # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
      print("Error: ", token_id, error)

  time_end = time.perf_counter()
  tokenizer_score_map["time"] = round(time_end - time_start, 4)

  # get_is_turkish is used as a mapping: its length is the unique-token count
  # and its values are summed as booleans.
  is_turkish_map = get_is_turkish(set(tokens))
  unique_token_count = len(is_turkish_map)
  tokenizer_score_map["unique-token-count"] = unique_token_count

  turkish_token_count = sum(is_turkish_map.values())
  tokenizer_score_map["turkish-token-count"] = turkish_token_count
  # Guard the ratio so an empty token set yields 0.0 instead of ZeroDivisionError.
  tokenizer_score_map["turkish-token-percent"] = (
    round(turkish_token_count / unique_token_count, 4) if unique_token_count else 0.0
  )

  # get_is_pure receives the unique tokens as one space-joined string.
  is_pure_map = get_is_pure(" ".join(is_turkish_map.keys()))
  pure_token_count = sum(is_pure_map.values())
  tokenizer_score_map["pure-token-count"] = pure_token_count
  tokenizer_score_map["pure-token-percent"] = (
    round(pure_token_count / unique_token_count, 4) if unique_token_count else 0.0
  )

  return tokenizer_score_map

In [55]:
# Tokenizer identifiers to benchmark; each one is handed to
# AutoTokenizer.from_pretrained by score_tokenizer.
tokenizers = [
    "google/gemma-2-9b",
    "alibayram/tr_tokenizer",
    "AhmetSemih/tr_tokenizer",
    "aliarda/turkish_tokenizer_256k",
    "aliarda/turkish_tokenizer",
    "meta-llama/Llama-3.2-3B",
    "utter-project/EuroLLM-9B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "CohereForAI/aya-expanse-8b",
    "openai-community/gpt2",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "microsoft/Phi-3.5-mini-instruct",
    "Trendyol/Trendyol-LLM-8b-chat-v2.0",
    "ytu-ce-cosmos/turkish-gpt2-large-750m-instruct-v0.1",
]

In [None]:
# Score every tokenizer in turn, echoing each result as soon as it is ready.
tokenizer_score_maps = []

for tokenizer_name in tokenizers:
  scored = score_tokenizer(text, {"tokenizer": tokenizer_name})
  tokenizer_score_maps.append(scored)
  print(scored)

# One row per tokenizer; persist the table without the DataFrame index.
df = pd.DataFrame(tokenizer_score_maps)
df.to_csv("tokenizer_score_maps.csv", index=False)

In [57]:
# Show the score table (a bare last expression is rendered by the notebook).
df

Unnamed: 0,tokenizer,vocab-size,tokens-count,time,unique-token-count,turkish-token-count,turkish-token-percent,pure-token-count,pure-token-percent
0,google/gemma-2-9b,256000,497015,2.95,6383,3104,0.4863,2365,0.3705
1,alibayram/tr_tokenizer,30158,476556,2.4231,11531,11342,0.9836,11055,0.9587
2,AhmetSemih/tr_tokenizer,59572,451883,2.4849,13370,13253,0.9912,13357,0.999
3,aliarda/turkish_tokenizer_256k,256000,488267,2.5124,13631,13351,0.9795,12981,0.9523
4,aliarda/turkish_tokenizer,58526,451936,2.3406,13268,13170,0.9926,13256,0.9991
5,meta-llama/Llama-3.2-3B,128256,488535,3.1249,6823,3125,0.458,2109,0.3091
6,utter-project/EuroLLM-9B-Instruct,128000,497173,3.2019,5226,2457,0.4701,1838,0.3517
7,Qwen/Qwen2.5-7B-Instruct,151665,561866,3.315,5752,2320,0.4033,1734,0.3015
8,CohereForAI/aya-expanse-8b,255029,434526,2.7651,8562,4338,0.5067,2822,0.3296
9,openai-community/gpt2,50257,821139,4.3765,3454,1582,0.458,1119,0.324


In [3]:
import tiktoken
# Load the o200k_base encoding and check that encode -> decode round-trips a sample string.
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# Rebind `enc` to the tokenizer that tiktoken associates with the "gpt-4o" model name.
# score_openai_tokenizer reads this module-level `enc`.
enc = tiktoken.encoding_for_model("gpt-4o")
# Decode two sample token ids to inspect the text they stand for.
enc.decode([43519, 95938])

' Risk yönet'

In [4]:
def score_openai_tokenizer(text: str, tokenizer_score_map: dict, encoding=None) -> dict:
  """Benchmark a tiktoken encoding on ``text`` and record the results.

  Args:
    text: Corpus to tokenize.
    tokenizer_score_map: Dict to receive the scores. It is updated in place.
    encoding: Object providing ``n_vocab``, ``encode`` and ``decode`` (a
      tiktoken encoding). Defaults to the notebook-level ``enc``.

  Returns:
    The same ``tokenizer_score_map`` object with these keys added:
    ``vocab-size``, ``tokens-count``, ``time`` (seconds spent encoding the text
    and decoding every token id one by one, rounded to 4 places),
    ``unique-token-count``, ``turkish-token-count``, ``turkish-token-percent``,
    ``pure-token-count`` and ``pure-token-percent``. The two "percent" values
    are fractions of ``unique-token-count`` rounded to 4 places, or 0.0 when
    there are no unique tokens.
  """
  # Fall back to the notebook-level `enc` so existing two-argument calls
  # behave exactly as before.
  if encoding is None:
    encoding = enc

  tokenizer_score_map["vocab-size"] = encoding.n_vocab

  # perf_counter is monotonic, so the measured interval cannot be skewed by
  # wall-clock adjustments the way time.time() can.
  time_start = time.perf_counter()

  token_ids = encoding.encode(text)
  tokenizer_score_map["tokens-count"] = len(token_ids)

  tokens = []
  for token_id in token_ids:
    try:
      tokens.append(encoding.decode([token_id]))
    except Exception as error:
      # Best effort: skip ids that fail to decode, but report the cause.
      # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
      print("Error: ", token_id, error)

  time_end = time.perf_counter()
  tokenizer_score_map["time"] = round(time_end - time_start, 4)

  # get_is_turkish is used as a mapping: its length is the unique-token count
  # and its values are summed as booleans.
  is_turkish_map = get_is_turkish(set(tokens))
  unique_token_count = len(is_turkish_map)
  tokenizer_score_map["unique-token-count"] = unique_token_count

  turkish_token_count = sum(is_turkish_map.values())
  tokenizer_score_map["turkish-token-count"] = turkish_token_count
  # Guard the ratio so an empty token set yields 0.0 instead of ZeroDivisionError.
  tokenizer_score_map["turkish-token-percent"] = (
    round(turkish_token_count / unique_token_count, 4) if unique_token_count else 0.0
  )

  # get_is_pure receives the unique tokens as one space-joined string.
  is_pure_map = get_is_pure(" ".join(is_turkish_map.keys()))
  pure_token_count = sum(is_pure_map.values())
  tokenizer_score_map["pure-token-count"] = pure_token_count
  tokenizer_score_map["pure-token-percent"] = (
    round(pure_token_count / unique_token_count, 4) if unique_token_count else 0.0
  )

  return tokenizer_score_map

In [None]:
# Score the tiktoken encoding on the same corpus, labelled for the results table.
openai_tokenizer_score_map = score_openai_tokenizer(
  text,
  {"tokenizer": "openai/o200k_base-gpt-4o"},
)

This may lead to errors when urllib3 tries to modify verify_mode.
Please report an issue at https://gitlab.com/alelec/pip-system-certs with your
python version included in the description



In [None]:
# Append the OpenAI row to the scores saved by the Hugging Face run.
df = pd.read_csv("tokenizer_score_maps.csv")
# ignore_index gives the appended row its own label instead of a second 0.
df = pd.concat([df, pd.DataFrame([openai_tokenizer_score_map])], ignore_index=True)
# Re-running this cell would otherwise add the same tokenizer to the CSV again;
# keep only the newest row per tokenizer so the cell is idempotent.
df = df.drop_duplicates(subset="tokenizer", keep="last").reset_index(drop=True)
df.to_csv("tokenizer_score_maps.csv", index=False)
df

Unnamed: 0,tokenizer,vocab-size,tokens-count,time,unique-token-count,turkish-token-count,turkish-token-percent,pure-token-count,pure-token-percent
0,google/gemma-2-9b,256000,497015,2.95,6383,3104,0.4863,2365,0.3705
1,alibayram/tr_tokenizer,30158,476556,2.4231,11531,11342,0.9836,11055,0.9587
2,AhmetSemih/tr_tokenizer,59572,451883,2.4849,13370,13253,0.9912,13357,0.999
3,aliarda/turkish_tokenizer_256k,256000,488267,2.5124,13631,13351,0.9795,12981,0.9523
4,aliarda/turkish_tokenizer,58526,451936,2.3406,13268,13170,0.9926,13256,0.9991
5,meta-llama/Llama-3.2-3B,128256,488535,3.1249,6823,3125,0.458,2109,0.3091
6,utter-project/EuroLLM-9B-Instruct,128000,497173,3.2019,5226,2457,0.4701,1838,0.3517
7,Qwen/Qwen2.5-7B-Instruct,151665,561866,3.315,5752,2320,0.4033,1734,0.3015
8,CohereForAI/aya-expanse-8b,255029,434526,2.7651,8562,4338,0.5067,2822,0.3296
9,openai-community/gpt2,50257,821139,4.3765,3454,1582,0.458,1119,0.324
