In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM
org_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
org_tokenizer.save_pretrained("gemma3")

('gemma3/tokenizer_config.json',
 'gemma3/chat_template.jinja',
 'gemma3/tokenizer.json')

In [14]:
org_model = AutoModelForCausalLM.from_pretrained("google/embeddinggemma-300m")
org_model.save_pretrained("gemma3")
source_embeddings = org_model.model.embed_tokens.weight.to(org_model.device)
source_embeddings.shape[1]

Loading weights:   0%|          | 0/314 [00:00<?, ?it/s]

768

In [13]:
org_model = AutoModelForCausalLM.from_pretrained("gemma3_cloned")
source_embeddings = org_model.model.embed_tokens.weight.to(org_model.device)
source_embeddings.shape

Loading weights:   0%|          | 0/314 [00:00<?, ?it/s]

torch.Size([32768, 768])

In [7]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("alibayram/tabi-downstream-task-embeddinggemma")

query = "Yapay zeka gelecekte önemli olacak"
documents = [
    "Makine öğrenmesi geleceğin teknolojisidir",
    "Bugün hava çok güzel",
    "Türkiye'de turizm sektörü büyüyor"
]

query_embedding = model.encode(query)
doc_embeddings = model.encode(documents)

similarities = model.similarity(query_embedding, doc_embeddings)
print(similarities)

Loading weights:   0%|          | 0/314 [00:00<?, ?it/s]

tensor([[0.3616, 0.2702, 0.3002]])


In [7]:
import json

with open("org_vocab.json", "w", encoding="utf-8") as f:
  json.dump(org_tokenizer.get_vocab(), f, ensure_ascii=False, indent=4)



In [10]:
# Import the module directly
import turkish_tokenizer as tt

tokenizer = tt.TurkishTokenizer()

text = "yapılırken kırılırken okuyor söylüyordu başlanmıştır bilindik Ankara'yla renga renk Türkiye'nin Türkiye'nin Türkiye'de"

tokens = tokenizer.tokenize(text)
ids = tokenizer.encode(text)

print(tokens)
print(ids)
print(tokenizer.decode(ids))

[' yapılır', 'ken', ' kır', 'ı', 'lır', 'ken', ' oku', 'yor', ' söylü', 'yor', 'du', ' başla', 'n', 'mış', 'tır', ' bilin', 'di', 'k', '<uppercase>', ' ankara', "'", 'yla', ' reng', 'a', ' renk', '<uppercase>', ' türkiye', "'", 'nin', '<uppercase>', ' türkiye', "'", 'nin', '<uppercase>', ' türkiye', "'", 'de']
[19767, 20007, 3766, 20034, 20071, 20007, 2656, 20041, 2306, 20041, 20026, 2206, 20040, 20016, 20028, 19736, 20026, 20108, 0, 3399, 20078, 20023, 245, 20037, 245, 0, 4563, 20078, 20022, 0, 4563, 20078, 20022, 0, 4563, 20078, 20024]
 yapılırkan kırılırkan okuyor söylüyordu başlanmüştür bilindük ankara'la renga renk türkiye'nın türkiye'nın türkiye'da


In [11]:
tokenizer.tokenize("Yapay zeka ve makine öğrenmesi")

['<uppercase>', ' yapay', ' zeka', ' ve', ' makine', ' öğren', 'me', 'si']

In [8]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("gemma3_cloned", custom_tokenizer=tokenizer)

query = "Yapay zeka gelecekte önemli olacak"
documents = [
    "Makine öğrenmesi geleceğin teknolojisidir",
    "Bugün hava çok güzel",
    "Türkiye'de turizm sektörü büyüyor"
]

query_embedding = model.encode(query)
doc_embeddings = model.encode(documents)

similarities = model.similarity(query_embedding, doc_embeddings)
print(similarities)

Loading weights:   0%|          | 0/314 [00:00<?, ?it/s]

tensor([[0.3636, 0.3003, 0.1316]])


In [9]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("magibu_cloned", custom_tokenizer=tokenizer)

query = "Yapay zeka gelecekte önemli olacak"
documents = [
    "Makine öğrenmesi geleceğin teknolojisidir",
    "Bugün hava çok güzel",
    "Türkiye'de turizm sektörü büyüyor"
]

query_embedding = model.encode(query)
doc_embeddings = model.encode(documents)

similarities = model.similarity(query_embedding, doc_embeddings)
print(similarities)

Loading weights:   0%|          | 0/314 [00:00<?, ?it/s]

tensor([[0.4850, 0.2780, 0.2800]])


In [37]:
import json

with open("kokler.json", "r", encoding="utf-8") as f:
    kokler = json.load(f)

with open("ekler.json", "r", encoding="utf-8") as f:
    ekler = json.load(f)

len(kokler), len(ekler)

(21761, 177)

In [38]:
with open("sorted_freq_cosmos.json", "r", encoding="utf-8") as f:
    sorted_freq_cosmos = json.load(f)

len(sorted_freq_cosmos)

88885

In [39]:
last_key = list(kokler.keys())[-1]
last_id = kokler[last_key]
last_key, last_id

(' token', 19529)

In [40]:
for token, _ in sorted_freq_cosmos.items():
  if token.startswith(" ") and token.islower() and token not in kokler:
    last_id += 1
    kokler[token] = last_id
    
    if last_id > 19998:
      break

In [26]:
with open("yeni_kokler.json", "w", encoding="utf-8") as f:
    json.dump(kokler, f, ensure_ascii=False, indent=4)

In [34]:
"1".islower(), "1".isdigit(), "!".isidentifier()

(False, True, False)

In [62]:
last_key = list(ekler.keys())[-1]
last_id = ekler[last_key]
last_key, last_id

('lır', 20071)

In [53]:
2 ** 15

32768

In [63]:
bpe_tokens = {}
for token, _ in sorted_freq_cosmos.items():
  token = token.lower()
  if len(token) < 4 and token not in kokler and token not in ekler and token not in bpe_tokens:
    last_id += 1
    bpe_tokens[token] = last_id

for token, _ in sorted_freq_cosmos.items():
  token = token.lower()
  if len(token) < 5 and token not in kokler and token not in ekler and token not in bpe_tokens:
    last_id += 1
    bpe_tokens[token] = last_id

    if last_id >= 2 ** 15:
      break


len(bpe_tokens)

12697

In [None]:
with open("yeni_bpe_tokenler.json", "w", encoding="utf-8") as f:
    json.dump(bpe_tokens, f, ensure_ascii=False, indent=4)

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("alibayram/TabiBERT-tokenizer-32k")
tokenizer.tokenize("Yapay zeka ve makine öğrenmesi")

['Yap', 'ay', 'Ġzek', 'aĠveĠ', 'makineĠ', 'Ã¶ÄŁren', 'mesi']

In [12]:
tokenizer.push_to_hub("alibayram/TabiBERT-tokenizer-32k")

CommitInfo(commit_url='https://huggingface.co/alibayram/TabiBERT-tokenizer-32k/commit/f18e474d7f6f6f686a0a7dda314cfb64c1d83d28', commit_message='Upload tokenizer', commit_description='', oid='f18e474d7f6f6f686a0a7dda314cfb64c1d83d28', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alibayram/TabiBERT-tokenizer-32k', endpoint='https://huggingface.co', repo_type='model', repo_id='alibayram/TabiBERT-tokenizer-32k'), pr_revision=None, pr_num=None)

In [7]:
tokenizer.save_pretrained("TabiBERT")

('TabiBERT/tokenizer_config.json', 'TabiBERT/tokenizer.json')

In [14]:
tokenizer.encode(text)

[4,
 13576,
 1682,
 14817,
 686,
 1355,
 25239,
 1784,
 2519,
 22607,
 1563,
 2357,
 599,
 8956,
 268,
 13867,
 1811,
 7676,
 6003,
 2007,
 268,
 1013,
 2007,
 268,
 1013,
 2007,
 268,
 548,
 5]

In [20]:
from transformers import AutoTokenizer

cosmos_tokenizer = AutoTokenizer.from_pretrained("ytu-ce-cosmos/turkish-gpt2-large")

cosmos_tokenizer.tokenize("Merhaba, nasılsınız?")

tokenizer_config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

['Merhaba', ',', 'ĠnasÄ±l', 'sÄ±nÄ±z', '?']

In [21]:
cosmos_tokenizer.save_pretrained("cosmos_tokenizer")

('cosmos_tokenizer/tokenizer_config.json', 'cosmos_tokenizer/tokenizer.json')

In [23]:
import json

with open("./cosmos_tokenizer/tokenizer.json", "r") as f:
    tokenizer_json = json.load(f)
print(len(tokenizer_json))

9


In [24]:
# lets remove all tokens from the tokenizer.json file that with id bigger than 2**15 and their respected merges from the merges list.

vocab = tokenizer_json["model"]["vocab"]
merges = tokenizer_json["model"]["merges"]

len(vocab), len(merges)

(50257, 50000)

In [25]:
"".join(merges[0])

'ar'

In [26]:
new_vocab = {}
for token, id in vocab.items():
    if id >= 2 ** 15:
        for merge in merges:
            merged = "".join(merge)
            if merged == token:
                merges.remove(merge)
        if id % 1000 == 0:
            print(token, id)
        continue
    new_vocab[token] = id
    if id % 1000 == 0:
        print(token, id)

<|endoftext|> 0
orm 1000
ana 2000
ĠÃ¶deme 3000
ĠkapÄ 4000
yip 5000
ĠoranÄ±nda 6000
Ġker 7000
ĠÃĸrgÃ¼t 8000
ĠDel 9000
Ãĸncelikle 10000
ĠyapÄ±ya 11000
Ġkaydedildi 12000
ĠFiyatÄ± 13000
wood 14000
ĠSekreter 15000
ĠboÅŁan 16000
ĠsoÄŁutma 17000
Ġsuik 18000
iyordum 19000
ĠgÃ¶rerek 20000
ĠaÄŁzÄ± 21000
Ġisteniyor 22000
Ġimtihan 23000
ĠborsasÄ± 24000
Ġbilinmiyor 25000
ĠhoÅŁuna 26000
Ġserum 27000
Ġbiliyorsun 28000
Ġbitiminde 29000
Ġokunur 30000
ĠsalgÄ±nÄ±nÄ±n 31000
YouTube 32000
ĠSantr 33000
Alex 34000
Ġemirler 35000
ĠmeydanÄ 36000
Ġcephesi 37000
ĠGenetik 38000
Ted 39000
ĠANLAT 40000
ĠYayÄ±nlandÄ± 41000
aile 42000
Ġdavetliler 43000
ĠÅŀekilde 44000
Bot 45000
ĠkÃ¼mesi 46000
ĠRiyad 47000
General 48000
Ġdershane 49000
Ġnispet 50000


In [27]:
tokenizer_json["model"]["vocab"] = new_vocab
with open("./cosmos_tokenizer/tokenizer_n.json", "w", encoding="utf-8") as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)

In [28]:
new_cosmos_tokenizer = AutoTokenizer.from_pretrained("cosmos_tokenizer")
print(new_cosmos_tokenizer.vocab_size)
new_cosmos_tokenizer.push_to_hub("alibayram/cosmosGPT2-tokenizer-32k")

32768


CommitInfo(commit_url='https://huggingface.co/alibayram/cosmosGPT2-tokenizer-32k/commit/19804c7091b340a69879b9dad326321460914cc9', commit_message='Upload tokenizer', commit_description='', oid='19804c7091b340a69879b9dad326321460914cc9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alibayram/cosmosGPT2-tokenizer-32k', endpoint='https://huggingface.co', repo_type='model', repo_id='alibayram/cosmosGPT2-tokenizer-32k'), pr_revision=None, pr_num=None)

In [None]:
"newmindai/Mursit-Base-TR-Retrieval"
mursit_tokenizer = AutoTokenizer.from_pretrained("newmindai/Mursit-Base-TR-Retrieval")

print(mursit_tokenizer.tokenize("Merhaba, nasılsınız?"))

mursit_tokenizer.save_pretrained("mursit_tokenizer")
with open("./mursit_tokenizer/tokenizer.json", "r") as f:
    tokenizer_json = json.load(f)
print(len(tokenizer_json))

vocab = tokenizer_json["model"]["vocab"]
merges = tokenizer_json["model"]["merges"]

print(len(vocab), len(merges))

new_vocab = {}
for token, id in vocab.items():
    if id >= 2**15:
        for merge in merges:
            merged = "".join(merge)
            if merged == token:
                merges.remove(merge)
        if id % 1000 == 0:
            print(token, id)
        continue
    new_vocab[token] = id
    if id % 1000 == 0:
        print(token, id)