In [67]:
# Load model directly
from transformers import AutoTokenizer, Gemma2ForCausalLM

hf_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", use_fast=True)
tr_tokenizer = AutoTokenizer.from_pretrained("alibayram/tr_tokenizer", use_fast=True)
hf_model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-2b-it")
hf_model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps

In [16]:
example_text = "Nasılsın, iyi misin? Türkçe cevap ver lütfen!"
input_ids = hf_tokenizer.encode(example_text, return_tensors="pt")
input_ids

tensor([[     2, 235300,  25725,   1560,   4119, 235269,  42233,   2586,    473,
         235336, 102823, 153384,    972,    533, 173077, 235341]])

In [18]:
# before changing of the embedding layer
output_token_ids = hf_model.generate(input_ids, max_length=100)
output_token_ids, hf_tokenizer.decode(output_token_ids[0])

(tensor([[     2, 235300,  25725,   1560,   4119, 235269,  42233,   2586,    473,
          235336, 102823, 153384,    972,    533, 173077, 235341,    109,    688,
          178229, 235341,   4658,    571, 190404,  14630,  56292, 235265,    139,
           13625,   7863, 235405,   1560,   4119, 235336, 206506,   2586,    473,
          235336,    688,    109,    688,   7510,  42233,  89249, 235269, 162625,
          165296, 235265,   5620,   7863, 235405,   1560,   4119, 235336,    688,
             109,    688, 235070,  89249, 235269, 162625, 165296, 235265,    139,
           13625,    679,  18168, 182564,    549, 235336,    688,    109,    688,
            7510,    581,   2843,  15128, 122469,  89249, 235269, 102823,  80781,
           21032,  44875, 235265,    139,  13625,    679,  18168, 182564,    549,
          235336,    688,    109,    688,   7510,    581, 102823,  80781,  21032,
           44875]]),
 '<bos>Nasılsın, iyi misin? Türkçe cevap ver lütfen!\n\n**Merhaba! Benim adım

In [40]:
hf_model.lm_head.weight.dtype

torch.float32

In [None]:
# torch.FloatTensor

In [25]:
import sys
from multiprocessing import Pool
from multiprocessing.reduction import ForkingPickler
from types import FunctionType
import cloudpickle

assert sys.version_info >= (3, 8), 'python3.8 or greater required to use reducer_override'

def reducer_override(obj):
    if type(obj) is FunctionType:
        return (cloudpickle.loads, (cloudpickle.dumps(obj),))
    else:
        return NotImplemented

# Monkeypatch our function reducer into the pickler for multiprocessing.
# Without this line, the main block will not work on windows or macOS.
# Alterntively, moving the defintionn of foo outside of the if statement
# would make the main block work on windows or macOS (when run from
# the command line).
ForkingPickler.reducer_override = staticmethod(reducer_override)

In [26]:
chunksize = 1000

chunks = [list(tr_tokenizer.vocab.keys())[i:i + chunksize] for i in range(0, len(tr_tokenizer.vocab), chunksize)]
len(chunks), chunks[0][:5]

(31, ['özellik', 'Tekmele', 'ejder', 'irdele', 'akordeon'])

In [None]:
counter = 0
if __name__ == '__main__':
  def find_tokens_map(tr_tokens):
    global counter
    token_maps = []
    for tr_token in tr_tokens:
        tr_token_id = tr_tokenizer.vocab[tr_token]
        cosmos_token_ids = hf_tokenizer.encode(tr_token)

        token_maps.append({
          "tr_token": tr_token,
          "tr_token_id": tr_token_id,
          "hf_token_ids": cosmos_token_ids
          })
        counter += 1
        print(counter, tr_token, tr_token_id, len(cosmos_token_ids))
    return token_maps
  
  with Pool(100) as p:
    token_maps_list = p.map(find_tokens_map, chunks)
    

In [28]:
import pandas as pd
token_list_for_df = []
for token_maps in token_maps_list:
  for token_map in token_maps:
    token_list_for_df.append(token_map)

df = pd.DataFrame(token_list_for_df)
df

Unnamed: 0,tr_token,tr_token_id,hf_token_ids
0,özellik,3095,"[2, 235397, 6085, 3871]"
1,Tekmele,26542,"[2, 36764, 42030]"
2,ejder,11233,"[2, 15117, 866]"
3,irdele,7765,"[2, 616, 37725]"
4,akordeon,26977,"[2, 738, 29141, 477]"
...,...,...,...
30153,bekri,26934,"[2, 18608, 505]"
30154,elan,16290,"[2, 89241]"
30155,model,3164,"[2, 2516]"
30156,Muti,16975,"[2, 235296, 2749]"


In [29]:
# remove first token from llama_token_ids and gemma2_token_ids
df["hf_token_ids"] = df["hf_token_ids"].apply(lambda x: x[1:])
df

Unnamed: 0,tr_token,tr_token_id,hf_token_ids
0,özellik,3095,"[235397, 6085, 3871]"
1,Tekmele,26542,"[36764, 42030]"
2,ejder,11233,"[15117, 866]"
3,irdele,7765,"[616, 37725]"
4,akordeon,26977,"[738, 29141, 477]"
...,...,...,...
30153,bekri,26934,"[18608, 505]"
30154,elan,16290,[89241]
30155,model,3164,[2516]
30156,Muti,16975,"[235296, 2749]"


In [68]:
hf_model.lm_head.weight.requires_grad

True

In [69]:
import torch

gemma_embeddings = hf_model.lm_head.weight
embeddings = torch.zeros(len(df), gemma_embeddings.shape[1])# torch.nn.parameter.Parameter(gemma_embeddings[:len(df)]) 
# torch.zeros(len(df), gemma_embeddings.shape[1])
type(embeddings), type(gemma_embeddings)

(torch.Tensor, torch.nn.parameter.Parameter)

In [57]:
# change first token embeddings
# embeddings[0] = gemma_embeddings[0]
# RuntimeError: a view of a leaf Variable that requires grad is being used in an in-place operation.


In [70]:
""" for token_map in token_list:
    index = token_map['tr_token_id']
    cosmos_token_ids = token_map['cosmos_token_ids']
    embedding = cosmos_embeddings[cosmos_token_ids[0]]
    sum_embedding = embedding
    for cosmos_token_id in cosmos_token_ids[1:]:
        embedding = embedding + cosmos_embeddings[cosmos_token_id]
    if len(cosmos_token_ids) > 1:
        embedding = embedding / len(cosmos_token_ids)        
    embeddings[index] = embedding

embeddings[0] """

for i, row in df.iterrows():
    tr_token_id = row["tr_token_id"]
    hf_token_ids = row["hf_token_ids"]
    embedding = gemma_embeddings[hf_token_ids[0]]
    sum_embedding = embedding
    for hf_token_id in hf_token_ids[1:]:
        embedding = embedding + gemma_embeddings[hf_token_id]
    if len(hf_token_ids) > 1:
        embedding = embedding / len(hf_token_ids)        
    embeddings[i] = embedding

    

In [71]:
p_embedding = torch.nn.Parameter(embeddings)
type(p_embedding), p_embedding.requires_grad

(torch.nn.parameter.Parameter, True)

In [72]:
# change the embedding layer
hf_model.lm_head.weight = p_embedding
hf_model.lm_head.weight.shape

torch.Size([30158, 2304])

In [73]:
hf_model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps

In [75]:
example_text = "Nasılsın, iyi misin? Türkçe cevap ver lütfen!"
input_ids = tr_tokenizer.encode(example_text, return_tensors="pt")
input_ids

tensor([[12312,  6055,    17,  2018,  6402,  1209,    36,  5813,    80,  1630,
         28927,  1344, 26387,     6]])

In [None]:
# after changing of the embedding layer
output_token_ids = hf_model.generate(input_ids, max_length=100)
output_token_ids, tr_tokenizer.decode(output_token_ids[0])