In [25]:
import os
import torch
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from turkish_tokenizer import TurkishTokenizer

# Tokenizer'ı başlat
tr_tokenizer = TurkishTokenizer()

gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m-it")
gemma_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m-it")
gemma_model

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((640,), eps

In [28]:
from gemma_claude import create_gemma3_model

model_claude = create_gemma3_model(vocab_size=32768)

model_save_path = "claude_model.pth"
# check if saved model exists
if os.path.exists(model_save_path):
    model_claude.load_state_dict(torch.load(model_save_path))
    print("loaded model from", model_save_path)
else:
    model_state = gemma_model.state_dict()
    model_state.pop('lm_head.weight')
    model_state.pop('model.embed_tokens.weight')
    model_claude.load_state_dict(model_state, strict=False)
    print("loaded model from gemma model")
    from tokenizer_matcher import TokenizerMatcher


    tokenizer_matcher = TokenizerMatcher(
        source_tokenizer=gemma_tokenizer,
        target_tokenizer=tr_tokenizer,
        source_model=gemma_model,
        target_model=model_claude
    )

    matched_embeddings = tokenizer_matcher.match_embeddings(adding_style="sum")
    tokenizer_matcher.change_target_model_embeddings(matched_embeddings)
    print("matched embeddings")
    # save claude model using torch 
    torch.save(model_claude.state_dict(), model_save_path)
    print("saved model to", model_save_path)

model_claude


loaded model from claude_model.pth


Gemma3ForCausalLM(
  (model): Gemma3Model(
    (embed_tokens): Gemma3ScaledWordEmbedding(32768, 640, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (q_norm): Gemma3RMSNorm()
          (k_norm): Gemma3RMSNorm()
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
          (act_fn): GELU(approximate='tanh')
        )
        (input_layernorm): Gemma3RMSNorm()
        (post_attention_layernorm): Gemma3RMS

In [None]:
# Automatic device detection
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
device

'mps'

In [None]:


prompt = "Nasılsın?"

torch.manual_seed(42)
tokens_ids = tr_tokenizer.encode(prompt)
outs = model_claude.generate(torch.tensor([tokens_ids]), temperature=1.0)
# '<bos>How are you? I really enjoyed being the Muse Association Dinner Bites Venture & Innovate Guyer ceremony\n> Got to do screens with screen mates\n> dreamed of being pencil swift samples night\n> like +5, picture-perfect elegance\n'

tr_tokenizer.decode(outs[0].tolist())

'Nasılsın?▁128536gi̇bi̇omurgalılarek_temp_20072kok_temp_19598kok_temp_19920kok_temp_19914ek_temp_20191553special_76kok_temp_1991619853381863kok_temp_19793kok_temp_19823kok_temp_19959kok_temp_19731295ek_temp_20108kok_temp_19706ek_temp_20229azımsabüyükanne480337ek_temp_20246kok_temp_199411872kok_temp_199781820kok_temp_195751851special_61236186128079ek_temp_20099ek_temp_20225444kok_temp_19976kok_temp_19619540kok_temp_19845kok_temp_19702'

In [None]:
print("embeddings of 157 and 30158 should be the same claude")
print(model_claude.model.embed_tokens(torch.tensor([157]))[:,:15])
print("-"*20, "gemma")
print(gemma_model.model.embed_tokens(torch.tensor([30158]))[:,:15])

print("weights of any layer should be the same claude")
print(model_claude.model.layers[0].self_attn.q_proj.weight[0:1, 0:10])
print("-"*20, "gemma")
print(gemma_model.model.layers[0].self_attn.q_proj.weight[0:1, 0:10])

embeddings of 157 and 30158 should be the same claude
tensor([[ 0.4416,  0.9512,  1.0129, -0.0926, -0.5219, -1.2538, -0.1451, -1.1735,
          1.4885, -0.5281,  1.1859, -0.0598,  0.4107,  1.3094,  0.4045]],
       grad_fn=<SliceBackward0>)
-------------------- gemma
tensor([[ 0.4416,  0.9512,  1.0129, -0.0926, -0.5219, -1.2538, -0.1451, -1.1735,
          1.4885, -0.5281,  1.1859, -0.0598,  0.4107,  1.3094,  0.4045]],
       grad_fn=<SliceBackward0>)
weights of any layer should be the same claude
tensor([[-0.0170,  0.0107, -0.0175, -0.0049,  0.0083,  0.0053,  0.0072, -0.0077,
         -0.0038,  0.0017]], grad_fn=<SliceBackward0>)
-------------------- gemma
tensor([[-0.0170,  0.0107, -0.0175, -0.0049,  0.0083,  0.0053,  0.0072, -0.0077,
         -0.0038,  0.0017]], grad_fn=<SliceBackward0>)


Target token 157 ('beniz') mapped to source tokens: [30158] (['benz'])
Target token 30158 ('ffffff') mapped to source tokens: [62923] (['ffffff'])
Changed 32763 embeddings out of 32763 matched tokens


32763

In [13]:
matched_tokens = tokenizer_matcher.match_tokens()
matched_tokens[2697]

[3555]

In [None]:
tokens_ids = gemma_tokenizer.encode("Nasılsın?")
torch.manual_seed(42)
outs = model_claude.generate(torch.tensor([tokens_ids]), temperature=1.0)
tr_tokenizer.decode(outs[0].tolist())

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


'<bos>Nasılsın?\n\n**Merhaba!**\n\nMerhaba, bana size nasıl yardımcı olacağını söyleyebilirim?\n\n'

In [None]:
from datasets import load_dataset

# Load the English dataset from the latest dump
ds_wiki = load_dataset("omarkamali/wikipedia-monthly", "20250703.tr", split="train")
ds_wiki

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Dataset({
    features: ['id', 'url', 'title', 'raw_mediawiki', 'text'],
    num_rows: 641443
})

In [None]:
# first 100 to text with title
text = ""
for i in range(100):
    text += ds_wiki[i]["title"] + "\n" + ds_wiki[i]["text"] + "\n"

while "\n " in text:
    text = text.replace("\n ", "\n")

while "\n\n" in text:
    text = text.replace("\n\n", "\n")


with open("data.txt", "w", encoding="utf-8") as f:
    f.write(text)

print(len(text))

1394660


In [None]:
from text_dataset import create_dataset, create_dataloader

with open("data.txt", "r", encoding="utf-8") as f:
    text = f.read()

seq_len = 512
stride = 1
batch_size = 12
shuffle = False

train_dataset = create_dataset(text[:10000], tokenizer, seq_len, stride, device)
# dataset = create_or_load_dataset(text[:50000], tr_tokenizer, seq_len, stride, device, cache_path="dataset/wiki1")
train_dataloader = create_dataloader(train_dataset, batch_size, shuffle)

valid_dataset = create_dataset(text[10000:12000], tokenizer, seq_len, stride, device)
valid_dataloader = create_dataloader(valid_dataset, batch_size, shuffle)

TextDataset: 2896 tokens, seq_len=512, stride=1
Calculated sequences: 2385
TextDataset: 577 tokens, seq_len=512, stride=1
Calculated sequences: 66


In [None]:
from gemma_trainer_claude import TrainingConfig

config = TrainingConfig(
    learning_rate=5e-5,
    batch_size=batch_size,
    gradient_accumulation_steps=2,
    max_epochs=10,
    seq_len=seq_len,
    logging_steps=50,
    eval_steps=200,
    save_steps=500,
    output_dir="./test_checkpoints",
    device="auto",
    mixed_precision=True,
    eval_dataset_ratio=0.1,
    do_eval=True,
    warmup_steps=100,
    max_grad_norm=1.0,
    weight_decay=0.01
)

Found Apple Silicon GPU (MPS), using MPS device


In [None]:
from gemma_trainer_claude import create_trainer


trainer = create_trainer(
        model=model_claude,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        config=config,
        tokenizer=tokenizer
    )

2025-08-17 21:34:01,048 - gemma_trainer_claude - INFO - Using Apple Silicon GPU (MPS)
2025-08-17 21:34:01,049 - gemma_trainer_claude - INFO - Mixed precision training enabled (MPS)
2025-08-17 21:34:01,050 - gemma_trainer_claude - INFO - Model moved to mps
2025-08-17 21:34:01,051 - gemma_trainer_claude - INFO - Total parameters: 435,870,336
2025-08-17 21:34:01,051 - gemma_trainer_claude - INFO - Trainable parameters: 435,870,336
2025-08-17 21:34:01,053 - gemma_trainer_claude - INFO - Optimizer: adamw
2025-08-17 21:34:01,053 - gemma_trainer_claude - INFO - Scheduler: cosine_with_warmup
2025-08-17 21:34:01,053 - gemma_trainer_claude - INFO - Total training steps: 990


In [None]:
import os


try:
    trainer.train()
    print("\n=== Training Completed Successfully! ===")
except KeyboardInterrupt:
    print("\n=== Training Interrupted by User ===")
    # Save final checkpoint
    final_checkpoint_dir = os.path.join("./test_checkpoints", "interrupted_model")
    trainer.save_checkpoint(final_checkpoint_dir)
    print(f"Checkpoint saved to: {final_checkpoint_dir}")
except Exception as e:
    print(f"\n=== Training Failed: {e} ===")
    raise

2025-08-17 21:35:17,329 - gemma_trainer_claude - INFO - Starting training...
2025-08-17 21:35:17,333 - gemma_trainer_claude - INFO - Training config: {'model_name': 'gemma3', 'learning_rate': 5e-05, 'min_learning_rate': 1e-06, 'weight_decay': 0.01, 'max_epochs': 10, 'max_steps': None, 'warmup_steps': 100, 'warmup_ratio': 0.1, 'seq_len': 512, 'batch_size': 12, 'gradient_accumulation_steps': 2, 'max_grad_norm': 1.0, 'logging_steps': 50, 'eval_steps': 200, 'save_steps': 500, 'save_total_limit': 3, 'output_dir': './test_checkpoints', 'logging_dir': './test_checkpoints/logs', 'device': 'mps', 'mixed_precision': True, 'compile_model': False, 'eval_dataset_ratio': 0.1, 'do_eval': True, 'optimizer_type': 'adamw', 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-08, 'scheduler_type': 'cosine_with_warmup'}
2025-08-17 21:35:17,339 - gemma_trainer_claude - INFO - Epoch 1/10



=== Training Interrupted by User ===


2025-08-17 21:44:15,901 - gemma_trainer_claude - INFO - Checkpoint saved to ./test_checkpoints/interrupted_model


Checkpoint saved to: ./test_checkpoints/interrupted_model


In [None]:
%pip install turkish-tokenizer -U


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [None]:

tokenizer.name_or_path

'google/gemma-3-270m-it'

In [None]:
model

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((640,), eps

In [None]:
from gemma_model import GemmaForCausalLM, get_config_for_270m

config_270m = get_config_for_270m("float32")

gemma_model = GemmaForCausalLM(config_270m, tokenizer)
gemma_model.load_weights_from_hf(model.model.state_dict())
# gemma_model.from_pretrained("gemma-3-270m-hf-it")


HF model vocab size: 262144, Current model vocab size: 262144
Missing keys: ['local_freqs_cis', 'global_freqs_cis']
Successfully loaded 236 weights


In [None]:
import torch

min_prompt_len = 5
output_positions_tensor = torch.tensor([min_prompt_len - 1], dtype=torch.long)

hidden_states = torch.randn(1, 8)
print(hidden_states)
hidden_states = hidden_states.index_select(1, output_positions_tensor).squeeze(dim=1)
hidden_states

tensor([[-0.4189, -0.0205, -1.4623,  0.2724,  0.2938, -0.0337, -1.2584,  2.4883]])


tensor([0.2938])

In [None]:
gemma_model

GemmaForCausalLM(
  (embedder): Embedding(262144, 640)
  (model): GemmaModel(
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (query_norm): RMSNorm()
          (key_norm): RMSNorm()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
        (pre_feedforward_layernorm): RMSNorm()
        (post_feedforward_layernorm): RMSNorm()
  

In [None]:
gemma_model.state_dict().keys()

odict_keys(['local_freqs_cis', 'global_freqs_cis', 'model.embedder.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.self_attn.query_norm.weight', 'model.layers.0.self_attn.key_norm.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.0.pre_feedforward_layernorm.weight', 'model.layers.0.post_feedforward_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.self_attn.query_norm.weight', 'model.layers.1.self_attn.key_norm.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.mlp.dow

In [None]:
prompt = "Merhaba, nasılsın?"

'mps'

In [None]:
gemma_model.generate(prompt)


'\n\nBen, "Öyle bir şey bulmak istiyorsan, bir arkadaşım veya bir ofis çalışanının yanında bulunabilirsin. Konuşmak için çok mu hızlı bir şekilde ve kolay bir şekilde, kısa süzen bir zaman ardında, olumlu bir ortamda çalışabilirsin?"\n\nBu metni kendi özür ifadeleriyle sunun.\nBu komutu farklı bir konuma veya ihtiyaçlara göre uyarlayabilirsiniz.\nÖ puissiezize veya ihtiyacınızın ne'

In [None]:
gemma_model.save_pretrained("gemma-3-270m-hf-it")

Model saved to gemma-3-270m-hf-it


In [None]:


# Metin tokenizasyonu
text = """
Ali Ata Bak▁ aliler ahmetler selmanlar da bizde onlar da testte kitapta kitabını okudum bu işe
 
 burnunu sokma
Burun buruna bir kaza\toldu
 """
tokens = tr_tokenizer.encode(text)
print("Token IDs:", tokens)

# Token'ları metne geri çevir
decoded_text = tr_tokenizer.decode(tokens)
print("Decoded:", decoded_text)

Token IDs: [3, 0, 2697, 2, 0, 2212, 2, 0, 2794, 1, 2, 18194, 20043, 2, 6766, 20000, 2, 17321, 20000, 2, 20024, 2, 2595, 20024, 2, 2627, 2, 20024, 2, 3045, 20024, 2, 227, 20024, 2, 227, 15247, 2, 2656, 10572, 2, 2503, 2, 2599, 20038, 3, 3, 2, 165, 20021, 20035, 2, 20064, 4373, 20002, 3, 0, 165, 2, 165, 20037, 2, 2501, 2, 3303, 4, 2502, 20026, 3, 2]
Decoded: 
Ali Ata Bak▁u▁ aliler ahmetler selmanlar da bizde onlar da testte kitapta kitabını okudum bu işe

 burnunu sokma
Burun burna bir kaza	oldu
 


In [None]:
from text_dataset import create_dataset, create_dataloader

with open("data.txt", "r", encoding="utf-8") as f:
    text = f.read()

seq_len = 512
stride = 1
batch_size = 12
shuffle = False

train_dataset = create_dataset(text[:10000], tr_tokenizer, seq_len, stride, device)
# dataset = create_or_load_dataset(text[:50000], tr_tokenizer, seq_len, stride, device, cache_path="dataset/wiki1")
train_dataloader = create_dataloader(train_dataset, batch_size, shuffle)



In [None]:
valid_dataset = create_dataset(text[10000:12000], tr_tokenizer, seq_len, stride, device)
valid_dataloader = create_dataloader(valid_dataset, batch_size, shuffle)

In [None]:
train_dataset.get_sequence_info(), valid_dataset.get_sequence_info()

({'seq_len': 512,
  'stride': 1,
  'num_sequences': 3875,
  'tokenizer_name': 'unknown'},
 {'seq_len': 512,
  'stride': 1,
  'num_sequences': 503,
  'tokenizer_name': 'unknown'})

In [None]:
from gemma_model import GemmaForCausalLM, get_config_for_270m_tr_tokenizer


config_270m = get_config_for_270m_tr_tokenizer("float32")

gemma_model = GemmaForCausalLM(config_270m, tr_tokenizer, device)
# gemma_model.load_weights_from_hf(model.model.state_dict())
gemma_model.from_pretrained("gemma-3-270m-tr-tokenizer-it")


Model loaded from gemma-3-270m-tr-tokenizer-it


In [None]:
gemma_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embedder): Embedding(32768, 640)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (query_norm): RMSNorm()
          (key_norm): RMSNorm()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
        (pre_feedforward_layernorm): RMSNorm()
        (post_feedforward_layernorm): RMSNorm()
 

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from model_trainer import ModelTrainer

optimizer = optim.AdamW(gemma_model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

# 3. Instantiate and Run the Trainer
trainer = ModelTrainer(
    model=gemma_model,
    train_loader=train_dataloader,
    val_loader=valid_dataloader,
    optimizer=optimizer,
    criterion=criterion,
    scheduler=scheduler,
    device=device,
    epochs=10,
    patience=3
)

trainer.train()

print("\nTrainer has finished. The `trainer.model` now holds the best weights.")

Starting training on device: mps


                                                                           

Epoch 01/10 | Train Loss: 0.5769 | Val Loss: 13.9190 | LR: 0.001000
Validation loss decreased (inf --> 13.9190). Saving model...


                                                                          

KeyboardInterrupt: 

In [None]:
gemma_model.generate(prompt)

TypeError: index_copy_() received an invalid combination of arguments - got (int, NoneType, Tensor), but expected one of:
 * (int dim, Tensor index, Tensor source)
      didn't match because some of the arguments have invalid types: (int, !NoneType!, Tensor)
 * (name dim, Tensor index, Tensor source)
      didn't match because some of the arguments have invalid types: (!int!, !NoneType!, Tensor)


In [None]:

# Generate text (no device parameter needed)
response = gemma_model.generate(
    prompts="Hello, how are you?",
    output_len=50,
    temperature=0.7
)

In [None]:
response

'ek_temp_20212ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244ek_temp_20244'

In [None]:
gemma_model.generate(prompt)


'ek_temp_20200ek_temp_20084ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_200

In [None]:
import torch

gemma_model.model.embedder(torch.tensor([157]).to(gemma_model.device))[:,:15]

tensor([[-1.7912,  0.1602,  0.1088, -2.0236,  1.2162, -0.8047, -0.3950,  0.3219,
         -0.0877,  0.2307,  0.3475,  0.3288,  0.6641,  1.6956, -0.2081]],
       device='mps:0', grad_fn=<SliceBackward0>)

In [None]:
gemma_model.embedder.weight.shape

torch.Size([32768, 640])

In [None]:
model.model.embed_tokens(torch.tensor([30158]))[:,:15]

tensor([[ 0.4416,  0.9512,  1.0129, -0.0926, -0.5219, -1.2538, -0.1451, -1.1735,
          1.4885, -0.5281,  1.1859, -0.0598,  0.4107,  1.3094,  0.4045]],
       grad_fn=<SliceBackward0>)

In [None]:
model.model.layers[4].mlp.gate_proj.weight[234][:15], gemma_model.model.layers[4].mlp.gate_proj.weight[234][:15]

(tensor([ 0.0264, -0.0087,  0.0054,  0.0137, -0.0089,  0.0084,  0.0043, -0.0322,
          0.0408,  0.0034, -0.0100,  0.0056, -0.0010,  0.0203, -0.0325],
        grad_fn=<SliceBackward0>),
 tensor([ 0.0264, -0.0087,  0.0054,  0.0137, -0.0089,  0.0084,  0.0043, -0.0322,
          0.0408,  0.0034, -0.0100,  0.0056, -0.0010,  0.0203, -0.0325],
        device='mps:0', grad_fn=<SliceBackward0>))

In [None]:
gemma_model.save_pretrained("gemma-3-270m-tr-tokenizer-it")

Model saved to gemma-3-270m-tr-tokenizer-it


In [None]:
import torch
tensors = tokenizer.encode(prompt)
tensors = torch.tensor(tensors)
tensors = tensors.unsqueeze(0)
tensors = tensors.to("cpu")

ids = model.generate(tensors)
ids

In [None]:
tokenizer.decode(ids[0])

In [None]:
model_1b = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-pt")
model_1b

In [None]:
all_tr_tokens = tr_tokenizer.get_vocab()

print(len(all_tr_tokens))
print(len(tr_tokenizer.reverse_dict))
print(tr_tokenizer.vocab_size)

In [None]:
all_tr_tokens["ali"]

In [None]:
from tokenizer_matcher import TokenizerMatcher


tokenizer_matcher = TokenizerMatcher(tokenizer, tr_tokenizer, model, gemma_model)
matched_embeddings = tokenizer_matcher.match_embeddings()

tokenizer_matcher.change_target_model_embeddings(matched_embeddings)

gemma_model.generate(prompt)

Changed 32763 embeddings out of 32763 matched tokens


'ek_temp_20200ek_temp_20084ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20074ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_20077ek_temp_200

In [None]:
matched_tokens = tokenizer_matcher.match_tokens()
matched_tokens[157]


[2, 30158]

In [None]:
tokenizer.vocab_size

262144

In [None]:
token = "soğucuk"
ids = tokenizer.encode(token)
tr_ids = tr_tokenizer.encode(token)

print(ids)
print(tr_ids)

print(tokenizer.tokenize(token))
print(tr_tokenizer.tokenize(token))
print(tr_tokenizer.decode(tr_ids))

In [None]:
example_ids = [157, 165, 5]


In [None]:
tokenizer.eos_token_id

In [None]:
model.model.state_dict().keys()


In [None]:
model.model.embed_tokens.weight[30158][:15]

tensor([ 0.0175,  0.0376,  0.0400, -0.0037, -0.0206, -0.0496, -0.0057, -0.0464,
         0.0588, -0.0209,  0.0469, -0.0024,  0.0162,  0.0518,  0.0160],
       grad_fn=<SliceBackward0>)

In [None]:
model.lm_head.weight[30158][:15]

tensor([ 0.0175,  0.0376,  0.0400, -0.0037, -0.0206, -0.0496, -0.0057, -0.0464,
         0.0588, -0.0209,  0.0469, -0.0024,  0.0162,  0.0518,  0.0160],
       grad_fn=<SliceBackward0>)