

```python
GemmaForCausalLM(
  (embedder): Embedding()
  (model): GemmaModel(
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): GemmaAttention(
          (qkv_proj): Linear()
          (o_proj): Linear()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear()
          (up_proj): Linear()
          (down_proj): Linear()
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
        (pre_feedforward_layernorm): RMSNorm()
        (post_feedforward_layernorm): RMSNorm()
      )
    )
    (norm): RMSNorm()
  )
  (sampler): Sampler()
)
```

In [2]:
len(hf_tokenizer.vocab), hf_tokenizer.tokenize("Deneme 123")

(256000, ['D', 'ene', 'me', '▁', '1', '2', '3'])

In [4]:
import torch
from sampler import Sampler
from gemma_functions import precompute_freqs_cis, apply_rotary_emb
from linear import Linear
from embedding import Embedding
from rms_norm import RMSNorm
from gemma_mlp import GemmaMLP
from gemma_attention import GemmaAttention
from gemma_config import get_config_for_2b_v2
from gemma2_decoder_layer import Gemma2DecoderLayer
from gemma_model import GemmaModel
from gemma_for_causal_lm import GemmaForCausalLM


# Load model directly
from transformers import AutoTokenizer, Gemma2ForCausalLM

hf_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", use_fast=True)
hf_model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-2b-it")


# Load the configuration
config = get_config_for_2b_v2()
config.vocab_size = len(hf_tokenizer.vocab)
torch.set_default_dtype(torch.float32)

embedding = Embedding(config.vocab_size, config.hidden_size)
# embedding.load_from_path("../model_weights/embedder.pth")
embedding.weight = hf_model.lm_head.weight


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
model_weights_path = "../model_weights/model/"

layers = []
for i in range(25):
    layer_path = model_weights_path + f"layers/layer_{i}/"
    attn_type = (
        config.attn_types[i]
        if config.attn_types is not None
        else config.AttentionType.GLOBAL
    )
    num_heads = config.num_attention_heads
    num_kv_heads = config.num_key_value_heads
    
    qkv_proj = Linear(
        config.hidden_size,
        (num_heads + 2 * num_kv_heads) * config.head_dim
    )
    qkv_proj.load_from_path(layer_path + "qkv_proj.pth")

    o_proj = Linear(
        num_heads * config.head_dim,
        config.hidden_size
    )
    o_proj.load_from_path(layer_path + "o_proj.pth")

    attention = GemmaAttention(
        hidden_size=config.hidden_size,
        num_heads=num_heads,
        num_kv_heads=num_kv_heads,
        attn_logit_softcapping=config.attn_logit_softcapping,
        query_pre_attn_scalar=config.query_pre_attn_scalar,
        head_dim=config.head_dim,
        attn_type=attn_type,
        qkv_proj=qkv_proj,
        o_proj=o_proj,
        sliding_window_size=config.sliding_window_size,
    )
    gate_proj = Linear(config.hidden_size, config.intermediate_size)
    gate_proj.load_from_path(layer_path + "gate_proj.pth")

    up_proj = Linear(config.hidden_size, config.intermediate_size)
    up_proj.load_from_path(layer_path + "up_proj.pth")

    down_proj = Linear(config.intermediate_size, config.hidden_size)
    down_proj.load_from_path(layer_path + "down_proj.pth")

    # Initialize the GemmaMLP
    mlp = GemmaMLP(gate_proj=gate_proj, up_proj=up_proj, down_proj=down_proj)
    
    input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    input_layernorm.load_from_path(layer_path + "input_layernorm.pth")

    post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    post_attention_layernorm.load_from_path(layer_path + "post_attention_layernorm.pth")

    pre_feedforward_layernorm = (
        RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        if config.use_pre_ffw_norm
        else None
    )
    if pre_feedforward_layernorm is not None:
        pre_feedforward_layernorm.load_from_path(layer_path + "pre_feedforward_layernorm.pth")

    post_feedforward_layernorm = (
        RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        if config.use_post_ffw_norm
        else None
    )
    if post_feedforward_layernorm is not None:
        post_feedforward_layernorm.load_from_path(layer_path + "post_feedforward_layernorm.pth")

    decoder_layer = Gemma2DecoderLayer(
        self_attn=attention,
        mlp=mlp,
        input_layernorm=input_layernorm,
        post_attention_layernorm=post_attention_layernorm,
        pre_feedforward_layernorm=pre_feedforward_layernorm,
        post_feedforward_layernorm=post_feedforward_layernorm,
    )
    layers.append(decoder_layer)

  weight_tensor = torch.load(path)
  weight_tensor = torch.load(path)


In [6]:
model = GemmaModel(config, layers)
model.to("cpu")

sampler = Sampler(config.vocab_size)

gemma_model = GemmaForCausalLM(config, tokenizer=hf_tokenizer, embedding=embedding, model=model, sampler=sampler)
gemma_model.to("cpu")
gemma_model

GemmaForCausalLM(
  (embedder): Embedding()
  (model): GemmaModel(
    (layers): ModuleList(
      (0-24): 25 x Gemma2DecoderLayer(
        (self_attn): GemmaAttention(
          (qkv_proj): Linear()
          (o_proj): Linear()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear()
          (up_proj): Linear()
          (down_proj): Linear()
        )
        (input_layernorm): RMSNorm()
        (post_attention_layernorm): RMSNorm()
        (pre_feedforward_layernorm): RMSNorm()
        (post_feedforward_layernorm): RMSNorm()
      )
    )
    (norm): RMSNorm()
  )
  (sampler): Sampler()
)

In [10]:
gemma_model.generate("Nasılsın iyi misin?", device='cpu')

prompt_tokens: [[2, 235300, 25725, 1560, 4119, 42233, 2586, 473, 235336]]
kv_caches len: 26 prompt_tokens len: 1


'\n'

In [8]:
hf_tokenizer_b = AutoTokenizer.from_pretrained("alibayram/tr_tokenizer", is_fast=True)

In [9]:
hf_model.lm_head

Linear(in_features=2304, out_features=256000, bias=False)

In [11]:
embedding = Embedding(config.vocab_size, config.hidden_size)
embedding.load_from_path("../model_weights/embedder.pth")
hf_model.lm_head.weight = embedding.weight
hf_model

  weight_tensor = torch.load(path)


Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps

In [12]:
input_ids = hf_tokenizer_b.encode("Nasılsın, iyi misin?", return_tensors="pt")
input_ids

tensor([[12312,  6055,    17,  2018,  6402,  1209,    36]])

In [13]:
output = hf_model.generate(input_ids)
hf_tokenizer_b.decode(output[0])

The 'max_batch_size' argument of HybridCache is deprecated and will be removed in v4.46. Use the more precisely named 'batch_size' argument instead.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


'Nasıl sın , iyi mis in ? <eos>'

In [10]:
# Define dimensions
batch_size = 2
seq_len = 8
hidden_size = config.hidden_size
intermediate_size = config.intermediate_size
num_heads = config.num_attention_heads
num_kv_heads = config.num_key_value_heads
head_dim = config.head_dim

input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
pre_feedforward_layernorm = (
    RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    if config.use_pre_ffw_norm
    else None
)
post_feedforward_layernorm = (
    RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
    if config.use_post_ffw_norm
    else None
)

# Initialize decoder layer
decoder_layer = Gemma2DecoderLayer(
    self_attn=attention,
    mlp=mlp,
    input_layernorm=input_layernorm,
    post_attention_layernorm=post_attention_layernorm,
    pre_feedforward_layernorm=pre_feedforward_layernorm,
    post_feedforward_layernorm=post_feedforward_layernorm,
)

# Create input tensors
hidden_states = torch.randn(batch_size, seq_len, hidden_size)
freqs_cis = torch.randn(seq_len, head_dim // 2, dtype=torch.complex64)
kv_write_indices = torch.arange(seq_len)
kv_cache = (torch.zeros(batch_size, seq_len, num_kv_heads, head_dim),
            torch.zeros(batch_size, seq_len, num_kv_heads, head_dim))
mask = torch.zeros(batch_size, num_heads, seq_len, seq_len)

# Forward pass
output = decoder_layer(
    hidden_states=hidden_states,
    freqs_cis=freqs_cis,
    kv_write_indices=kv_write_indices,
    kv_cache=kv_cache,
    mask=mask,
)

# Print output shape
print("Output shape:", output.shape)  # Expected: (batch_size, seq_len, hidden_size)


xq.shape, xk.shape, xv.shape: torch.Size([2, 8, 8, 256]) torch.Size([2, 8, 4, 256]) torch.Size([2, 8, 4, 256])
Output shape: torch.Size([2, 8, 2304])


In [8]:
# Define dimensions
batch_size = 2
seq_len = 8
hidden_size = config.hidden_size
intermediate_size = config.intermediate_size
num_heads = config.num_attention_heads
num_kv_heads = config.num_key_value_heads
head_dim = config.head_dim
attn_logit_softcapping = config.attn_logit_softcapping
sliding_window_size = config.sliding_window_size
attn_type = config.attn_types[0]  # Use the first attention type in the sequence

# Create input tensors
hidden_states = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float32)
freqs_cis = torch.randn(seq_len, head_dim // 2, dtype=torch.complex64)  # Ensure seq_len matches
kv_write_indices = torch.arange(seq_len)  # Example indices for key-value caching
k_cache = torch.zeros(batch_size, seq_len, num_kv_heads, head_dim, dtype=torch.float32)
v_cache = torch.zeros(batch_size, seq_len, num_kv_heads, head_dim, dtype=torch.float32)
kv_cache = (k_cache, v_cache)  # Cache for keys and values
mask = torch.zeros(batch_size, num_heads, seq_len, seq_len, dtype=torch.float32)  # Example mask

# Define linear projections for QKV and output
qkv_proj = Linear(
    hidden_size,
    (num_heads + 2 * num_kv_heads) * head_dim
)
o_proj = Linear(
    num_heads * head_dim,
    hidden_size
)

# Initialize the GemmaAttention module
attention = GemmaAttention(
    hidden_size=hidden_size,
    num_heads=num_heads,
    num_kv_heads=num_kv_heads,
    attn_logit_softcapping=attn_logit_softcapping,
    query_pre_attn_scalar=config.query_pre_attn_scalar,
    head_dim=head_dim,
    attn_type=attn_type,
    qkv_proj=qkv_proj,
    o_proj=o_proj,
    sliding_window_size=sliding_window_size,
)

# Forward pass through the attention module
output = attention(
    hidden_states=hidden_states,
    freqs_cis=freqs_cis,
    kv_write_indices=kv_write_indices,
    kv_cache=kv_cache,
    mask=mask,
)

# Print the output shape for validation
print("Output shape:", output.shape)  # Expected: (batch_size, seq_len, hidden_size)

xq.shape, xk.shape, xv.shape: torch.Size([2, 8, 8, 256]) torch.Size([2, 8, 4, 256]) torch.Size([2, 8, 4, 256])
Output shape: torch.Size([2, 8, 2304])


In [9]:
# Import the necessary modules
import torch
from linear import Linear  # Assumes `Linear` is defined in `linear.py`

# Define dimensions
batch_size = 2
seq_len = 8
hidden_size = config.hidden_size
intermediate_size = config.intermediate_size
num_heads = config.num_attention_heads
num_kv_heads = config.num_key_value_heads
head_dim = config.head_dim

# Create a random input tensor
x = torch.randn(batch_size, seq_len, hidden_size)

gate_proj = Linear(hidden_size, intermediate_size)
up_proj = Linear(hidden_size, intermediate_size)
down_proj = Linear(intermediate_size, hidden_size)
# Initialize the GemmaMLP
mlp = GemmaMLP(gate_proj=gate_proj, up_proj=up_proj, down_proj=down_proj)

# Apply the MLP to the input tensor
output = mlp(x)

# Print the shapes for verification
print("Input shape:", x.shape)  # Expected: (2, 4, 8)
print("Output shape:", output.shape)  # Expected: (2, 4, 8)

Input shape: torch.Size([2, 8, 2304])
Output shape: torch.Size([2, 8, 2304])


In [6]:
# Define dimensions
batch_size = 2
seq_len = 4
dim = 8  # Embedding dimension

# Create a random input tensor
x = torch.randn(batch_size, seq_len, dim)

# Initialize RMSNorm
rms_norm = RMSNorm(dim=dim, eps=1e-6, add_unit_offset=True)

# Apply RMSNorm to the input tensor
normalized_x = rms_norm(x)

# Print the shapes for verification
print("Input shape:", x.shape)  # Expected: (2, 4, 8)
print("Output shape:", normalized_x.shape)  # Expected: (2, 4, 8)

# Verify that the mean square of the normalized output is close to 1
print("Mean square of normalized output:", normalized_x.pow(2).mean(-1))

Input shape: torch.Size([2, 4, 8])
Output shape: torch.Size([2, 4, 8])
Mean square of normalized output: tensor([[1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<MeanBackward1>)


In [7]:
embedding = Embedding(10, 10)
linear = Linear(10, 10)
sampler = Sampler(10)

embedding, linear, sampler

(Embedding(), Linear(), Sampler())

In [8]:
in_features = 4  
out_features = 2  
linear = Linear(in_features, out_features) 
linear.weight.data = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.float32)

example_x = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.float32)
example_y = linear(example_x)
print(example_y)

tensor([[ 30.,  70.],
        [ 70., 174.]])


In [10]:
# 1. Basit Kullanım Örneği  
def simple_example():  
    # Model oluşturma  
    in_features = 4  
    out_features = 2  
    linear = Linear(in_features, out_features)  
  
    
    # Forward pass  
    input_tensor = torch.randn(3, in_features)  # 3 örnek, her biri 4 özellikli  
    output = linear(input_tensor)  
    
    print("Giriş şekli:", input_tensor.shape)  
    print("Çıkış şekli:", output.shape)  
    print("Ağırlık şekli:", linear.weight.shape)  
    
    return linear, input_tensor, output 

# Örnekleri çalıştırma  
print("Basit Örnek:")  
linear, inputs, outputs = simple_example()  

Basit Örnek:
Giriş şekli: torch.Size([3, 4])
Çıkış şekli: torch.Size([3, 2])
Ağırlık şekli: torch.Size([2, 4])


In [11]:


def example_usage():  
    # Parametreleri belirleme  
    batch_size = 2  
    seq_length = 16  
    num_heads = 4  
    head_dim = 8  # Her head için boyut  
    
    # 1. Frekansları hesaplama  
    # Not: head_dim'in yarısını kullanıyoruz çünkü kompleks sayılarla çalışıyoruz  
    freqs_cis = precompute_freqs_cis(dim=head_dim, end=seq_length)  
    print("Frekans şekli:", freqs_cis.shape)  
    
    # 2. Örnek query tensörü oluşturma  
    query = torch.randn(batch_size, num_heads, seq_length, head_dim)  
    print("Query şekli:", query.shape)  
    
    # 3. Rotary embedding uygulama  
    query_rotary = apply_rotary_emb(query, freqs_cis)  
    print("Rotary sonrası query şekli:", query_rotary.shape)  
    
    return query, query_rotary, freqs_cis  

# Örneği çalıştırma  
query, query_rotary, freqs = example_usage()  

# Sonuçları kontrol etme  
print("\nÖrnek değerler:")  
print("İlk query değerleri:", query[0, 0, 0, :5])  
print("Rotary sonrası ilk değerler:", query_rotary[0, 0, 0, :5])  
print("İlk frekans değerleri:", freqs[0, :5])

Frekans şekli: torch.Size([16, 4])
Query şekli: torch.Size([2, 4, 16, 8])


RuntimeError: The size of tensor a (4) must match the size of tensor b (16) at non-singleton dimension 2

In [12]:

# Define dimensions
batch_size = 2
seq_len = 4
dim = 8  # Must be even for rotary embeddings

# Precompute rotary embeddings
freqs_cis = precompute_freqs_cis(dim=dim, end=seq_len)

# Create a dummy input tensor
x = torch.randn(batch_size, seq_len, dim)

# Apply rotary embeddings
x_with_rotary = apply_rotary_emb(x, freqs_cis)

# Print shapes for validation
print("Input shape:", x.shape)  # Expected: (2, 4, 8)
print("Output shape:", x_with_rotary.shape)  # Expected: (2, 4, 8)

RuntimeError: The size of tensor a (2) must match the size of tensor b (4) at non-singleton dimension 2

In [16]:

# Example vocabulary
vocab = {0: "<pad>", 1: "hello", 2: "world", 3: "I", 4: "am", 5: "a", 6: "token", 7: "example", 8: "for", 9: "you"}

# Define input parameters
vocab_size = len(vocab)
batch_size = 2
seq_len = 3  # Sequence length for hidden_states
hidden_size = 8
temperature_value = 0.8
top_p_value = 0.9
top_k_value = 3

# Create inputs
embedding = torch.randn(vocab_size, hidden_size)  # (vocab_size, hidden_size)
hidden_states = torch.randn(batch_size, seq_len, hidden_size)  # (batch_size, seq_len, hidden_size)
output_positions = torch.tensor([i % seq_len for i in range(batch_size)])  # (batch_size,)
temperatures = torch.tensor([temperature_value] * batch_size)  # (batch_size,)
top_ps = torch.tensor([top_p_value] * batch_size)  # (batch_size,)
top_ks = torch.tensor([top_k_value] * batch_size)  # (batch_size,)
embedding_bias = torch.zeros(vocab_size)  # (vocab_size,)

# Initialize the Sampler and call forward
example = Sampler(vocab_size)
next_token_ids, logits = example.forward(
    embedding=embedding,
    hidden_states=hidden_states,
    output_positions=output_positions,
    temperatures=temperatures,
    top_ps=top_ps,
    top_ks=top_ks,
    embedding_bias=embedding_bias
)

# Print next tokens
next_tokens = [vocab[token_id.item()] for token_id in next_token_ids]
print("Next tokens:", next_tokens)

Next tokens: ['hello', '<pad>']
