In [10]:
%%capture
# Install Unsloth and the packages it needs; the %%capture cell magic above
# captures this cell's output so the pip logs are not shown.
import os, re
# Branch on whether any environment variable name contains "COLAB_"
# (presumably this identifies a Google Colab runtime -- confirm if it matters).
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth  # Do this in local & cloud setups
else:
    # Keep only the leading "major.minor" part of the installed torch version.
    import torch; v = re.match(r'[\d]{1,}\.[\d]{1,}', str(torch.__version__)).group(0)
    # Pick the xformers pin for that torch version; versions missing from the
    # table fall back to 0.0.33.post1.
    xformers = 'xformers==' + {'2.9':'0.0.33.post1','2.8':'0.0.32.post2'}.get(v, "0.0.33.post1")
    # First pip call resolves dependencies normally.
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    # Second pip call uses --no-deps; {xformers} is interpolated from the
    # Python variable assigned above.
    !pip install --no-deps unsloth_zoo bitsandbytes accelerate {xformers} peft trl triton unsloth
# Runs in both branches: pin transformers, then pin trl without its dependencies.
!pip install transformers==4.56.2 && pip install --no-deps trl==0.22.2

Unsloth

In [11]:
from unsloth import FastSentenceTransformer

# Catalogue of embedding checkpoints hosted under the "unsloth" namespace;
# the notebook loads "unsloth/embeddinggemma-300m" from this list below.
# NOTE(review): the variable name says "fourbit", but nothing in this cell
# shows the checkpoints are 4-bit -- the name is kept unchanged in case other
# cells refer to it.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    f"unsloth/{checkpoint}"
    for checkpoint in (
        "all-MiniLM-L6-v2",
        "embeddinggemma-300m",
        "Qwen3-Embedding-4B",
        "Qwen3-Embedding-0.6B",
        "all-mpnet-base-v2",
        "gte-modernbert-base",
        "bge-m3",
    )
]

# Load the EmbeddingGemma checkpoint through Unsloth's sentence-transformer
# loader and keep the result in `model` for the cells that follow.
model = FastSentenceTransformer.from_pretrained(
    full_finetuning = False,  # [NEW!] We have full finetuning now!
    max_seq_length = 1024,    # Choose any for long context!
    model_name = "unsloth/embeddinggemma-300m",
)

==((====))==  Unsloth 2026.1.4: Fast Gemma3 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.


In [12]:
# Show the sentence-transformer pipeline as a list of numbered stages.
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 1024, 'do_lower_case': False, 'architecture': 'Gemma3TextModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 3072, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (3): Dense({'in_features': 3072, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (4): Normalize()
)


In [13]:
# Stage 0 of the pipeline wraps the underlying transformer; print its layer tree.
print(model[0].auto_model)

Gemma3TextModel(
  (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 768, padding_idx=0)
  (layers): ModuleList(
    (0-23): 24 x Gemma3DecoderLayer(
      (self_attn): Gemma3Attention(
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (k_proj): Linear(in_features=768, out_features=256, bias=False)
        (v_proj): Linear(in_features=768, out_features=256, bias=False)
        (o_proj): Linear(in_features=768, out_features=768, bias=False)
        (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
        (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
      )
      (mlp): Gemma3MLP(
        (gate_proj): Linear(in_features=768, out_features=1152, bias=False)
        (up_proj): Linear(in_features=768, out_features=1152, bias=False)
        (down_proj): Linear(in_features=1152, out_features=768, bias=False)
        (act_fn): PytorchGELUTanh()
      )
      (input_layernorm): Gemma3RMSNorm((768,), eps=1e-06)
      (post_attention_layernorm): Gemma3RMSNorm((768,), eps=

## Summary:

### Q&A
The `unsloth/embeddinggemma-300m` model processes an input sentence by first tokenizing it and converting tokens into initial 768-dimensional embeddings using the `Gemma3TextScaledWordEmbedding` layer. These embeddings are then refined through 24 `Gemma3DecoderLayer` blocks, which include `Gemma3Attention` for contextual understanding, `Gemma3MLP` for non-linear transformations, `Gemma3RMSNorm` for stabilization, and `Gemma3RotaryEmbedding` for positional information. The sequence of token embeddings is then condensed into a single 768-dimensional sentence embedding by a `Pooling` layer using mean pooling. This pooled embedding undergoes further linear transformations by two `Dense` layers, expanding to 3072 dimensions and then contracting back to 768 dimensions. Finally, a `Normalize` layer applies L2 normalization to the embedding. The final output embedding is a 768-dimensional, L2-normalized vector that represents the semantic meaning of the entire input sentence, optimized for robust similarity comparisons.

### Data Analysis Key Findings

*   **Input Embedding:** The `Gemma3TextScaledWordEmbedding` layer converts tokens into initial 768-dimensional embeddings, supporting a vocabulary of 262,144 unique tokens.
*   **Transformer Architecture:** The model employs 24 `Gemma3DecoderLayer` blocks for processing the sequence.
    *   **`Gemma3Attention`:** Uses Query, Key, and Value projections: the Query projection maps 768 features to 768, while the Key and Value projections each map 768 features to 256. A 256-dimensional `Gemma3RMSNorm` is applied to the Query and Key projections.
    *   **`Gemma3MLP`:** Expands the feature space from 768 to 1152 dimensions (via `gate_proj` and `up_proj`) and then contracts it back to 768 dimensions (via `down_proj`), incorporating `PytorchGELUTanh()` as an activation function.
    *   **`Gemma3RMSNorm`:** Multiple 768-dimensional `Gemma3RMSNorm` layers are strategically placed within `Gemma3DecoderLayer` blocks to stabilize training.
    *   **`Gemma3RotaryEmbedding`:** Injects positional information into the attention mechanism using rotation matrices, aiding in understanding sequential order.
*   **Pooling:** The `Pooling` layer utilizes `pooling_mode_mean_tokens` to calculate the element-wise average of all token embeddings, resulting in a single 768-dimensional sentence embedding.
*   **Dense Layers:** Two `Dense` layers refine the pooled embedding:
    *   The first expands the 768-dimensional input to 3072 dimensions.
    *   The second contracts the 3072-dimensional output back to 768 dimensions. Both layers perform purely linear transformations (`bias: False`, `activation_function: 'torch.nn.modules.linear.Identity'`).
*   **Normalization:** The final `Normalize` layer applies L2 normalization to the 768-dimensional embedding, scaling it to unit length. With unit-length vectors, a plain dot product equals cosine similarity, so similarity scores depend purely on vector direction, which simplifies and speeds up tasks like similarity search.

### Insights or Next Steps

*   The architecture's design, particularly the final L2 normalization, indicates a strong optimization for applications requiring robust vector similarity comparisons, such as semantic search, retrieval, and clustering.
*   The use of `Identity` activation in the `Dense` layers suggests that these stages primarily focus on linear projection and dimensionality manipulation to refine the embedding space, rather than introducing additional non-linear complexities after the transformer layers.


We now add LoRA adapters so we only need to update a small number of parameters!

In [14]:
# Attach LoRA adapters to the loaded model; `model` is rebound to the result.
# NOTE(review): because the cell feeds `model` back into itself, re-running it
# passes in the already-adapted model -- reload the base model first (TODO
# confirm how get_peft_model treats a model that already has adapters).
model = FastSentenceTransformer.get_peft_model(
    model,
    task_type = "FEATURE_EXTRACTION",
    # Layers that receive adapters: the attention projections, then the MLP projections.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # Adapter hyperparameters.
    r = 32,            # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    lora_alpha = 64,
    lora_dropout = 0,  # Supports any, but = 0 is optimized
    bias = "none",     # Supports any, but = "none" is optimized
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
)

Unsloth: Making `model.base_model.model` require gradients


In [16]:
# Print the transformer again, now that the LoRA adapters have been attached,
# to inspect how the targeted projection layers were wrapped.
print(model[0].auto_model)

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): Gemma3TextModel(
      (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 768, padding_idx=0)
      (layers): ModuleList(
        (0-23): 24 x Gemma3DecoderLayer(
          (self_attn): Gemma3Attention(
            (q_proj): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=False)
              (lora_dropout): ModuleDict(
                (default): Identity()
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=32, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=32, out_features=768, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
              (lora_magnitude_vector): ModuleDict()
            )
            (k_proj): lora.Linear(
              (base_layer): Linear(in_feature