In [1]:
from google.colab import userdata
import os
# Copy the HF_TOKEN value held by Colab's `userdata` into the process environment
# under the same name.
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [2]:
# Base checkpoint name: prefixed with "unsloth/" when loading and reused as the stem
# of every local output directory in this notebook.
LM_MODEL = "gemma-3-270m-it"
# Hugging Face Hub repository that the tokenizer, model and GGUF files are pushed to.
HF_MODEL_REPO = "mayurmadnani/gemma-3-270m-microfables"

## Model

In [3]:
%%capture
# Install Unsloth and its dependencies; %%capture hides the installer output.
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # No COLAB_* environment variable present: plain install.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The xformers pin is chosen from the leading numeric part of the installed torch version.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# These two pins run in both branches, after the installs above.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [4]:
from unsloth import FastModel
import torch
# Sequence-length limit handed to the loader below.
max_seq_length = 2048

# Load the base checkpoint and its tokenizer. Both quantized loading modes and
# full fine-tuning are switched off.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/" + LM_MODEL,
    max_seq_length=max_seq_length,
    full_finetuning=False,
    load_in_8bit=False,
    load_in_4bit=False,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.12: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

In [5]:
# Display the module tree of the freshly loaded base model.
model

Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((640,), eps

In [6]:
# Save the model to "<LM_MODEL>-base" before any LoRA adapters are attached.
model.save_pretrained(f"{LM_MODEL}-base")

In [7]:
# Wrap the base model with LoRA adapters on every attention and MLP projection.
model = FastModel.get_peft_model(
    model,
    # Adapter size: rank and alpha are both 128 (any rank > 0 works; 8-128 are typical).
    r=128,
    lora_alpha=128,
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections
        "gate_proj", "up_proj", "down_proj",
    ],
    # 0 dropout and no bias are the settings Unsloth optimizes for.
    lora_dropout=0,
    bias="none",
    # "unsloth" checkpointing trades compute for lower VRAM use (True also works).
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    # Rank-stabilized LoRA and LoftQ are both left off.
    use_rslora=False,
    loftq_config=None,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [8]:
# Display the model again to inspect the LoRA-wrapped layers.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=640, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=640, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
             

## Tokenizer

In [9]:
from unsloth.chat_templates import get_chat_template
# Rebind the tokenizer to the version carrying the "gemma3" chat template.
tokenizer = get_chat_template(tokenizer, chat_template="gemma3")

In [10]:
# Print the Jinja chat template currently attached to the tokenizer.
print(tokenizer.get_chat_template())

{{ bos_token }}
{%- if messages[0]['role'] == 'system' -%}
    {%- if messages[0]['content'] is string -%}
        {%- set first_user_prefix = messages[0]['content'] + '

' -%}
    {%- else -%}
        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '

' -%}
    {%- endif -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {%- set first_user_prefix = "" -%}
    {%- set loop_messages = messages -%}
{%- endif -%}
{%- for message in loop_messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
    {%- endif -%}
    {%- if (message['role'] == 'assistant') -%}
        {%- set role = "model" -%}
    {%- else -%}
        {%- set role = message['role'] -%}
    {%- endif -%}
    {{ '<start_of_turn>' + role + '
' + (first_user_prefix if loop.first else "") }}
    {%- if message['content'] is string -%}
        {{ message['content'] | trim }}


In [11]:
from datasets import load_dataset
# Read the local JSON Lines file with the "json" builder; rows land in the "train" split.
dataset = load_dataset("json", data_files= "stories_dataset.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
def convert_to_chatml(example):
    """Wrap one instruction/response record as a two-turn conversation.

    Args:
        example: Mapping with an ``"instruction"`` and a ``"response"`` entry.

    Returns:
        A dict with a single ``"conversations"`` key holding a user message
        (the instruction) followed by an assistant message (the response).
    """
    user_turn = dict(role="user", content=example["instruction"])
    assistant_turn = dict(role="assistant", content=example["response"])
    return dict(conversations=[user_turn, assistant_turn])

# Add a "conversations" column to every row, one example at a time.
dataset = dataset.map(convert_to_chatml)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
# Spot-check one row: the original fields plus the new "conversations" column.
dataset["train"][10]

{'instruction': 'Write a story about a turtle who wins a race against a rabbit.',
 'response': 'Long ago, a fast rabbit laughed at a slow turtle. They raced on a sunny morning. The rabbit ran far ahead and stopped to nap. The turtle kept walking, step by step. At the finish line, the turtle crossed first. The rabbit woke too late. The turtle smiled, proud of patience and steady steps.',
 'conversations': [{'content': 'Write a story about a turtle who wins a race against a rabbit.',
   'role': 'user'},
  {'content': 'Long ago, a fast rabbit laughed at a slow turtle. They raced on a sunny morning. The rabbit ran far ahead and stopped to nap. The turtle kept walking, step by step. At the finish line, the turtle crossed first. The rabbit woke too late. The turtle smiled, proud of patience and steady steps.',
   'role': 'assistant'}]}

In [14]:
def formatting_prompts_func(examples):
    """Render every conversation in a batch to chat-template text.

    Args:
        examples: Batched mapping whose ``"conversations"`` entry is a list of
            message lists.

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation. No
        generation prompt is appended and a leading ``<bos>`` literal is removed.
    """
    texts = []
    for convo in examples["conversations"]:
        rendered = tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(rendered.removeprefix('<bos>'))
    return {"text": texts}

# Batched map: the function receives columns of lists rather than single rows.
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [15]:
# Print the rendered chat text for one row to check the turn markers.
print(dataset["train"][10]['text'])

<start_of_turn>user
Write a story about a turtle who wins a race against a rabbit.<end_of_turn>
<start_of_turn>model
Long ago, a fast rabbit laughed at a slow turtle. They raced on a sunny morning. The rabbit ran far ahead and stopped to nap. The turtle kept walking, step by step. At the finish line, the turtle crossed first. The rabbit woke too late. The turtle smiled, proud of patience and steady steps.<end_of_turn>



In [16]:
# Save the tokenizer into the same "<LM_MODEL>-base" directory used for the base model.
tokenizer.save_pretrained(f"{LM_MODEL}-base")

('gemma-3-270m-it-base/tokenizer_config.json',
 'gemma-3-270m-it-base/special_tokens_map.json',
 'gemma-3-270m-it-base/chat_template.jinja',
 'gemma-3-270m-it-base/tokenizer.model',
 'gemma-3-270m-it-base/added_tokens.json',
 'gemma-3-270m-it-base/tokenizer.json')

## Fine-Tune

In [17]:
from trl import SFTTrainer, SFTConfig
# Supervised fine-tuning over the rendered "text" column; no evaluation split is used.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=None,
    args=SFTConfig(
        # Data
        dataset_text_field="text",
        # Batching: 8 sequences per device, no gradient accumulation
        # (raise the accumulation steps to mimic a larger batch).
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        # Schedule: 10 epochs with 5 warmup steps and a linear scheduler.
        num_train_epochs=10,
        warmup_steps=5,
        lr_scheduler_type="linear",
        # Optimizer
        optim="adamw_8bit",
        learning_rate=5e-5,
        weight_decay=0.01,
        # Bookkeeping: log every step, no external reporting (e.g. WandB).
        seed=3407,
        logging_steps=1,
        output_dir="outputs",
        report_to="none",
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/100 [00:00<?, ? examples/s]

In [18]:
# Inspect one row of the dataset as held by the trainer.
trainer.train_dataset[10]

{'instruction': 'Write a story about a turtle who wins a race against a rabbit.',
 'response': 'Long ago, a fast rabbit laughed at a slow turtle. They raced on a sunny morning. The rabbit ran far ahead and stopped to nap. The turtle kept walking, step by step. At the finish line, the turtle crossed first. The rabbit woke too late. The turtle smiled, proud of patience and steady steps.',
 'conversations': [{'content': 'Write a story about a turtle who wins a race against a rabbit.',
   'role': 'user'},
  {'content': 'Long ago, a fast rabbit laughed at a slow turtle. They raced on a sunny morning. The rabbit ran far ahead and stopped to nap. The turtle kept walking, step by step. At the finish line, the turtle crossed first. The rabbit woke too late. The turtle smiled, proud of patience and steady steps.',
   'role': 'assistant'}],
 'text': '<start_of_turn>user\nWrite a story about a turtle who wins a race against a rabbit.<end_of_turn>\n<start_of_turn>model\nLong ago, a fast rabbit laug

In [19]:
# Decode the row's input_ids back to text to see exactly what the trainer tokenized.
print(tokenizer.decode(trainer.train_dataset[10]["input_ids"]))

<bos><start_of_turn>user
Write a story about a turtle who wins a race against a rabbit.<end_of_turn>
<start_of_turn>model
Long ago, a fast rabbit laughed at a slow turtle. They raced on a sunny morning. The rabbit ran far ahead and stopped to nap. The turtle kept walking, step by step. At the finish line, the turtle crossed first. The rabbit woke too late. The turtle smiled, proud of patience and steady steps.<end_of_turn>



In [20]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer, passing the turn markers that open the user part and the
# model part of each example.
trainer = train_on_responses_only(
    trainer,
    response_part="<start_of_turn>model\n",
    instruction_part="<start_of_turn>user\n",
)

Map (num_proc=6):   0%|          | 0/100 [00:00<?, ? examples/s]

In [21]:
# Visualize the label mask: every -100 label is swapped for the pad token, the ids are
# decoded, and the pad token text is then replaced by a space.
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[10]["labels"]]).replace(tokenizer.pad_token, " ")

'                       Long ago, a fast rabbit laughed at a slow turtle. They raced on a sunny morning. The rabbit ran far ahead and stopped to nap. The turtle kept walking, step by step. At the finish line, the turtle crossed first. The rabbit woke too late. The turtle smiled, proud of patience and steady steps.<end_of_turn>\n'

In [22]:
# Run training and keep whatever train() returns for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 10 | Total steps = 130
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 30,375,936 of 298,474,112 (10.18% trained)


Step,Training Loss
1,4.3947
2,4.2789
3,3.9427
4,3.3882
5,2.9871
6,2.6287
7,2.8352
8,2.7396
9,2.6179
10,2.4539


## Generation

In [23]:
def gen(user_prompt, max_new_tokens = 125):
  """Stream a sampled completion for a single user prompt.

  The prompt is wrapped as a one-message chat, rendered with the chat template
  (generation prompt included), tokenized, and passed to ``model.generate``.
  New tokens are echoed as they are produced via a ``TextStreamer``.

  Args:
    user_prompt: Text of the user turn.
    max_new_tokens: Upper bound on generated tokens (default 125).

  Returns:
    Whatever ``model.generate`` returns for the single-prompt batch.
  """
  from transformers import TextStreamer

  messages = [
      {"role" : 'user', 'content' : user_prompt}
  ]
  # The rendered template starts with '<bos>' and the tokenizer call below adds
  # its own, so the literal is stripped here to avoid a doubled BOS token.
  text = tokenizer.apply_chat_template(
      messages,
      tokenize = False,
      add_generation_prompt = True, # Must add for generation
  ).removeprefix('<bos>')

  # Follow the model's device instead of assuming "cuda".
  inputs = tokenizer(text, return_tensors = "pt").to(model.device)

  return model.generate(
      **inputs,
      max_new_tokens = max_new_tokens,
      # Explicit so temperature/top_p/top_k take effect even if the checkpoint's
      # generation_config does not enable sampling.
      do_sample = True,
      temperature = 1, top_p = 0.95, top_k = 64,
      streamer = TextStreamer(tokenizer, skip_prompt = True),
  )

In [24]:
# Index the row first so only one example is materialized, not the whole
# "conversations" column; then take the user turn's text as the prompt.
token_ids = gen(dataset['train'][10]['conversations'][0]['content'])

A slow turtle hopped slowly and waited for the race. Suddenly, the rabbit ran in front of it, leaping high! The turtle froze, afraid it would fall. The rabbit cleared its throat and said, ‘No.’ The turtle cheered, proud of patience and slow steps.<end_of_turn>


In [25]:
# Decode the first returned sequence in full (prompt and completion, special tokens kept).
print(tokenizer.decode(token_ids[0]))

<bos><start_of_turn>user
Write a story about a turtle who wins a race against a rabbit.<end_of_turn>
<start_of_turn>model
A slow turtle hopped slowly and waited for the race. Suddenly, the rabbit ran in front of it, leaping high! The turtle froze, afraid it would fall. The rabbit cleared its throat and said, ‘No.’ The turtle cheered, proud of patience and slow steps.<end_of_turn>


## Model Artefacts

In [26]:
# Save the fine-tuned PEFT model to "<LM_MODEL>-microfables".
model.save_pretrained(f"{LM_MODEL}-microfables")

In [27]:
# Unsloth export of model + tokenizer with save_method="lora".
# NOTE(review): this targets the same "<LM_MODEL>-microfables" directory as the plain
# save_pretrained call in the previous cell — confirm both sets of files are meant to
# share one folder.
model.save_pretrained_merged(f"{LM_MODEL}-microfables", tokenizer=tokenizer, save_method="lora")

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `gemma-3-270m-it-microfables`: 100%|██████████| 1/1 [00:01<00:00,  1.58s/it]


Successfully copied all 1 files from cache to `gemma-3-270m-it-microfables`
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `gemma-3-270m-it-microfables`: 100%|██████████| 1/1 [00:00<00:00, 59.65it/s]


Successfully copied all 1 files from cache to `gemma-3-270m-it-microfables`


Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:00<00:00, 11814.94it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:04<00:00,  4.45s/it]


Unsloth: Merge process complete. Saved to `/content/gemma-3-270m-it-microfables`


In [28]:
# Export a GGUF build with q8_0 quantization into "<LM_MODEL>-microfables-gguf".
model.save_pretrained_gguf(f"{LM_MODEL}-microfables-gguf", tokenizer, quantization_method = "q8_0")

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging model weights to 16-bit format...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `gemma-3-270m-it-microfables-gguf`: 100%|██████████| 1/1 [00:01<00:00,  1.55s/it]


Successfully copied all 1 files from cache to `gemma-3-270m-it-microfables-gguf`
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `gemma-3-270m-it-microfables-gguf`: 100%|██████████| 1/1 [00:00<00:00, 120.68it/s]


Successfully copied all 1 files from cache to `gemma-3-270m-it-microfables-gguf`


Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:00<00:00, 13189.64it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:04<00:00,  4.43s/it]


Unsloth: Merge process complete. Saved to `/content/gemma-3-270m-it-microfables-gguf`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['gemma-3-270m-it.F16.

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Model files cleanup...
Unsloth: All GGUF conversions completed successfully!
Generated files: ['gemma-3-270m-it.Q8_0.gguf']
Unsloth: example usage for text only LLMs: llama-cli --model gemma-3-270m-it.Q8_0.gguf -p "why is the sky blue?"
Unsloth: Saved Ollama Modelfile to current directory
Unsloth: convert model to ollama format by running - ollama create model_name -f ./Modelfile - inside current directory.


{'save_directory': 'gemma-3-270m-it-microfables-gguf',
 'gguf_files': ['gemma-3-270m-it.Q8_0.gguf'],
 'modelfile_location': '/content/Modelfile',
 'want_full_precision': False,
 'is_vlm': False,
 'fix_bos_token': True}

In [29]:
# Upload the tokenizer files to the Hub repository named in HF_MODEL_REPO.
tokenizer.push_to_hub(HF_MODEL_REPO)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mp2q_z48hq/tokenizer.json:  75%|#######5  | 25.2MB / 33.4MB            

  ...p2q_z48hq/tokenizer.model: 100%|##########| 4.69MB / 4.69MB            

In [30]:
# Upload the model to the same Hub repository as the tokenizer.
model.push_to_hub(HF_MODEL_REPO)

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:  41%|####1     | 50.3MB /  122MB            

Saved model to https://huggingface.co/mayurmadnani/gemma-3-270m-microfables


In [31]:
# Convert to GGUF with Q8_0 quantization and upload it to the same Hub repository.
model.push_to_hub_gguf(HF_MODEL_REPO, tokenizer, quantization_method = "Q8_0")

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Converting model to GGUF format...
Unsloth: Merging model weights to 16-bit format...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `/tmp/unsloth_gguf_mjin8rn9`: 100%|██████████| 1/1 [00:06<00:00,  6.72s/it]


Successfully copied all 1 files from cache to `/tmp/unsloth_gguf_mjin8rn9`
Checking cache directory for required files...


Unsloth: Copying 1 files from cache to `/tmp/unsloth_gguf_mjin8rn9`: 100%|██████████| 1/1 [00:00<00:00, 165.89it/s]


Successfully copied all 1 files from cache to `/tmp/unsloth_gguf_mjin8rn9`


Unsloth: Preparing safetensor model files: 100%|██████████| 1/1 [00:00<00:00, 11155.06it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [00:27<00:00, 27.20s/it]


Unsloth: Merge process complete. Saved to `/tmp/unsloth_gguf_mjin8rn9`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: llama.cpp found in the system. Skipping installation.
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['gemma-3-270m-it.F16.gguf']
Unsloth: [2] Converting GGUF f16 into q8_0. This might take 10 minutes...


Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Model files cleanup...
Unsloth: All GGUF conversions completed successfully!
Generated files: ['gemma-3-270m-it.Q8_0.gguf']
Unsloth: example usage for text only LLMs: llama-cli --model gemma-3-270m-it.Q8_0.gguf -p "why is the sky blue?"
Unsloth: Saved Ollama Modelfile to current directory
Unsloth: convert model to ollama format by running - ollama create model_name -f ./Modelfile - inside current directory.
Unsloth: Uploading GGUF to Huggingface Hub...
Uploading gemma-3-270m-it.Q8_0.gguf...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  gemma-3-270m-it.Q8_0.gguf   :  17%|#7        | 50.3MB /  292MB            

Uploading config.json...
Uploading Ollama Modelfile...
Unsloth: Successfully uploaded GGUF to https://huggingface.co/mayurmadnani/gemma-3-270m-microfables
Unsloth: Cleaning up temporary files...


'mayurmadnani/gemma-3-270m-microfables'

In [32]:
# Merge the LoRA adapters into the base weights and keep the resulting model
# under a separate name.
merged_model = model.merge_and_unload()

In [33]:
# Write the merged model and the tokenizer side by side into "<LM_MODEL>-microfables-merged".
merged_model.save_pretrained(f"{LM_MODEL}-microfables-merged")
tokenizer.save_pretrained(f"{LM_MODEL}-microfables-merged")

('gemma-3-270m-it-microfables-merged/tokenizer_config.json',
 'gemma-3-270m-it-microfables-merged/special_tokens_map.json',
 'gemma-3-270m-it-microfables-merged/chat_template.jinja',
 'gemma-3-270m-it-microfables-merged/tokenizer.model',
 'gemma-3-270m-it-microfables-merged/added_tokens.json',
 'gemma-3-270m-it-microfables-merged/tokenizer.json')

## Serving

In [34]:
!curl https://ollama.ai/install.sh | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13281    0 13281    0     0  38419      0 --:--:-- --:--:-- --:--:-- 38384
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [35]:
import subprocess
import time
import urllib.request

# Start ollama serve in the background
ollama_process = subprocess.Popen(['ollama', 'serve'])

# Wait until the HTTP API actually answers instead of sleeping a fixed 5 seconds:
# a slow start no longer races the following cells, and a server that never comes
# up fails here with a clear error rather than later in `ollama create`.
OLLAMA_VERSION_URL = "http://127.0.0.1:11434/api/version"
OLLAMA_STARTUP_TIMEOUT_S = 60
startup_deadline = time.monotonic() + OLLAMA_STARTUP_TIMEOUT_S
while True:
    try:
        with urllib.request.urlopen(OLLAMA_VERSION_URL, timeout = 2):
            break  # API is reachable
    except OSError:  # URLError and socket timeouts are both OSError subclasses
        # The endpoint is probed before the process state is checked, so a server
        # that is already listening (e.g. on a cell re-run) still counts as ready.
        if ollama_process.poll() is not None:
            raise RuntimeError(
                f"`ollama serve` exited with code {ollama_process.returncode} before the API came up"
            ) from None
        if time.monotonic() >= startup_deadline:
            raise TimeoutError(
                f"Ollama API did not respond within {OLLAMA_STARTUP_TIMEOUT_S} seconds"
            ) from None
        time.sleep(0.5)

In [36]:
# Register a local Ollama model named gemma-3-270m-microfables from ./Modelfile.
!ollama create gemma-3-270m-microfables -f ./Modelfile

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[A[1G[?25h[?2026l[?2026h[?25l[A[A[1G[?25h[?2026l


In [37]:
# List local Ollama models to confirm the new one is present.
!ollama ls

NAME                               ID              SIZE      MODIFIED               
gemma-3-270m-microfables:latest    2aac8b44638a    291 MB    Less than a second ago    


In [38]:
# Smoke test: send one non-streaming generate request to the local Ollama API.
!curl http://localhost:11434/api/generate -d '{ \
  "model": "gemma-3-270m-microfables", \
  "prompt": "Write a short story about a brave knight.", \
  "stream": false \
}'

{"model":"gemma-3-270m-microfables","created_at":"2025-10-31T18:05:23.502822654Z","response":"A knight rode into battle, bold and strong. His armor gleamed under the sun. The king raised his sword, and the army cheered. The knight faced down an enemy king, his courage shining like the morning sun. His bravery saved lives and brought peace to all.","done":true,"done_reason":"stop","context":[105,2364,107,6974,496,2822,3925,1003,496,36711,52482,236761,106,107,105,4368,107,107,236776,52482,38965,1131,10041,236764,16627,532,3188,236761,4923,38119,20508,15608,1208,506,3768,236761,669,9615,8675,914,26114,236764,532,506,14093,120899,236761,669,52482,17175,1679,614,13550,9615,236764,914,23648,36489,1133,506,5597,3768,236761,4923,101666,10683,6176,532,6111,8118,531,784,236761],"total_duration":2695001066,"load_duration":2172676571,"prompt_eval_count":19,"prompt_eval_duration":14644714,"eval_count":56,"eval_duration":372159392}

In [39]:
# Copy the local model to the namespaced name that the push cell uses.
!ollama cp gemma-3-270m-microfables mayurmadnani/gemma-3-270m-microfables

copied 'gemma-3-270m-microfables' to 'mayurmadnani/gemma-3-270m-microfables'


In [41]:
# Push the namespaced copy created by `ollama cp`.
!ollama push mayurmadnani/gemma-3-270m-microfables

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h