In [1]:
# ===== Colab 5: Continued Pretraining (New "Language") with Unsloth =====

!pip install -q "unsloth>=2025.3.0" "unsloth_zoo" "trl>=0.9.6" "datasets" "accelerate" "transformers>=4.44.0"

from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
import torch

# 1) Tiny "new language" / domain corpus
corpus = [
    {"text": "Nikhilang phrase: zomu kaya linor. It means: hello friend."},
    {"text": "Nikhilang phrase: tara novi lumek. It means: I love machine learning."},
    {"text": "Nikhilang phrase: sumi dera plon. It means: this is a brand new language."},
    {"text": "Nikhilang phrase: veri tomo qel. It means: have a wonderful day."},
]

dataset = Dataset.from_list(corpus)
max_seq_length = 256

# 2) Load base model with QLoRA-style PEFT for continued pretraining
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = "unsloth/SmolLM2-135M-Instruct",  # public, no key
    max_seq_length  = max_seq_length,
    load_in_4bit    = True,       # 4-bit + LoRA (cheap)
    full_finetuning = False,      # parameter-efficient CPT
    dtype           = None,
)

# Add LoRA adapters (let Unsloth decide target modules)
model = FastLanguageModel.get_peft_model(
    model,
    r                           = 16,
    lora_alpha                  = 16,
    lora_dropout                = 0,
    bias                        = "none",
    use_gradient_checkpointing  = "unsloth",
    max_seq_length              = max_seq_length,
)

# 3) SFTConfig used as "continued pretraining" on raw text
training_args = SFTConfig(
    output_dir                  = "smollm2_continued_pretrain",
    num_train_epochs            = 2,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1,
    learning_rate               = 2e-4,
    logging_steps               = 1,
    save_strategy               = "no",
    max_seq_length              = max_seq_length,
    report_to                   = "none",  # disable wandb
)

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = dataset,
    dataset_text_field = "text",   # raw text, no instruction tags required
    args               = training_args,
)

trainer.train()

# 4) Inference demo in the "new language"
model.eval()

prompt = "Nikhilang phrase: "
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens = 40,
        do_sample      = True,
        temperature    = 0.8,
        use_cache      = False,  # avoid KV cache dtype issues
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.3/64.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.7/358.7 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.4/284.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth 2025.11.4 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.
num_proc must be <= 4. Reducing num_proc to 4 for dataset of size 4.


Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4 | Num Epochs = 2 | Total steps = 8
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Step,Training Loss
1,6.0824
2,6.8074
3,6.5584
4,5.9625
5,6.4414
6,5.7482
7,6.379
8,5.5791


Nikhilang phrase: 
Your Honored Sir, 

The previous query for the following:

How do you suggest that I prepare for the following:

- In the coming years, I will be making
