In [1]:
%%capture
# Install the Unsloth fine-tuning stack. %%capture hides pip's output for this cell.
# The GPU/training libraries are installed with --no-deps so pip does not pull in
# (or replace) their dependencies; xformers is the only package pinned to a version.
# NOTE(review): the other packages are unpinned, so a fresh run may resolve different
# versions than the ones recorded in the outputs below — consider pinning them, and
# consider `%pip` instead of `!pip` so the install targets the kernel's environment.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3  peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
# These helper packages are installed normally, i.e. with their dependencies.
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
# Unsloth itself goes last, again without dependency resolution.
!pip install --no-deps unsloth

In [36]:
from unsloth import FastLanguageModel
import torch


# Load the Qwen3-0.6B checkpoint together with its tokenizer through Unsloth.
# The weights are loaded unquantized (load_in_4bit is off) and dtype is left as
# None rather than forcing a precision (presumably Unsloth then picks one itself —
# TODO confirm against the Unsloth docs).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen3-0.6B",
    load_in_4bit = False,    # no 4-bit quantization of the base weights
    dtype = None,            # no explicit dtype requested
    max_seq_length = 2048,   # same sequence length is passed to the trainer later
)

==((====))==  Unsloth 2025.5.9: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4070 SUPER. Num GPUs = 1. Max memory: 11.994 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [37]:
# Turn off the tokenizer's clean_up_tokenization_spaces option.
# NOTE(review): presumably this keeps decoded whitespace exactly as generated, which
# matters for ASCII art where spacing is part of the drawing — confirm intent.
tokenizer.clean_up_tokenization_spaces = False

In [38]:
# LoRA target selection.
# Parameter reference: https://huggingface.co/docs/peft/v0.11.0/en/package_reference/lora#peft.LoraConfig

# Switch this on when special tokens were added: "lm_head" is then adapted as well.
train_embeddings = False

# Names of the projection modules that receive LoRA adapters.
target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

if train_embeddings:
    target_modules.append("lm_head")

# Wrap the base model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    # --- where the adapters go ---
    target_modules = target_modules,  # modules of the LLM that get LoRA weights
    # --- adapter shape and scaling ---
    r = 16,              # rank of the LoRA matrices; per the paper a low rank loses little
    lora_alpha = 16,     # adapter scaling numerator (more influence on the base model when larger)
    use_rslora = False,  # rank-stabilized LoRA would scale by lora_alpha/sqrt(r) instead of lora_alpha/r
    loftq_config = None, # LoftQ initialisation not used
    # --- regularisation ---
    lora_dropout = 0,    # tutorial default was 0.05; Unsloth recommends 0
    bias = "none",       # "none" is the optimized setting
    # --- memory / reproducibility ---
    use_gradient_checkpointing = "unsloth",  # Unsloth's variant, meant to reduce VRAM for long context
    random_state = 3407,
)


In [39]:
# End-of-sequence marker taken from the tokenizer; it is appended to every training sample.
EOS_TOKEN = tokenizer.eos_token

# "Prompt-less" template: the drawing on its own, wrapped in a leading and trailing newline.
empty_prompt = "\n{ascii_art}\n"

def formatting_prompts_func_no_prompt(examples):
    """Build the training text for a batch of dataset rows.

    Args:
        examples: A batch mapping whose "ascii" entry is a sequence of
            ASCII-art strings (as supplied by `dataset.map(..., batched=True)`).

    Returns:
        A dict with a single "text" key holding one string per input drawing:
        the drawing rendered through `empty_prompt`, followed by `EOS_TOKEN`.
    """
    return {
        "text": [
            empty_prompt.format(ascii_art=art) + EOS_TOKEN
            for art in examples["ascii"]
        ],
    }


from datasets import load_dataset
# Fetch the train split of the ASCII-cats dataset and add the "text" column
# produced by formatting_prompts_func_no_prompt (applied batch-wise).
dataset = (
    load_dataset("pookie3000/ascii-cats", split = "train")
    .map(formatting_prompts_func_no_prompt, batched = True)
)

In [40]:
# Preview the first four formatted training samples (fewer if the dataset is smaller).
for sample_number, sample in enumerate(dataset, start=1):
    print(f"\n------ Sample {sample_number} ----")
    print(sample["text"])
    if sample_number >= 4:
        break


------ Sample 1 ----

    /\_/\           ___
   = o_o =_______    \ \ 
    __^      __(  \.__) )
(@)<_____>__(_____)____/
<|im_end|>

------ Sample 2 ----

|\---/|
| o_o |
 \_^_/
<|im_end|>

------ Sample 3 ----

 |\__/,|   (`\
 |_ _  |.--.) )
 ( T   )     /
(((^_(((/(((_/
<|im_end|>

------ Sample 4 ----

   |\---/|
   | ,_, |
    \_`_/-..----.
 ___/ `   ' ,""+ \  
(__...'   __\    |`.___.';
  (_,...'(_,.`__)/'.....+
<|im_end|>


In [41]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Hyper-parameters for the fine-tuning run.
args = TrainingArguments(
    # --- output / logging ---
    output_dir = "outputs",
    report_to = "none",     # no external experiment tracker
    logging_steps = 1,      # log the loss at every optimizer step
    # --- batching: 2 samples per device x 4 accumulated batches = 8 samples per step ---
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,  # a parameter update (== one step) happens every 4 batches
    # --- schedule ---
    num_train_epochs = 5,   # note: above the 1 - 3 range usually suggested to prevent overfitting
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    # --- precision: bf16 when the GPU supports it, otherwise fp16 ---
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    # --- optimizer ---
    optim = "adamw_8bit",
    weight_decay = 0.01,
    # --- reproducibility ---
    seed = 3407,
)

# Supervised fine-tuning trainer over the formatted dataset.
trainer = SFTTrainer(
    args = args,
    model = model,
    tokenizer = tokenizer,
    # --- data ---
    train_dataset = dataset,
    dataset_text_field = "text",  # column created by formatting_prompts_func_no_prompt
    dataset_num_proc = 2,         # worker processes for dataset preparation
    max_seq_length = 2048,        # same value as used when loading the model
)

In [42]:
# Run the fine-tuning loop and keep whatever train() returns in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 201 | Num Epochs = 5 | Total steps = 125
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 10,092,544/606,142,464 (1.67% trained)


Step,Training Loss
1,4.6684
2,4.5327
3,5.3933
4,4.7331
5,4.2324
6,4.7379
7,4.6123
8,4.494
9,4.0738
10,4.161


Inference

In [34]:
from transformers import TextStreamer

def generate_ascii_art(model):
    """Generate one ASCII drawing from a single-space prompt and stream it to stdout.

    The model is first put through Unsloth's `for_inference` switch. The prompt
    " " is tokenized and moved to the CUDA device, the encoded inputs are printed,
    and up to 100 new tokens are generated while a `TextStreamer` prints the
    decoded text as it is produced. Afterwards every generated sequence is
    printed as a raw tensor of token ids.

    Args:
        model: The (fine-tuned) model to generate with.

    Returns:
        None. All results are printed.

    Generation reference:
    https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/text_generation#transformers.GenerationMixin
    https://huggingface.co/docs/transformers/v4.49.0/en/main_classes/text_generation#transformers.GenerationConfig
    """
    FastLanguageModel.for_inference(model)

    prompt_inputs = tokenizer(" ", return_tensors = "pt").to("cuda")
    print(prompt_inputs)

    streamer = TextStreamer(tokenizer)
    generated = model.generate(**prompt_inputs, streamer = streamer, max_new_tokens = 100)

    # generate() returns a batch; each element is the token-id tensor of one sequence.
    for sequence in generated:
        print(sequence)

In [35]:
# Produce three drawings in a row with the current model.
for attempt in range(3):
    generate_ascii_art(model)

{'input_ids': tensor([[220]], device='cuda:0'), 'attention_mask': tensor([[1]], device='cuda:0')}
  /\_/\
// //   \   _/|
// ` ^   /  ((
))  `~'  `  \ \
\  ;   /    (  ((
 )  (_)/    (  ((
))) _ )  ((  ((
))) ) )  )  (  ((
))) ) ) )  )  (  ((
))) ) ) )  )  (  ((
))) ) ) )  )  (  ((
)))
tensor([  220, 23536, 50295,  5661,   322,   442,   256,  1124,   256,   716,
           14,  7360,   322,  1565,  6306,   256,   608,   220,   320,  1006,
          593,   220,  1565,    93,     6,   220,  1565,   220,  1124,  3044,
           59,   220,  2587,   256,   608,   262,   320,   220,   320,  1006,
          873,   220,  5453,  5620,   262,   320,   220,   320,  1006,  7705,
          716,   873,   220,  1781,   220,   320,  1006,  7705,   873,   873,
          220,   873,   220,   320,   220,   320,  1006,  7705,   873,   873,
          873,   220,   873,   220,   320,   220,   320,  1006,  7705,   873,
          873,   873,   220,   873,   220,   320,   220,   320,  1006,  7705,
          8