In [1]:
import os
# Weights & Biases settings, exported before the trainer (report_to = "wandb") starts a run.
os.environ.update({
    "WANDB_PROJECT": "qwen3-8b-recall-writer-0.1",  # W&B project this run is logged under
    "WANDB_LOG_MODEL": "false",  # "false": do not upload model checkpoints to W&B
})

In [2]:
from unsloth import FastLanguageModel
import torch

# 4-bit Qwen3 checkpoints, kept as a reference list only: nothing in the visible notebook
# reads `fourbit_models`; the checkpoint to load is named explicitly in the call below.
fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Smallest checkpoint in this list
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    "unsloth/Qwen3-8B-Base-unsloth-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# Load the 4-bit Qwen3-8B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    max_seq_length = 2048,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # 8bit disabled here; a bit more accurate, but uses 2x memory
    full_finetuning = False, # Full finetuning disabled; LoRA adapters are attached in the next cell
    # token = "hf_...",      # use one if using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-04 02:28:18 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.4.7: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.4.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 24.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
# Wrap the loaded model with LoRA adapters on the attention and MLP projections.
# The training log for this configuration reports 349,175,808 trainable parameters (4.36%).
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,           # LoRA rank. Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 128,  # Here alpha = rank. Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # Rank stabilized LoRA is supported but not used here
    loftq_config = None,  # LoftQ is supported but not used here
)

Unsloth 2025.4.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


<a name="Data"></a>
### Data Prep
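Qwen3 has both a reasoning and a non-reasoning mode, so in general we should use the 2 datasets listed below. Note that the code cell in this section does not load them: it fine-tunes on `ping98k/squad-qwq-recall-1k` only.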

1. We use the Open Math Reasoning dataset which was used to win the [AIMO](https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-2/leaderboard) (AI Mathematical Olympiad - Progress Prize 2) challenge! We sample 10% of verifiable reasoning traces that used DeepSeek R1, and which got > 95% accuracy.

2. We also leverage [Maxime Labonne's FineTome-100k](https://huggingface.co/datasets/mlabonne/FineTome-100k) dataset in ShareGPT style. But we need to convert it to HuggingFace's normal multiturn format as well.

In [4]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

# Dataset and split configuration for the recall-trace fine-tune.
REPO_ID = "ping98k/squad-qwq-recall-1k"  # Hugging Face dataset repo to load
TEST_SIZE = 108  # number of examples held out for evaluation
SEED = 3407  # seed shared by the shuffle and the train/test split

# User-turn template; {question} and {knowledge} are filled in with str.format.
USER_PROMPT = (
    "Convert the given question and knowledge into memory recall traces.\n"
    "\n"
    "[question]\n"
    "{question}\n"
    "[/question]\n"
    "\n"
    "[knowledge]\n"
    "{knowledge}\n"
    "[/knowledge]"
)


raw = load_dataset(REPO_ID, split="train")


def _iter_recall_rows(records):
    """Yield one flat row per recall sample whose knowledge and think are both non-empty."""
    for record in records:
        for item in record.get("recall", []):
            knowledge_text = item.get("knowledge", "").strip()
            think_text = item.get("think", "").strip()
            if not (knowledge_text and think_text):
                continue  # skip samples missing either field
            yield {
                "id": record["id"],
                "question": record["question"],
                "knowledge": knowledge_text,
                "think": think_text,
            }


# One row per (question, recall sample) pair instead of one row per question.
exploded_rows = list(_iter_recall_rows(raw))

exploded = Dataset.from_list(exploded_rows)

def to_conv_rows(example, prompt_template=None):
    """Turn one exploded recall row into a two-turn chat conversation.

    Args:
        example: Mapping with "question", "knowledge" and "think" string fields.
        prompt_template: Optional ``str.format`` template containing ``{question}``
            and ``{knowledge}`` placeholders. Defaults to the module-level
            ``USER_PROMPT`` when omitted.

    Returns:
        ``{"conversation": [user_message, assistant_message]}``. The assistant
        message is an empty ``<think>`` block followed by the recall trace.
    """
    template = USER_PROMPT if prompt_template is None else prompt_template
    # str.format inserts argument values verbatim and never re-parses them as
    # format fields, so the values must NOT be brace-escaped. The earlier
    # replace("{", "{{") / replace("}", "}}") leaked doubled braces into the
    # training text whenever a question or knowledge passage contained braces.
    problem = template.format(
        question=example["question"],
        knowledge=example["knowledge"],
    )
    solution = "<think>\n\n</think>\n\n" + example["think"]
    return {
        "conversation": [
            {"role": "user", "content": problem},
            {"role": "assistant", "content": solution}
        ]
    }

# Replace every exploded row with a single "conversation" column (all original columns dropped).
conv_ds = exploded.map(to_conv_rows, remove_columns=exploded.column_names)

# Render each conversation with the tokenizer's chat template; tokenize=False keeps the
# result as text, which is what the "text" column below stores.
reasoning_conversations = tokenizer.apply_chat_template(
    conv_ds["conversation"], tokenize=False
)

# Single-column text dataset, shuffled with the shared seed.
combined_dataset = Dataset.from_dict({"text": reasoning_conversations}).shuffle(SEED)

# Hold out TEST_SIZE examples for evaluation; the rest is used for training.
splits = combined_dataset.train_test_split(test_size=TEST_SIZE, seed=SEED)
train_dataset, test_dataset = splits["train"], splits["test"]


Map:   0%|          | 0/3922 [00:00<?, ? examples/s]

In [5]:
# Inspect one rendered training example to check the chat-template formatting.
train_dataset[0]

{'text': '<|im_start|>user\nConvert the given question and knowledge into memory recall traces.\n\n[question]\nWhat was one of the things the Federal Food, Drug and Cosmetic Act do?\n[/question]\n\n[knowledge]\nThe Federal Food, Drug, and Cosmetic Act (FD&C Act) of 1938 is a U.S. law that regulates foods, drugs, medical devices, cosmetics, and radiation-emitting products. Key provisions include:\n\n1. **Drug Regulation**: Requires drug manufacturers to prove safety (and later efficacy, per 1962 amendments) before marketing. Established the New Drug Approval process under the FDA.\n\n2. **Food Safety**: Prohibits the sale of adulterated or misbranded foods. Mandates truthful labeling, including ingredients and nutritional information.\n\n3. **Cosmetics Standards**: Requires cosmetics to be labeled truthfully and prohibits the sale of "adulterated" cosmetics (those with harmful ingredients under normal use). Does not require premarket approval except for color additives.\n\n4. **Medical 

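The cell above already shuffled the formatted conversations and split them into `train_dataset` and `test_dataset` (108 held-out examples).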

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). This run trains for `num_train_epochs = 4` (424 optimizer steps in the log below); for a quick trial run, uncomment `max_steps` and set it to a small value such as 60.

In [6]:
from trl import SFTTrainer, SFTConfig
# Supervised fine-tuning on the pre-rendered "text" column, evaluating on the held-out split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 12,
        gradient_accumulation_steps = 3, # Effective batch size 12 x 3 = 36 (see training log)

        eval_strategy = "steps",
        do_eval = True,
        eval_steps = 10, # Evaluate on the held-out split every 10 steps
        per_device_eval_batch_size = 12,
        eval_accumulation_steps = 3,
        # NOTE(review): presumably writes a checkpoint every 10 steps (assuming the default
        # step-based save strategy) with no save_total_limit set — confirm the disk budget.
        save_steps = 10,
        warmup_steps = 5,
        
        num_train_epochs = 4, # 4 passes over the training set (424 steps in the training log)
        # max_steps = 400,
        learning_rate = 2e-5, # Low learning rate, as suggested for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "wandb", # Log metrics to Weights & Biases
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/3814 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/108 [00:00<?, ? examples/s]

In [7]:
# @title Show current memory stats
# Baseline snapshot taken before training; the final stats cell reads
# start_gpu_memory and max_memory to report the training overhead.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 24.0 GB.
8.391 GB of memory reserved.


Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [8]:
# Start training from scratch (no checkpoint resume). The returned object's `.metrics`
# is read by the final memory/time stats cell.
trainer_stats = trainer.train(resume_from_checkpoint = False)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,814 | Num Epochs = 4 | Total steps = 424
O^O/ \_/ \    Batch size per device = 12 | Gradient accumulation steps = 3
\        /    Data Parallel GPUs = 1 | Total batch size (12 x 3 x 1) = 36
 "-____-"     Trainable parameters = 349,175,808/8,000,000,000 (4.36% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mping98k[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,1.3871,1.377538
20,1.2242,1.209283
30,1.1428,1.137502
40,1.0932,1.096837
50,1.0734,1.076403
60,1.0595,1.063903
70,1.0741,1.054843
80,1.0576,1.047689
90,1.0244,1.041925
100,1.0515,1.036906


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [9]:
# @title Show final memory and time stats
# Relies on start_gpu_memory and max_memory from the pre-training stats cell.
train_seconds = trainer_stats.metrics['train_runtime']
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)  # growth since the baseline
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

for report_line in (
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(report_line)

30826.7258 seconds used for training.
513.78 minutes used for training.
Peak reserved memory = 19.814 GB.
Peak reserved memory for training = 11.423 GB.
Peak reserved memory % of max memory = 82.558 %.
Peak reserved memory for training % of max memory = 47.596 %.


<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Qwen-3` team, the recommended settings for reasoning inference are `temperature = 0.6, top_p = 0.95, top_k = 20`

For normal chat based inference, `temperature = 0.7, top_p = 0.8, top_k = 20`

In [10]:
# Sanity check on a prompt unrelated to the fine-tuning task, with thinking disabled.
messages = [
    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

from transformers import TextStreamer
# Tokens are streamed to the cell output as they are generated; the return value is discarded.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

We are given the equation:

$$
(x + 2)^2 = 0
$$

### Step 1: Take the square root of both sides

Since both sides are squared, we can take the square root of both sides. Remember that taking the square root introduces both a positive and negative solution.

$$
\sqrt{(x + 2)^2} = \sqrt{0}
$$

$$
|x + 2| = 0
$$

### Step 2: Solve the absolute value equation

The absolute value of a number is zero only when the number itself is zero:

$$
x + 2 = 0
$$

### Step 3: Solve for $ x $

$$
x = -2
$$

### ✅ Final Answer:

$$
\boxed{-2}
$$<|im_end|>


In [5]:
# Recall-writer prompt in the same layout as the training template. The literal is
# stripped so the user turn carries no leading/trailing newline: the training prompts
# were built from a stripped template, and the unstripped literal used here before
# did not match that format.
messages = [
    {"role" : "user", "content" : """
Convert the given question and knowledge into memory recall traces.

[question]
What was one of the things the Federal Food, Drug and Cosmetic Act do?
[/question]

[knowledge]
The Federal Food, Drug, and Cosmetic Act (FD&C Act) of 1938 is a U.S. law that regulates foods, drugs, medical devices, cosmetics, and radiation-emitting products. Key provisions include:

1. **Drug Regulation**: Requires drug manufacturers to prove safety (and later efficacy, per 1962 amendments) before marketing. Established the New Drug Approval process under the FDA.

2. **Food Safety**: Prohibits the sale of adulterated or misbranded foods. Mandates truthful labeling, including ingredients and nutritional information.

3. **Cosmetics Standards**: Requires cosmetics to be labeled truthfully and prohibits the sale of "adulterated" cosmetics (those with harmful ingredients under normal use). Does not require premarket approval except for color additives.

4. **Medical Devices**: Later amended in 1976 to establish classification and regulatory oversight for medical devices, requiring safety and effectiveness demonstrations for Class III devices.

5. **Definitions**: 
   - "Adulterated" applies to products contaminated, mixed with harmful substances, or produced under unsanitary conditions.
   - "Misbranded" applies to products with false or misleading labels, or those lacking required information.

6. **Enforcement**: Grants authority to the FDA to inspect facilities, issue recalls, and take legal action against violations. Authorizes civil and criminal penalties for noncompliance.

7. **Amendments**: 
   - **Kefauver-Harris Amendments (1962)**: Added drug efficacy requirements and clinical trial oversight.
   - **Orphan Drug Act (1983)**: Provided incentives for drugs targeting rare diseases.
   - **Food Safety Modernization Act (2011)**: Shifted FDA focus to prevention of foodborne illnesses.

8. **Color Additive Provisions**: Requires preapproval of synthetic color additives in foods, drugs, and cosmetics.

The FD&C Act forms the foundation of U.S. consumer product safety regulation, balancing industry oversight with public health protection.
[/knowledge]
""".strip()}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking (training targets use an empty <think> block)
)

from transformers import TextStreamer
# Tokens are streamed to the cell output as they are generated; the return value is discarded.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

Okay, I need to recall as much knowledge as possible to prepare for any upcoming questions. Let me start by thinking about different areas where I might be asked questions. The example given was about the Federal Food, Drug and Cosmetic Act. I remember that this act is related to food and drug safety in the US. It was enacted in 1938, right? It probably regulates the safety of drugs, cosmetics, and food. Maybe it requires labeling information, sets standards for ingredients, or prevents the sale of harmful products. I think it also established the Food and Drug Administration (FDA) as a regulatory body. There might be amendments over time, like the Kefauver-Harris Amendments in 1962 which added requirements for drug efficacy. The act could have provisions about adulterated or misbranded products. Also, it might cover medical devices now. I should also recall other laws related to consumer protection, like the Pure Food and Drug Act of 1906, which was earlier. The FD&C Act might have ex

In [12]:
# Same math prompt as the earlier sanity check, this time with thinking enabled.
messages = [
    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = True, # Enable thinking
)

from transformers import TextStreamer
# Tokens are streamed to the cell output as they are generated; the return value is discarded.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024, # Increase for longer outputs!
    temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

<think>
Okay, so I need to solve the equation (x + 2)^2 = 0. Hmm, let me think. First, I remember that when you have something squared equals zero, that usually means the inside has to be zero because any real number squared is non-negative, and the only way for it to be zero is if the inside is zero. So, maybe I can start by taking the square root of both sides? Wait, but if I take the square root of both sides, I have to consider both the positive and negative roots, right? But since the square of something is zero, maybe there's only one solution?

Let me write it out step by step. Starting with (x + 2)^2 = 0. If I take the square root of both sides, that would give me sqrt((x + 2)^2) = sqrt(0). The left side simplifies to |x + 2|, because the square root of a square is the absolute value. The right side is sqrt(0) which is 0. So, |x + 2| = 0. The absolute value of something is zero only when that something is zero. Therefore, x + 2 = 0. Solving for x, subtract 2 from both sides: x 

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [13]:
# Save the LoRA adapters and the tokenizer to the local "lora_model" directory ...
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# ... and upload both to the same Hugging Face Hub repo.
model.push_to_hub("ping98k/qwen3-8b-recall-writer-4e") 
tokenizer.push_to_hub("ping98k/qwen3-8b-recall-writer-4e")

README.md:   0%|          | 0.00/586 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/1.40G [00:00<?, ?B/s]

Saved model to https://huggingface.co/ping98k/qwen3-8b-recall-writer-4e


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Now if you want to load the LoRA adapters we just saved for inference, run the cell below (it is already enabled with `if True:`; change it to `if False:` to skip it):

In [1]:
# Reload the model and tokenizer from the local "lora_model" directory written by the
# save cell. The execution count shows this cell was run first in a fresh kernel, hence
# the import inside the branch.
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # Local directory the adapters were saved to
        max_seq_length = 2048,
        load_in_4bit = True,
    )

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-04 13:24:34 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.4.7: Fast Qwen3 patching. Transformers: 4.51.3. vLLM: 0.8.4.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 24.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unsloth 2025.4.7 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [15]:
# Optional export paths: every branch is disabled with `if False:`; flip one to True to use it.
# The "hf/model" repo ids and the empty `token = ""` values are placeholders to fill in first.

# Merge to 16bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: # Pushing to HF Hub
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: # Pushing to HF Hub
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False:
    model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: # Pushing to HF Hub
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)

In [16]:
# Optional GGUF exports: every branch is disabled with `if False:`; flip one to True to use it.
# The "hf/model" repo ids and the empty `token = ""` values are placeholders to fill in first.

# Save to 8bit Q8_0
if False:
    model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: # Pushing to HF Hub
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
# NOTE(review): this repo name ends in "-1e-gguf" while the adapters were pushed to
# "...-recall-writer-4e" — confirm the intended target before enabling.
if False: # Pushing to HF Hub
    model.push_to_hub_gguf("ping98k/qwen3-8b-recall-writer-1e-gguf", tokenizer, quantization_method = "q4_k_m")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )

Now, use the `model.gguf` file or `model-Q4_K_M.gguf` file in llama.cpp or a UI based system like Jan or Open WebUI. You can install Jan [here](https://github.com/janhq/jan) and Open WebUI [here](https://github.com/open-webui/open-webui)

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
4. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!

<div class="align-center">
  <a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a>

  Join Discord if you need help + ⭐️ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐️
</div>
