In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# NOTE: nothing below is version-pinned except the xformers fallback, so a
# fresh run may resolve different releases than the ones shown in the logs.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
# Torch older than 2.4.0 gets the pinned xformers 0.0.27; any other Torch
# version gets whatever xformers release pip resolves.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# {xformers} is interpolated by IPython from the Python variable above.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
%%capture
# Weights & Biases client; the training cell reports metrics to it (report_to="wandb").
!pip install wandb

In [None]:
from unsloth import FastLanguageModel
import torch
# These three settings are reused below; max_seq_length is also passed to SFTTrainer.
max_seq_length = 4095 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the base Phi-3.5-mini-instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). `model` is rebound to the
# adapted model, so re-running this cell would wrap an already wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
We now use the `Phi-3` format for conversation style finetunes. We use [Open Assistant conversations](https://huggingface.co/datasets/philschmid/guanaco-sharegpt-style) in ShareGPT style. Phi-3 renders multi turn conversations like below:

```
<|user|>
Hi!<|end|>
<|assistant|>
Hello! How are you?<|end|>
<|user|>
I'm doing great! And you?<|end|>

```

**[NOTE]** To train only on completions (ignoring the user's input) read Unsloth's docs [here](https://github.com/unslothai/unsloth/wiki#train-on-completions--responses-only-do-not-train-on-inputs).

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old` and our own optimized `unsloth` template.

Note ShareGPT uses `{"from": "human", "value" : "Hi"}` and not `{"role": "user", "content" : "Hi"}`, so we use `mapping` to map it.

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [None]:
from google.colab import userdata
import os

# Copy the Colab secrets into environment variables so no token is hard-coded
# in the notebook. HF_HUB_TOKEN is read back by the Hugging Face login cell.
os.environ["HF_HUB_TOKEN"] = userdata.get('HF_TOKEN')
os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')

In [None]:
from huggingface_hub import login
import os

# Authenticate with the Hugging Face Hub using the token stored in HF_HUB_TOKEN.
login(token=os.getenv("HF_HUB_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset, Features, Value, DatasetDict, concatenate_datasets

# Explicit column schema passed to load_dataset: every column is read as a string.
features = Features({
    'input': Value('string'),
    'output': Value('string'),
    'instruct': Value('string'),
    'dataset_type': Value('string'),
    'dataloader_name': Value('string')
})

# Rebind the tokenizer to one that renders conversations with the Phi-3 chat template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
)

def formatting_prompts_func(row):
    """Render one dataset row as a single-turn chat transcript.

    The user message is the row's ``instruct`` text followed by a newline and
    its ``input`` text; the assistant message is the row's ``output`` text.
    Both are rendered with the module-level ``tokenizer``'s chat template,
    without tokenizing and without a trailing generation prompt.

    Args:
        row: mapping with the keys ``instruct``, ``input`` and ``output``.

    Returns:
        ``{"text": <rendered conversation>}``.
    """
    user_turn = {
        "content": f"{row['instruct']}\n{row['input']}",
        "role": "user",
    }
    assistant_turn = {
        "content": f"{row['output']}",
        "role": "assistant",
    }
    rendered = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize = False,
        add_generation_prompt = False,
    )
    return {"text": rendered}

In [None]:
from collections import Counter

def custom_sample(dataset, sampling_fractions):
    """Subsample a ``DatasetDict`` per data source and merge the results.

    For each ``name -> fraction`` entry, the rows whose ``dataloader_name``
    equals ``name`` are selected in every split. A fraction below 1.0 is passed
    to ``train_test_split(train_size=fraction, seed=42)`` and the resulting
    ``train`` part is kept; otherwise all matching rows are kept. Rows whose
    ``dataloader_name`` is not a key of ``sampling_fractions`` are dropped.

    Args:
        dataset: ``DatasetDict`` whose splits contain a ``dataloader_name`` column.
        sampling_fractions: mapping from dataloader name to the fraction to keep.

    Returns:
        A ``DatasetDict`` with the same split names as ``dataset``; each split is
        the concatenation of the per-name samples, in ``sampling_fractions`` order.

    Raises:
        ValueError: if ``sampling_fractions`` is empty (there is nothing to
            concatenate).
    """
    if not sampling_fractions:
        raise ValueError("sampling_fractions must contain at least one dataloader name")

    # Take the split names from the input rather than from a variable that only
    # exists once the loop body has run.
    split_names = list(dataset.keys())

    sampled_datasets = []
    for name, fraction in sampling_fractions.items():
        filtered_dataset = dataset.filter(lambda x: x['dataloader_name'] == name)
        if fraction < 1.0:
            sampled_dataset = DatasetDict({
                split: filtered_dataset[split].train_test_split(train_size=fraction, seed=42)['train']
                for split in split_names
            })
        else:
            sampled_dataset = filtered_dataset
        sampled_datasets.append(sampled_dataset)

    return DatasetDict({
        split: concatenate_datasets([d[split] for d in sampled_datasets])
        for split in split_names
    })


def value_counts(data):
    """Print a ``Counter`` with the number of occurrences of each value in ``data``.

    Nothing is returned; the tally is only written to standard output.
    """
    tally = Counter()
    tally.update(data)
    print(tally)

In [None]:
# Load the "small" data directory of the artificial dataset using the explicit string schema.
artificial_ds = load_dataset("ostapbodnar/ua-gec-pos-ner-artificial", data_dir="small", features=features)

# Print how many training rows each source contributes before any sampling.
value_counts(artificial_ds["train"]['dataloader_name'])

Counter({'PapersDataset': 61100, 'UbertextV2Dataset': 34708})


In [None]:
# Share of each source to keep from the artificial dataset.
sampling_fractions = {
    "PapersDataset": 0.35,
    "UbertextV2Dataset": 1.0,
}


# Drop rows that have no usable source name. A missing value is None rather
# than the string "null", so both are excluded here.
artificial_ds = artificial_ds.filter(lambda x: x['dataloader_name'] not in (None, "null"))
artificial_ds = custom_sample(artificial_ds, sampling_fractions)
artificial_ds = artificial_ds.shuffle(seed=42)

# Cap the splits at 6000 train / 500 test rows. The second seeded shuffle is
# kept as-is so the selected rows match the logged run.
artificial_ds = DatasetDict({
    "train": artificial_ds["train"].shuffle(seed=42).select(range(6000)),
    "test": artificial_ds["test"].shuffle(seed=42).select(range(500)),
})

value_counts(artificial_ds["train"]['dataloader_name'])

Counter({'UbertextV2Dataset': 3704, 'PapersDataset': 2296})


In [None]:
# Load the golden dataset using the explicit string schema.
golden_ds = load_dataset("ostapbodnar/ua-gec-pos-ner-golden", features=features)

# Print how many training rows each source contributes before any sampling.
value_counts(golden_ds["train"]['dataloader_name'])

Counter({'NewsKeywordDataset': 96350, 'NewsTopicClassificationDataset': 96317, 'UaSqaudDataset': 8869, 'UaGecDataset': 4937, 'MovaInstPosDataset': 4627, 'ZnoDataset': 2443, 'NerDataset': 224, 'WscDataset': 177, None: 16})


In [None]:

# Share of each source to keep from the golden dataset. The two news sources
# are by far the largest, so only 2% of each is kept.
sampling_fractions = {
    "UaGecDataset": 0.65,
    "UaSqaudDataset": 1.0,
    "NewsTopicClassificationDataset": 0.02,
    "NewsKeywordDataset": 0.02,
    "WscDataset": 1.0,
    "NerDataset": 1.0,
    'MovaInstPosDataset': 0.65,
    'ZnoDataset': 1.0,
}


# Drop rows that have no usable source name. The raw counts list such rows
# under None, not under the string "null", so both are excluded here.
golden_ds = golden_ds.filter(lambda x: x['dataloader_name'] not in (None, "null"))
golden_ds = custom_sample(golden_ds, sampling_fractions)
golden_ds = golden_ds.shuffle(seed=42)

# Cap the splits at 20000 train / 1000 test rows. The second seeded shuffle is
# kept as-is so the selected rows match the logged run.
golden_ds = DatasetDict({
    "train": golden_ds["train"].shuffle(seed=42).select(range(20000)),
    "test": golden_ds["test"].shuffle(seed=42).select(range(1000)),
})

value_counts(golden_ds["train"]['dataloader_name'])

In [None]:
# Merge the golden and artificial samples split by split (golden rows first),
# then shuffle each merged split with a fixed seed.
mixed_ds = DatasetDict({
    split: concatenate_datasets([golden_ds[split], artificial_ds[split]]).shuffle(seed=42)
    for split in ("train", "test")
})

# Add the rendered chat transcript as a "text" column, one row at a time.
dataset = mixed_ds.map(formatting_prompts_func, batched = False)


Map:   0%|          | 0/26000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Let's see how the `Phi-3` format works by printing the training example at index 5 (i.e. the 6th example).

In [None]:
# Per-source row counts of the final mixed training split.
value_counts(mixed_ds["train"]['dataloader_name'])

Counter({'UaSqaudDataset': 8130, 'UbertextV2Dataset': 3704, 'UaGecDataset': 2962, 'MovaInstPosDataset': 2761, 'PapersDataset': 2296, 'ZnoDataset': 2250, 'NewsKeywordDataset': 1780, 'NewsTopicClassificationDataset': 1745, 'NerDataset': 206, 'WscDataset': 166})


In [None]:
# Show one formatted training row, including the generated "text" column.
dataset['train'][5]

{'input': 'Запитання: Яку посаду обіймав Даррел Девіс в AFL?, контекст: Після успіхів пробних ігор Фостер просувався вперед зі своєю ідеєю футболу на арені. Він заснував футбольну лігу «Арена» з чотирма командами: «Пітсбургський гладіатор», «Денвер Динаміт», «Вашингтон Командос» та «Чикаго Бруізерс». Фостер призначив легендарного Даррела "Мишу" Девіса, хрещеного батька "бігай і стріляй", виконавчим директором з футбольних операцій. Девіс працевлаштував першокласних тренерів і створив оригінальні багатогранні посібники з наступу.',
 'output': 'виконавчим директором з футбольних операцій',
 'instruct': 'Відповідай на запитання, опираючись на додатковий контекст.',
 'dataset_type': '5',
 'dataloader_name': 'UaSqaudDataset',
 'text': '<|user|>\nВідповідай на запитання, опираючись на додатковий контекст.\nЗапитання: Яку посаду обіймав Даррел Девіс в AFL?, контекст: Після успіхів пробних ігор Фостер просувався вперед зі своєю ідеєю футболу на арені. Він заснував футбольну лігу «Арена» з чоти

In [None]:
# Print the rendered transcript so the newlines and special tokens are readable.
print(dataset['train'][5]["text"])

<|user|>
Відповідай на запитання, опираючись на додатковий контекст.
Запитання: Яку посаду обіймав Даррел Девіс в AFL?, контекст: Після успіхів пробних ігор Фостер просувався вперед зі своєю ідеєю футболу на арені. Він заснував футбольну лігу «Арена» з чотирма командами: «Пітсбургський гладіатор», «Денвер Динаміт», «Вашингтон Командос» та «Чикаго Бруізерс». Фостер призначив легендарного Даррела "Мишу" Девіса, хрещеного батька "бігай і стріляй", виконавчим директором з футбольних операцій. Девіс працевлаштував першокласних тренерів і створив оригінальні багатогранні посібники з наступу.<|end|>
<|assistant|>
виконавчим директором з футбольних операцій<|end|>



In [None]:
# Display the split names, columns and row counts of the formatted dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruct', 'dataset_type', 'dataloader_name', 'text'],
        num_rows: 26000
    })
    test: Dataset({
        features: ['input', 'output', 'instruct', 'dataset_type', 'dataloader_name', 'text'],
        num_rows: 1500
    })
})

If you're looking to make your own chat template, that also is possible! You must use the Jinja templating regime. We provide our own stripped down version of the `Unsloth template` which we find to be more efficient, and leverages ChatML, Zephyr and Alpaca styles.

More info on chat templates on [our wiki page!](https://github.com/unslothai/unsloth/wiki#chat-templates)

In [None]:
# Example of a hand-written Jinja chat template: a fixed system line, then one
# ">>> User: " / ">>> Assistant: " line per message, with the EOS token appended
# after every assistant message and an open assistant line when a generation
# prompt is requested.
unsloth_template = "".join([
    "{{ bos_token }}",
    "{{ 'You are a helpful assistant to the user\n' }}",
    "{% for message in messages %}",
    "{% if message['role'] == 'user' %}",
    "{{ '>>> User: ' + message['content'] + '\n' }}",
    "{% elif message['role'] == 'assistant' %}",
    "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}",
    "{% endif %}",
    "{% endfor %}",
    "{% if add_generation_prompt %}",
    "{{ '>>> Assistant: ' }}",
    "{% endif %}",
])
unsloth_eos_token = "eos_token"


<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We train for 3 full epochs (`num_train_epochs=3`), evaluating and saving a checkpoint every 100 steps; for a quick smoke test you can instead cap the run with e.g. `max_steps=60`. We also support TRL's `DPOTrainer`!

In [None]:
# WANDB configuration (optional)

import wandb
import os

# The project name is stored in the PROJECT environment variable and read
# straight back; it is then passed explicitly to wandb.init below.
os.environ["PROJECT"]="phi3.5-mini-ua-mixed"

project_name = os.environ["PROJECT"]

# The same string is used for both the W&B project and the run name.
wandb.init(project=project_name, name = project_name)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mostapbodnar[0m ([33mostap-bodnar[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported


# Hub repository that receives the LoRA checkpoints during training.
new_model_id="ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed"

# Supervised fine-tuning on the rendered "text" column of the mixed dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset['test'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 12,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # 12 sequences per step x 6 accumulation steps = 72 sequences per optimizer update.
        per_device_train_batch_size = 12,
        gradient_accumulation_steps = 6,
        per_device_eval_batch_size=4,
        warmup_steps = 5,
        num_train_epochs=3,
        learning_rate = 2e-4,
        # Every saved checkpoint (every 100 steps) is also pushed to the Hub repo.
        push_to_hub=True,
        hub_model_id=new_model_id,
        hub_strategy="every_save",
        save_strategy="steps",
        save_steps=100,
        do_eval=True,
        # `eval_strategy` is the current name of this argument; the older
        # `evaluation_strategy` spelling is deprecated in the Transformers
        # release this notebook runs on.
        eval_strategy="steps",
        eval_steps=100,
        # Use bf16 where the GPU supports it and fall back to fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to="wandb",
    ),
)



Map (num_proc=12):   0%|          | 0/26000 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
# Baseline taken before training; the final-stats cell subtracts
# start_gpu_memory from the peak to estimate the memory used by training.
# Byte counts are converted to GB (1024 ** 3 bytes) and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
2.285 GB of memory reserved.


In [None]:
# Run the fine-tuning; the returned stats (e.g. metrics['train_runtime']) are reported in the final-stats cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 26,000 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 12 | Gradient Accumulation steps = 6
\        /    Total batch size = 72 | Total steps = 1,083
 "-____-"     Number of trainable parameters = 29,884,416


Step,Training Loss,Validation Loss
100,0.7204,0.627141
200,0.6382,0.579082
300,0.5838,0.555844
400,0.5259,0.54222


Step,Training Loss,Validation Loss
100,0.7204,0.627141
200,0.6382,0.579082
300,0.5838,0.555844
400,0.5259,0.54222
500,0.5671,0.529531
600,0.6041,0.519263
700,0.5661,0.51136
800,0.5391,0.505066
900,0.5156,0.499729
1000,0.5204,0.496238


In [None]:
#@title Show final memory and time stats
# Peak reserved GPU memory in GB (1024 ** 3 bytes), rounded to 3 decimals.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
# Memory attributed to training = peak minus the baseline measured before training.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a percentage of the total device memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
for line in (
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(line)

59953.9967 seconds used for training.
999.23 minutes used for training.
Peak reserved memory = 23.791 GB.
Peak reserved memory for training = 21.506 GB.
Peak reserved memory % of max memory = 60.133 %.
Peak reserved memory for training % of max memory = 54.357 %.


<a name="Inference"></a>
### Inference
Let's run the model! Since we're using `Phi-3`, use `apply_chat_template` with `add_generation_prompt` set to `True` for inference.

In [None]:
from unsloth.chat_templates import get_chat_template

# Re-apply the Phi-3 template, this time with a mapping so that ShareGPT-style
# messages ({"from": ..., "value": ...}) can be passed directly.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Продовжи послідовність фібоначі: 1, 1, 2, 3, 5, 8,"},
]
# return_dict=True makes the tokenizer return the attention mask alongside the
# input ids. Passing both to generate() avoids the "attention mask is not set"
# warning, which appears because the pad token equals the EOS token.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_dict = True,
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|user|> Продовжи послідовність фібоначі: 1, 1, 2, 3, 5, 8,<|end|><|assistant|> 13<|end|><|endoftext|>']

## Upload 16bit version

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# Same install steps as at the top of the notebook, repeated so the upload
# section can be run on its own.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
# Torch older than 2.4.0 gets the pinned xformers 0.0.27; any other Torch
# version gets whatever xformers release pip resolves.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# {xformers} is interpolated by IPython from the Python variable above.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [None]:
from google.colab import userdata
import os

# Copy the Colab secrets into environment variables so no token is hard-coded
# in the notebook. HF_HUB_TOKEN is read back by the Hugging Face login cell.
os.environ["HF_HUB_TOKEN"] = userdata.get('HF_TOKEN')
os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')

In [None]:
from huggingface_hub import login
import os

# Authenticate with the Hugging Face Hub using the token stored in HF_HUB_TOKEN.
login(token=os.getenv("HF_HUB_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4095 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the fine-tuned LoRA repository that the training run pushed to the Hub,
# using the same length / dtype / quantization settings as for training.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/120M [00:00<?, ?B/s]

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32064, 3072)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
       

In [None]:
import torch

# Quick sanity check of the downloaded adapter before merging and uploading.
# return_dict=True makes the tokenizer return the attention mask alongside the
# input ids. Passing both to generate() avoids the "attention mask is not set"
# warning, which appears because the pad token equals the EOS token.
tokenized_input = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Давай відповіді на запитання. \n Input: Яка столиця України? Output:"},
    ],
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to("cuda")

with torch.no_grad():
    output = model.generate(
        **tokenized_input,
        # Bound the answer length explicitly instead of relying on the
        # generation config's default length limit.
        max_new_tokens=64,
    )

# Decode the prompt plus completion, dropping the chat special tokens.
full_response = tokenizer.decode(output[0], skip_special_tokens=True)
full_response

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'You are a helpful assistant Давай відповіді на запитання. \n Input: Яка столиця України? Output: Київ'

In [None]:
# Merge the LoRA adapter into the base weights and upload the result as a 16-bit model to a separate Hub repository.
model.push_to_hub_merged("ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed-16bit", tokenizer, save_method = "merged_16bit")

Unsloth: You are pushing to hub, but you passed your HF username = ostapbodnar.
We shall truncate ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed-16bit to Phi3.5-mini-instruct-UA-lora-unsloth-mixed-16bit
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.3G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 55.86 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 72.62it/s]


Unsloth: Saving tokenizer...

No files have been modified since last commit. Skipping to prevent empty commit.


 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


README.md:   0%|          | 0.00/600 [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

Done.
Saved merged model to https://huggingface.co/ostapbodnar/Phi3.5-mini-instruct-UA-lora-unsloth-mixed-16bit
