In [None]:
%%capture
# Dependency bootstrap for this session. The `%%capture` cell magic above must stay on
# the first line of the cell; it keeps the pip logs out of the rendered output.
# NOTE(review): none of the installs below are version-pinned, so re-running this cell
# later may pull different releases.
import torch
# (major, minor) compute capability of the current CUDA device; only `major_version`
# is used below, to choose which Unsloth extra to install.
major_version, minor_version = torch.cuda.get_device_capability()
# Weights & Biases client, imported by later cells for experiment tracking.
!pip install wandb
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install "unsloth[colab_ampere] @ git+https://github.com/unslothai/unsloth.git"
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"
# No-op statement closing the if/else above.
pass

In [None]:
from unsloth import FastLanguageModel
import torch

# Requested context window. The load log for this checkpoint reports a native limit of
# 2048 tokens and that RoPE scaling is applied to reach the value requested here.
max_seq_length = 4096

# None lets the loader pick the dtype itself
# (Float16 for Tesla T4 / V100, Bfloat16 for Ampere and newer).
dtype = None

# Load 4-bit quantized weights to reduce memory usage. Can be False.
load_in_4bit = True

# Returns the model together with its tokenizer; both are reused by later cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    # Swap for "unsloth/tinyllama" to load the 16-bit weights instead.
    model_name="unsloth/tinyllama-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.1
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.22.post7. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: unsloth/tinyllama-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!
You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


model.safetensors:   0%|          | 0.00/762M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [None]:
# Wrap the base model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank: any number > 0; suggested values are 8, 16, 32, 64, 128.
    r=32,
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=32,
    # Currently only dropout = 0 is supported.
    lora_dropout=0,
    # Currently only bias = "none" is supported.
    bias="none",
    # Already enabled here; this is the setting to keep on if you run out of memory.
    use_gradient_checkpointing=True,
    random_state=3407,
    # Rank-stabilized LoRA is supported but not used.
    use_rslora=False,
    # LoftQ is supported but not used.
    loftq_config=None,
)

Unsloth 2024.1 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


In [None]:
# @title prepare data

# Alpaca-style prompt template with three positional `{}` slots. Callers in this
# notebook fill them, in order, with (instruction, input, output); the inference cells
# pass an empty string for the last slot so the model generates the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence string taken from the loaded tokenizer; appended to every formatted
# training example by `formatting_prompts_func`.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of records into full training strings.

    Args:
        examples: Mapping with parallel sequences under the keys
            "instruction", "input" and "output" (one entry per record).

    Returns:
        A dict with a single key, "text", holding one string per record: the
        `alpaca_prompt` template filled with that record's three fields,
        followed by `EOS_TOKEN`.
    """
    rendered = [
        # The EOS marker is required; without it generation never learns to stop.
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in zip(
            examples["instruction"],
            examples["input"],
            examples["output"],
        )
    ]
    return {"text": rendered}

from datasets import load_dataset

# Fetch the train split and, in the same expression, add the "text" column produced by
# `formatting_prompts_func` (called in batched mode, so it receives lists of fields).
dataset = load_dataset("yahma/alpaca-cleaned", split="train").map(
    formatting_prompts_func,
    batched=True,
)

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [None]:
# Hold out 0.4% of the formatted examples as an evaluation split.
# The split is seeded (same value as the LoRA `random_state` and the trainer `seed`
# used elsewhere in this notebook) so that Restart & Run All yields the same
# train/test partition and eval losses stay comparable between runs.
dataset_dict = dataset.train_test_split(test_size=0.004, seed=3407)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 51552
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 208
    })
})

In [None]:
# @title wandb init
# Authenticate this session with Weights & Biases. As the cell output shows, this
# prompts for an API key when none is configured and stores it in the netrc file.
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Configure Weights & Biases through environment variables before the trainer is built.
import os
# NOTE(review): presumably read by the Transformers W&B integration; "all" is expected
# to request logging of both gradients and parameters — confirm against its docs.
%env WANDB_WATCH=all
# Silence wandb's console output.
%env WANDB_SILENT=true
# Echo the value back to confirm the `%env` magic actually set it in this process.
os.environ.get("WANDB_SILENT")

env: WANDB_WATCH=all
env: WANDB_SILENT=true


'true'

In [None]:
# Start a W&B run and build the supervised fine-tuning trainer.
import os

from trl import SFTTrainer
from transformers import TrainingArguments
from transformers.utils import logging
import wandb

logging.set_verbosity_info()
project_name = "tiny-llama"
# NOTE(review): `entity` is defined but never passed to `wandb.init`, so the run is
# created under the logged-in user's default entity.
entity = "wandb"
# os.environ["WANDB_LOG_MODEL"] = "checkpoint"

wandb.init(project=project_name, name = "new_run2")

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = True, # Packs short sequences together to save time!
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps = 4,
        evaluation_strategy="steps",
        # Without an explicit value, `eval_steps` falls back to `logging_steps` (1),
        # which runs a full evaluation pass after every optimizer step and dominates
        # the training time. Evaluate periodically instead; loss is still logged
        # every step via `logging_steps` below.
        eval_steps = 25,
        warmup_ratio = 0.1,
        num_train_epochs = 1,
        learning_rate = 2e-5,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to="wandb",  # enable logging to W&B
        # run_name="tiny-llama-alpaca-run6",  # name of the W&B run (optional)
        logging_steps=1,  # how often to log to W&B
        logging_strategy = 'steps',
        save_total_limit=2,  # keep at most two checkpoints in `output_dir`
    ),
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁
train/learning_rate,▁
train/loss,▁

0,1
eval/loss,2.40896
eval/runtime,7.2294
eval/samples_per_second,1.522
eval/steps_per_second,0.277
train/epoch,0.01
train/global_step,1.0
train/learning_rate,0.0
train/loss,2.3798


using `logging_steps` to initialize `eval_steps` to 1
PyTorch: setting up devices


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Using auto half precision backend


In [None]:
# Inspect the evaluation set as the trainer holds it. The output below shows it as
# `input_ids`/`labels` rows rather than the raw text records — presumably the result
# of tokenization with `packing = True`; confirm against the SFTTrainer docs.
trainer.eval_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 11
})

In [None]:
# Run the fine-tuning loop and keep the returned statistics for later inspection.
trainer_stats = trainer.train()
# Close the W&B run opened by `wandb.init` so its data is flushed and the run is marked
# finished. NOTE(review): this line is skipped if `train()` raises or is interrupted.
wandb.finish()

***** Running training *****
  Num examples = 3,000
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 375
  Number of trainable parameters = 25,231,360
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
1,2.4387,2.408967
2,2.3981,2.408967
3,2.3892,2.40881
4,2.3791,2.408331
5,2.389,2.407179
6,2.4149,2.405215
7,2.3692,2.402019
8,2.3226,2.397344
9,2.3293,2.390943
10,2.397,2.38292


***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 

Step,Training Loss,Validation Loss
1,2.4387,2.408967
2,2.3981,2.408967
3,2.3892,2.40881
4,2.3791,2.408331
5,2.389,2.407179
6,2.4149,2.405215
7,2.3692,2.402019
8,2.3226,2.397344
9,2.3293,2.390943
10,2.397,2.38292


***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 11
  Batch size = 2
***** Running Evaluation *****
  Num examples = 

Save model

In [None]:
# Local saving. The tokenizer is written to the same directory as the model so that
# "lora_model" is self-contained when the later cells reload both from it.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving

Inference

In [None]:
# Switch this guard to True to reload the saved "lora_model" directory (for example
# in a fresh session) instead of reusing the in-memory `model` / `tokenizer`.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

# One-element batch: instruction set, input empty, and the response slot left blank
# so the model fills it in.
inputs = tokenizer(
    [alpaca_prompt.format("capital of France?", "", "")],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
# Decoded text (prompt plus generation) is the cell's displayed result.
tokenizer.batch_decode(outputs)

Stream Responses

In [None]:
from transformers import TextStreamer

# Switch this guard to True to reload the saved "lora_model" directory (for example
# in a fresh session) instead of reusing the in-memory `model` / `tokenizer`.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )

# One-element batch: instruction set, input empty, and the response slot left blank
# so the model fills it in.
inputs = tokenizer(
    [alpaca_prompt.format("capital of France?", "", "")],
    return_tensors="pt",
).to("cuda")

# The streamer prints text as it is generated; the returned ids are discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=64)