Connect to a T4 GPU runtime before running any cells! (Takuto was here)

Source notebooks: most of the code was copied from the first notebook below. The exception is the first (install/import) cell, which was copied from the second notebook because the first notebook's version caused errors.

1. Finetuning Llama 3.1: https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-Alpaca.ipynb

2. GRPO (R1 reasoning) with Llama 3.1: https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb#scrollTo=GPgXROedCcqP


Useful links:
- https://docs.unsloth.ai/basics/continued-pretraining
- https://docs.unsloth.ai/get-started/fine-tuning-guide

In [None]:
%%capture
# Install Unsloth and its dependencies. %%capture (which has to stay the first
# line of the cell) hides the pip output.
import os
# The runtime is treated as Colab when any environment variable name contains
# the substring "COLAB_"; every other environment gets the plain install.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last installs use --no-deps, so pip does not pull in (or
    # replace) their dependencies; xformers and trl are pinned to exact versions.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
import os
import numpy as np

# Maximum sequence length passed to both the model loader here and the
# trainer later in the notebook.
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!

# Checkpoint to load: either the base model, or a local directory written by
# the save cell further down.
model_name = "unsloth/Meta-Llama-3.1-8B" # default
# model_name = "./model" # loading a pretrained saved model

# Load the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = None,  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True,    # Use 4bit quantization to reduce memory usage. Can be False.
)

In [None]:
# Attach LoRA adapters to the loaded model. `model` is rebound to the wrapped
# model, so re-running this cell without reloading wraps it a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank: Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Modules that receive an adapter.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # Fixed seed for reproducibility
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# Prompt template shared by training and inference. The first {} receives the
# problem text and the second {} receives the solution (passed as "" when
# generating).
prompt = """
### Instruction:
Below is a math problem, please solve it as best you can step by step.

### Input:
{}

### Response:
{}
"""

# NOTE(review): this setting is not read by any code visible in this notebook;
# presumably it was meant to select a MATH difficulty level - confirm or remove.
level = 5 # 1-5

# End-of-sequence marker appended to every formatted training text.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_func_gsm8k(examples):
    """Render a batch of GSM8K rows into training strings.

    `examples` is a batched mapping with parallel 'question' and 'answer'
    sequences. Returns {"text": [...]}, one entry per pair, where each entry
    is the prompt template filled with that pair and followed by EOS_TOKEN.
    """
    pairs = zip(examples['question'], examples['answer'])
    return {
        "text": [
            prompt.format(problem, solution) + EOS_TOKEN
            for problem, solution in pairs
        ],
    }

def formatting_func_MATH(examples):
    """Render a batch of MATH rows into training strings.

    `examples` is a batched mapping with parallel 'problem' and 'solution'
    sequences. Returns {"text": [...]}, one entry per pair, where each entry
    is the prompt template filled with that pair and followed by EOS_TOKEN.

    The 'level' column is deliberately not read: it was previously bound to
    an unused local that shadowed the module-level `level` setting and made
    the function fail on batches without that column.
    """
    problems = examples['problem']
    solutions = examples['solution']
    return {
        "text": [
            prompt.format(problem, solution) + EOS_TOKEN
            for problem, solution in zip(problems, solutions)
        ],
    }


from datasets import load_dataset
# GSM8K ("main" configuration): format the train and test splits, adding a
# "text" column built by formatting_func_gsm8k.
gsm8k = load_dataset("openai/gsm8k", "main")
gsm8k_train = gsm8k['train'].map(formatting_func_gsm8k, batched=True)
gsm8k_test = gsm8k['test'].map(formatting_func_gsm8k, batched=True)

# MATH benchmark: same treatment, using formatting_func_MATH.
MATH = load_dataset("nlile/hendrycks-MATH-benchmark")
MATH_train = MATH['train'].map(formatting_func_MATH, batched=True)
MATH_test = MATH['test'].map(formatting_func_MATH, batched=True)

In [None]:
from trl import SFTTrainer, SFTConfig
# from peft import LoraConfig
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the formatted "text" column.
# The MATH splits are active; the commented-out lines switch to GSM8K.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    # train_dataset = gsm8k_train,
    # eval_dataset = gsm8k_test,
    train_dataset = MATH_train,
    # NOTE(review): no evaluation strategy is set in the arguments below, so
    # eval_dataset is presumably not used during training - confirm.
    eval_dataset = MATH_test,
    dataset_text_field = "text", # Column produced by the formatting functions
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # 2 samples per device x 4 accumulation steps = 8 samples per
        # optimizer step on each device.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 4, # Number of full passes over train_dataset.
        # max_steps = 60,
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on bf16 support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1, # One log entry per step; the save cell reads these
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # "none" turns off external loggers; set e.g. "wandb" to enable one
    ),
)

In [None]:
trainer_stats = trainer.train()

In [None]:
# !zip -r outputs.zip outputs

# Save the training progress (loss vs. steps).
# Keep only log entries that carry a "loss" value. Slicing off the last entry
# is not enough: summary or evaluation entries have no "loss" key (KeyError),
# and an interrupted run ends on a real loss entry that should be kept.
plot_data = [
    (entry['step'], entry['loss'])
    for entry in trainer.state.log_history
    if 'loss' in entry
]
# zip(*[]) yields nothing to unpack, so fall back to two empty tuples.
steps, loss = zip(*plot_data) if plot_data else ((), ()) # steps and loss are tuples

np.savez("training.npz", steps=steps, loss=loss)

# Save the LoRA-updated weights into ./model. The tokenizer is passed to the
# same call, presumably so it is written next to the weights - confirm.
model.save_pretrained_merged("model", tokenizer, save_method="lora")


In [None]:
!zip -r model.zip model training.npz

save_name = input(prompt="Save name: ")
#save_name = "gsm8k_math_0"
import os
from google.colab import files

os.rename("model.zip", f"{save_name}.zip")
files.download(f'{save_name}.zip')

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Pick one random test problem (swap the commented lines to use GSM8K).
# sample = gsm8k_test[int(np.random.randint(0, gsm8k_test.num_rows))]
sample = MATH_test[int(np.random.randint(0, MATH_test.num_rows))]

# The template ends with "{}\n". With an empty response this would leave a
# blank line after "### Response:", which never occurs at that position in the
# training texts (there the solution starts right after "### Response:\n").
# Trim the prompt so that it ends exactly where a training solution begins.
generation_prompt = prompt.format(
    # sample['question'], # instruction
    sample['problem'], # instruction
    "", # output - leave this blank for generation!
).rstrip("\n") + "\n"

inputs = tokenizer([generation_prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])

# Reference solution for comparison with the generated text.
# print(f"\n\nSample Answer: {sample['answer']}")
print(f"\n\nSample Answer: {sample['solution']}")