In [1]:
%%capture
# Install necessary packages (%%capture suppresses the pip output of this cell)
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Choose the xformers requirement from the installed Torch version:
# Torch older than 2.4.0 gets the pinned xformers 0.0.27, otherwise the latest xformers.
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# --no-deps installs only the listed packages without pulling in their dependencies.
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton
# `evaluate` provides the SQuAD metric loaded in the evaluation cells.
!pip install evaluate

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options for the base model.
max_seq_length = 2048  # Sequence length; also passed to the trainer further down.
dtype = None           # None = auto detection (Float16 for Tesla T4/V100, Bfloat16 for Ampere+).
load_in_4bit = True    # 4bit quantization to reduce memory usage. Can be False.

model_load_kwargs = dict(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_kwargs)

<a name="Data"></a>
### Data Prep

In [None]:
squad_prompt = """Below is a context, followed by a question. Write a response that extracts the answer from the context.

### Context:
{}

### Question:
{}

### Answer:
{}"""

# Appended to every training text; without it generation will go on forever.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Turn a batch of SQuAD rows into prompt-formatted training texts.

    Args:
        examples: batched mapping with "context", "question" and "answers"
            columns; each entry of "answers" holds a "text" list, of which
            only the first element is used.

    Returns:
        A dict with a single "text" key: one filled-in `squad_prompt` per row,
        each terminated by EOS_TOKEN.
    """
    contexts, questions = examples["context"], examples["question"]
    first_answers = [entry["text"][0] for entry in examples["answers"]]
    return {
        "text": [
            squad_prompt.format(ctx, qn, ans) + EOS_TOKEN
            for ctx, qn, ans in zip(contexts, questions, first_answers)
        ],
    }

from datasets import load_dataset, concatenate_datasets


def _load_squad_subset(split, stride):
    """Load a SQuAD split, keep every `stride`-th row, and add the formatted "text" column."""
    full_split = load_dataset("rajpurkar/squad", split=split)
    subset = full_split.select(range(0, len(full_split), stride))
    return subset.map(formatting_prompts_func, batched = True,)


# Training uses every 25th row of the train split ...
dataset = _load_squad_subset("train", stride=25)

# ... and evaluation uses every 100th row of the validation split.
val_dataset = _load_squad_subset("validation", stride=100)

In [4]:
from tqdm.notebook import tqdm

In [None]:
from transformers import AutoTokenizer
from evaluate import load
from unsloth import FastLanguageModel

# Load the SQuAD evaluation metric
squad_metric = load("squad")

# Prepare lists to store predictions and references
predictions_baseline = []
references_baseline = []

# Ensure the model is in inference mode
FastLanguageModel.for_inference(model)

# Same layout as the training prompt; the answer slot is filled with "" so the
# model has to generate it.
squad_prompt_template = """Below is a context, followed by a question. Write a response that extracts the answer from the context.

### Context:
{}

### Question:
{}

### Answer:
{}"""

# Loop over every example of val_dataset (which was already subsampled when it
# was built).
for example in tqdm(val_dataset, total=len(val_dataset)):
    # Deliberately NOT named `squad_prompt`: that global is the training
    # template read by formatting_prompts_func and must not be overwritten.
    prompt = squad_prompt_template.format(
        example['context'],   # context
        example['question'],  # question
        ""                    # answer - leave blank for generation
    )

    # Tokenize the input
    inputs = tokenizer(
        [prompt],
        return_tensors="pt"
    ).to("cuda")
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate predictions from the model (greedy decoding, at most 64 new tokens)
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True, do_sample=False)

    # The returned sequence starts with the prompt tokens. Cut at the token
    # level instead of slicing the decoded string by len(prompt): decoding does
    # not guarantee the prompt text round-trips character-for-character, which
    # would shift the slice and corrupt the extracted answer.
    new_tokens = outputs[0][prompt_token_count:]
    generated_answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Append the generated answer in the required format (id and prediction_text)
    predictions_baseline.append({
        "id": example["id"],  # use the id from the dataset
        "prediction_text": generated_answer.strip()  # the generated text
    })

    # Append the ground truth in the required format (id and answers)
    references_baseline.append({
        "id": example["id"],  # use the same id for the reference
        "answers": {
            "text": example['answers']['text'],  # list of true answers
            "answer_start": example['answers']['answer_start']  # positions of the answers
        }
    })

# Compute the SQuAD metric using predictions and references
results_baseline = squad_metric.compute(predictions=predictions_baseline, references=references_baseline)

# Print the results
print(results_baseline)

In [10]:
import pickle

# Bundle the validation columns with the baseline predictions; predictions were
# appended while iterating val_dataset, so they line up row by row.
data = val_dataset.to_dict()
data["predictions"] = [pred["prediction_text"] for pred in predictions_baseline]

with open("predictions_baseline_worse.pickle", "wb") as out_file:
    pickle.dump(data, out_file)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# The model was switched to inference mode for the baseline evaluation;
# switch it back before attaching the adapters.
FastLanguageModel.for_training(model)

lora_settings = dict(
    r = 32,               # LoRA rank. Any number > 0; suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,     # Any value is supported, but 0 is optimized
    bias = "none",        # Any value is supported, but "none" is optimized
    # Gradient checkpointing is off here; True or "unsloth" is meant for very long context.
    use_gradient_checkpointing = False,
    random_state = 3407,
    use_rslora = False,   # Rank stabilized LoRA is supported but not used
    loftq_config = None,  # LoftQ is supported but not used
)

model = FastLanguageModel.get_peft_model(model, **lora_settings)

### Train the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

FastLanguageModel.for_training(model)

# Use bf16 where the hardware supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 5,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = val_dataset,
    dataset_text_field = "text",  # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

In [None]:
#@title Show current memory stats
def _bytes_to_gb(num_bytes):
    """Convert a byte count to GB (dividing by 1024 three times), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


gpu_stats = torch.cuda.get_device_properties(0)
# Memory reserved before training; used afterwards to isolate the training overhead.
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gb(gpu_stats.total_memory)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning. The returned object is kept because its
# metrics['train_runtime'] is reported in the stats cell and it is pickled later.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Peak reserved memory overall, and the part added on top of the pre-training figure.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a share of the total GPU memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']

print(
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

In [None]:
from transformers import AutoTokenizer
from evaluate import load
from unsloth import FastLanguageModel

# Load the SQuAD evaluation metric
squad_metric = load("squad")

# Prepare lists to store predictions and references
predictions_finetuned = []
references_finetuned = []

# Ensure the model is in inference mode
FastLanguageModel.for_inference(model)

# Use exactly the prompt layout the model was fine-tuned on (and that the
# baseline evaluation uses), so the two scores are comparable. The previous
# one-line template differed from the training format and contained the invalid
# escape "\Q", which put a literal backslash into every prompt.
squad_prompt_template = """Below is a context, followed by a question. Write a response that extracts the answer from the context.

### Context:
{}

### Question:
{}

### Answer:
{}"""

# Loop over every example of val_dataset (which was already subsampled when it
# was built).
for example in tqdm(val_dataset, total=len(val_dataset)):
    # Deliberately NOT named `squad_prompt`: that global is the training
    # template read by formatting_prompts_func and must not be overwritten.
    prompt = squad_prompt_template.format(
        example['context'],   # context
        example['question'],  # question
        ""                    # answer - leave blank for generation
    )

    # Tokenize the input
    inputs = tokenizer(
        [prompt],
        return_tensors="pt"
    ).to("cuda")
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate predictions from the model (greedy decoding, at most 64 new tokens)
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True, do_sample=False)

    # The returned sequence starts with the prompt tokens. Cut at the token
    # level instead of slicing the decoded string by len(prompt): decoding does
    # not guarantee the prompt text round-trips character-for-character, which
    # would shift the slice and corrupt the extracted answer.
    new_tokens = outputs[0][prompt_token_count:]
    generated_answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Append the generated answer in the required format (id and prediction_text)
    predictions_finetuned.append({
        "id": example["id"],  # use the id from the dataset
        "prediction_text": generated_answer.strip()  # the generated text
    })

    # Append the ground truth in the required format (id and answers)
    references_finetuned.append({
        "id": example["id"],  # use the same id for the reference
        "answers": {
            "text": example['answers']['text'],  # list of true answers
            "answer_start": example['answers']['answer_start']  # positions of the answers
        }
    })

In [None]:
# Score the fine-tuned model's predictions against the references.
results_finetuned = squad_metric.compute(
    predictions=predictions_finetuned,
    references=references_finetuned,
)

# Show the resulting metric values.
print(results_finetuned)

In [None]:
import pickle

# Bundle the validation columns with the fine-tuned predictions; predictions
# were appended while iterating val_dataset, so they line up row by row.
data = val_dataset.to_dict()
data["predictions"] = [pred["prediction_text"] for pred in predictions_finetuned]

with open("predictions_finetuned.pickle", "wb") as out_file:
    pickle.dump(data, out_file)

In [None]:
import pickle

# Persist the object returned by trainer.train() for later inspection.
with open("trainer_stats.pickle", "wb") as stats_file:
    pickle.dump(trainer_stats, stats_file)

In [None]:
# Print the first (up to) 10 fine-tuned predictions next to their reference answers.
# The original referenced `predictions` / `references`, which are never defined
# in this notebook and raise NameError on a fresh kernel; zip over the slices
# also avoids an IndexError when fewer than 10 examples were evaluated.
for pred, ref in zip(predictions_finetuned[:10], references_finetuned[:10]):
    print(100*'-')
    print(pred['prediction_text'], "\n", ref['answers']['text'])