https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da

In [None]:
# Authenticate with the Hugging Face Hub from the shell; the push_to_hub
# calls at the end of this notebook upload to the Hub.
# NOTE(review): this cell runs before the pip-install cell, so it presumably
# relies on `huggingface-cli` already being installed — confirm on a fresh runtime.
!huggingface-cli login



In [None]:
!pip install --quiet accelerate==0.21.0
!pip install --quiet peft==0.4.0
!pip install --quiet bitsandbytes==0.40.2
!pip install --quiet transformers==4.31.0
!pip install --quiet trl==0.4.7
!pip install --quiet wandb
!pip install --quiet evaluate
!pip install --quiet xformers

In [None]:
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import time

In [None]:
# ---- Model identifiers ----
model_name = "NousResearch/Llama-2-7b-chat-hf"  # base checkpoint loaded for fine-tuning
new_model = "llama-2-7b-sentiment-classifier"  # name the fine-tuned model is saved / pushed under

# ---- LoRA adapter settings (fed to LoraConfig) ----
lora_r, lora_alpha, lora_dropout = 64, 16, 0.1

# ---- bitsandbytes quantization settings (fed to BitsAndBytesConfig) ----
use_4bit, use_nested_quant = True, False
bnb_4bit_compute_dtype = "float16"  # later resolved to a torch dtype via getattr(torch, ...)
bnb_4bit_quant_type = "nf4"

# ---- Training settings ----
output_dir = "./results"
num_train_epochs = 3
max_steps = -1
fp16 = bf16 = False
per_device_train_batch_size = per_device_eval_batch_size = 16
gradient_accumulation_steps = 1
gradient_checkpointing = True

# Optimizer and learning-rate schedule
optim = "paged_adamw_32bit"
learning_rate = 2e-4
weight_decay = 0.001
max_grad_norm = 0.3
lr_scheduler_type = "cosine"
warmup_ratio = 0.03

# Batching, checkpointing and logging
group_by_length = True
save_steps = 0
logging_steps = 25

# ---- SFTTrainer / model-placement settings ----
max_seq_length = None
packing = False
device_map = {"": 0}

In [None]:
from datasets import load_dataset

# Fetch IMDB and shuffle it with a fixed seed so repeated runs see the same order.
imdb = load_dataset("imdb").shuffle(seed=42)

# Carve 1% of the training split off as a held-out set. shuffle=False here
# because the data was already shuffled on the line above.
dataset = imdb["train"].train_test_split(shuffle=False, test_size=0.01)

In [None]:
# Display the first three rows of the held-out split as a quick sanity check.
dataset["test"][:3]

{'text': ['There\'s no shortage of bad dialogue in David and Bathsheba \x96 "I was quite a hand with a slingshot," "The King of all Israel out there in the darkness exposing himself to the enemy" (full marks to Dennis Hooey for delivering that one with a straight face), "Go and sit with the concubines." And somehow I doubt a bored David ever told the prophet Nathan "Whatever you say." He even tries the old "My kingdom doesn\'t understand me" routine on desperate housewife Bathsheba at one point. So it\'s probably a tribute to Henry King\'s direction that the film isn\'t at all bad despite the pitfalls much of the first third provide. Maybe it\'s the censor-baiting nature of the plot \x96 a married man kills a femme fatale\'s husband and gets away with it! \x96 but King brings out the growing moral and theological complexities in Phillip Dunne\'s script rather than upping the sin and sandals hokum. This is the conflicted David on the downhill slope, abandoned by a vengeful God he no lon

In [None]:
def format_dolly(sample, include_answer=True):
    """Build the instruction-style sentiment prompt for one sample.

    Args:
        sample: Mapping with a ``"text"`` entry (the review text) and, when
            ``include_answer`` is true, a ``"label"`` entry.
        include_answer: When true (the default, used for training data) the
            label is written after the ``### Answer`` header. When false the
            prompt ends right after the header, which is the form needed to
            let the model generate the answer at inference time.

    Returns:
        The system prompt, instruction, context and answer sections joined by
        blank lines.
    """
    # Built from adjacent literals rather than an indented triple-quoted string:
    # the triple-quoted form leaked the function-body indentation (four spaces)
    # into the prompt in front of the second paragraph.
    # NOTE: a model already fine-tuned on the old, indented text should keep
    # being prompted with that exact text.
    sys_prompt = (
        "<<SYS>>You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.\n"
        "\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.<</SYS>>"
    )

    # Plain string: there is nothing to interpolate in the instruction.
    instruction = "### Instruction\nRate the sentiment of the following text, using a 0 or 1. 1 is a good sentiment and 0 is a bad one. Do not respond with anything other than a 0 or 1."
    context = f"### Context\n{sample['text']}"
    if include_answer:
        response = f"### Answer\n{sample['label']}"
    else:
        response = "### Answer\n"
    return "\n\n".join([sys_prompt, instruction, context, response])

from random import randrange

# Print the formatted prompt of one randomly chosen training row.
train_split = dataset["train"]
random_row = train_split[randrange(len(train_split))]
print(format_dolly(random_row))

<<SYS>>You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.<</SYS>>

### Instruction
Rate the sentiment of the following text, using a 0 or 1. 1 is a good sentiment and 0 is a bad one. Do not respond with anything other than a 0 or 1.

### Context
"Happenstance" is the most New York-feeling Parisian film I've seen since "When the Cat's Away (Chacun cherche son chat). "<br /><br />A film from last year released now to capitalize on the attention Audrey Tatou is getting for "Amelie," its French title is more apt: "Le Battement d'ailes du papillon (The Beating of the Butterfly's Wings)" as in summarizing chaos theo

util for getting predictions

In [None]:
# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Pad on the right-hand side; the original author notes this works around an
# overflow issue seen with fp16 training.
tokenizer.padding_side = "right"
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def template_dataset(sample):
    """Overwrite ``sample["text"]`` with its full prompt plus the EOS token.

    The sample is updated in place and also returned, so the function can be
    used as a ``map`` callback.
    """
    prompt = format_dolly(sample)
    sample["text"] = "{}{}".format(prompt, tokenizer.eos_token)
    return sample

# Run the prompt template over both splits and drop the raw `label` column
# (the label has already been written into the prompt text).
train_dataset, eval_dataset = (
    dataset[split_name].map(template_dataset, remove_columns=["label"])
    for split_name in ("train", "test")
)

Map:   0%|          | 0/24750 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# Turn the dtype name from the config cell into the matching torch dtype.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to the model loader below.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Advisory only: when computing in float16 on a GPU whose major compute
# capability is 8 or higher, print a hint that bf16 could be used instead.
# No setting is changed automatically.
if use_4bit and compute_dtype == torch.float16:
    major = torch.cuda.get_device_capability()[0]
    if major >= 8:
        banner = "=" * 80
        print(
            banner,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            banner,
            sep="\n",
        )

# Base model, loaded with the quantization settings onto the configured device map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1
# NOTE(review): presumably disabled because the generation cache is not needed
# while training — confirm.
model.config.use_cache = False

# LoRA adapter settings; passed to the trainer as `peft_config`.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type="CAUSAL_LM",
    bias="none",
)

Your GPU supports bfloat16: accelerate training with bf16=True


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Training configuration. Every value comes from the config cell, except the
# reporting / evaluation / checkpointing options written out literally below.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # The config cell defines the next two settings, but they were never
    # forwarded, so the library defaults were used instead of the values
    # the config declares.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_checkpointing=gradient_checkpointing,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="wandb",
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the one with the best eval_loss when training finishes.
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model='eval_loss',
    save_total_limit=2
)

# Supervised fine-tuning trainer: trains on the "text" column of the templated
# datasets, with the LoRA settings supplied through `peft_config`.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing
)



Map:   0%|          | 0/24750 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning with the trainer built in the previous cell.
trainer.train()
# Save the trained model under the `new_model` directory name.
# NOTE(review): presumably this writes only the LoRA adapter weights — confirm.
trainer.model.save_pretrained(new_model)

[34m[1mwandb[0m: Currently logged in as: [33msegmed_matt[0m ([33msegmed[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.686,1.600287
2,1.604,1.593363
3,1.5358,1.593754


In [None]:
import gc

# Free memory for merging weights.
# Drop this notebook's reference to the training model, collect reference
# cycles that could still be pinning its tensors, and only then ask PyTorch to
# release its cached CUDA blocks (empty_cache cannot free blocks that are still
# owned by live tensors).
# NOTE(review): `trainer` appears to keep its own reference to the model
# (`trainer.model`), so GPU memory is only fully released once that is dropped too.
del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Show the nvidia-smi lines that contain "MiB", i.e. current GPU memory usage.
!nvidia-smi | grep MiB

| N/A   34C    P0    65W / 400W |   6713MiB / 40960MiB |      0%      Default |


In [None]:
# Echo the name the trained model was saved under.
new_model

'llama-2-7b-sentiment-classifier'

In [None]:
from peft import AutoPeftModelForCausalLM

# Destination for the merged checkpoint, inside the training output directory.
output_merged_dir = os.path.join(output_dir, "final_merged_checkpoint")

# Reload the saved model from `new_model` in bfloat16 and call
# merge_and_unload(), which is expected to fold the adapter into the base
# weights and return a plain model.
model = AutoPeftModelForCausalLM.from_pretrained(
    new_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).merge_and_unload()

# Write the merged model to disk using safetensors serialization.
model.save_pretrained(output_merged_dir, safe_serialization=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Upload the merged model, then the tokenizer, to the Hub repo named by `new_model`.
# NOTE(review): with use_temp_dir=False the files are presumably staged in a
# local folder of the same name — the folder the trained model was saved to
# earlier. Confirm that mixing those files is intended.
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/foobar8675/llama-2-7b-sentiment-classifier/commit/9f6672a70557aaae7df36e0c8db12e828df411a3', commit_message='Upload tokenizer', commit_description='', oid='9f6672a70557aaae7df36e0c8db12e828df411a3', pr_url=None, pr_revision=None, pr_num=None)