In [None]:
!pip install -q accelerate peft bitsandbytes transformers trl

In [None]:
import os
import torch

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset

In [None]:
# Let's check the GPU configuration and instance type
!nvidia-smi

In [None]:
# Dataset identifier used for supervised fine-tuning; only its "train" split is loaded.
DATASET_NAME = "mlabonne/guanaco-llama2-1k"
dataset = load_dataset(path=DATASET_NAME, split="train")

In [None]:
# Let's take a look at one of the entries in our dataset (the record at index 5)
dataset[5]

In [None]:
# Quantization settings handed to BitsAndBytesConfig when loading the base model.
# You may experiment with these values, but they are not a modelling decision:
# the right configuration simply depends on your GPU setup.
USE_4BIT = True
COMPUTE_DTYPE = "float16"
QUANTIZATION_TYPE = "nf4"
USE_NESTED_QUANTIZATION = False

bnb_config = BitsAndBytesConfig(
    load_in_4bit=USE_4BIT,
    bnb_4bit_compute_dtype=COMPUTE_DTYPE,
    bnb_4bit_quant_type=QUANTIZATION_TYPE,
    bnb_4bit_use_double_quant=USE_NESTED_QUANTIZATION,
)

# Print a hint when the GPU's major compute capability is 8 or higher
# (the threshold this notebook uses for bfloat16 support).
major, _ = torch.cuda.get_device_capability()
if major >= 8:
    banner = "=" * 80
    print(banner)
    print("Your GPU supports bfloat16: accelerate training with bf16=True")
    print(banner)

In [None]:
!pip install --upgrade accelerate

In [None]:
# Base checkpoint to fine-tune. Alternatives that were tried with this notebook:
#   MODEL_NAME = "distilgpt2"
#   MODEL_NAME = "NousResearch/Llama-2-7b-chat-hf"
MODEL_NAME = "facebook/opt-1.3b"

# Device map passed to from_pretrained: the empty module name is mapped to device index 0.
DEVICE_MAP = {"": 0}

# Load the base model with the quantization settings defined in bnb_config.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=DEVICE_MAP,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load the matching tokenizer; the EOS token doubles as the padding token and
# padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Take a look at your model: the printed module names are the candidates
# for TARGET_MODULES when configuring LoRA
model

In [None]:
# LoRA adapter setup: hyperparameters, target modules, and attachment to the model.
ADAPTER_NAME = "lora_adapter"

### BEGIN YOUR CODE ###

LORA_R = 4
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# Hint: in the model printout above, focus on the names in parentheses,
# e.g. (resid_dropout) in GPT-2.
#
# Experiment log -- alternatives are kept commented out, with the observed results.
#
# --- GPT-2 ---
# TARGET_MODULES = ["wte", "wpe", "c_attn", "c_proj", "c_fc", "lm_head"]
#     p8adam: 2-3 epochs, gibberish output; loss started at 6.5, went to 13, came back to 6
# TARGET_MODULES = ["wte", "wpe", "c_attn", "c_proj", "c_fc"]
#     loss: 1.93, 0, 0, ...
# TARGET_MODULES = ["c_attn", "c_proj", "c_fc"]
#     p8adam: gibberish; loss goes from 6 to 4ish
# TARGET_MODULES = ["lm_head"]
#     loss: 1.93, 0, ...
# Attention and linear:
# TARGET_MODULES = ["c_attn", "c_proj", "c_fc", "lm_head"]
#     loss: 1.94, 0, ...
# TARGET_MODULES = ["c_attn", "c_proj", "c_fc"]
#
# --- LLaMA ---
# TARGET_MODULES = ["k_proj", "v_proj", "q_proj", "out_proj", "fc1", "fc2"]
# First and last layers:
# TARGET_MODULES = ["embed_tokens", "lm_head"]
#
# --- OPT ---
# TARGET_MODULES = ["embed_tokens", "embed_positions", "k_proj", "v_proj", "q_proj", "out_proj", "fc1", "fc2", "lm_head"]
# TARGET_MODULES = ["embed_tokens", "lm_head"]

# Best option found:
TARGET_MODULES = ["k_proj", "v_proj", "q_proj", "out_proj", "fc1", "fc2"]

### END YOUR CODE ###

# Build the LoRA configuration from the values chosen above
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Add the LoRA adapter to your model and select it as the adapter in use
model.add_adapter(peft_config, adapter_name=ADAPTER_NAME)
model.set_adapter(ADAPTER_NAME)

In [None]:
# Now that we have applied the adapter, let's take another look at our model!
model

In [None]:
# Number of training epochs
num_train_epochs = 1

### BEGIN YOUR CODE ###

# Select hyperparameters for the optimizer
optimizer = "adamw_hf"                # Type of optimizer
max_grad_norm = 1.0                   # Maximum gradient norm (gradient clipping)
learning_rate = 2.5e-4                # Initial learning rate
weight_decay = 1e-4                   # Weight decay to apply to all layers except bias/LayerNorm weights

### END YOUR CODE ###

# Select hyperparameters for learning rate scheduler
lr_scheduler_type = "cosine"          # Learning rate schedule type
warmup_ratio = 0.03                   # Ratio of steps for a linear warmup (from 0 to learning rate)

# Etc. training configurations (adjust for your compute requirements accordingly)
fp16 = False                          # Enable fp16 training
bf16 = False                          # Enable bf16 training

# Batch size per GPU for training, keyed by checkpoint name. A lookup with a
# default replaces the former if/elif chain, which left the variable undefined
# (NameError) for any checkpoint that was not listed.
BATCH_SIZE_BY_MODEL = {
    "distilgpt2": 8,
    "NousResearch/Llama-2-7b-chat-hf": 1,
    "facebook/opt-1.3b": 2,
}
DEFAULT_BATCH_SIZE = 1                # Conservative fallback for unlisted checkpoints
per_device_train_batch_size = BATCH_SIZE_BY_MODEL.get(MODEL_NAME, DEFAULT_BATCH_SIZE)

gradient_accumulation_steps = 1       # Number of update steps to accumulate the gradients for
# NOTE(review): gradient_checkpointing is not passed to TrainingArguments below,
# so this flag currently has no effect on training.
gradient_checkpointing = True         # Enable gradient checkpointing
save_steps = 0                        # Save checkpoint every X updates steps
logging_steps = 25                    # Log every X updates steps

# Options for supervised fine-tuning with TRL
max_seq_length = 512
group_by_length = True                # Group sequences into batches with same length
packing = False                       # Pack multiple short examples in the same input sequence to increase efficiency

training_arguments = TrainingArguments(
    output_dir='.',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optimizer,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=-1,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type
)

In [None]:
# Run supervised fine-tuning with TRL's SFTTrainer, reading examples from the
# dataset's "text" field.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)

trainer.train()

In [None]:
def gpt_inference(model, tokenizer, text, text_preprocessing_fn=None):
    """
    Generate text from a single prompt with the given language model.

    The model is switched to eval mode, the (optionally preprocessed) prompt is
    tokenized, moved to the device holding the model's parameters, and passed
    to `model.generate` with at most 50 new tokens.

    Args:
        model: A model exposing `eval()`, `parameters()` and `generate()`.
        tokenizer: A tokenizer for the model.
        text (str): Input prompt to model.
        text_preprocessing_fn (optional): Function for preprocessing text string.

    Returns:
        generated_text (str): the first sequence returned by `model.generate`,
            decoded with special tokens skipped.
    """
    model.eval()

    if text_preprocessing_fn is not None:
        text = text_preprocessing_fn(text)

    generated_text = None

    ### BEGIN YOUR CODE ###

    # Derive the target device from the model itself; the notebook never
    # defines a global `device`, so relying on one raises NameError.
    device = next(model.parameters()).device

    with torch.no_grad():
        tokenized_data = tokenizer.encode(text, return_tensors="pt", truncation=True, padding=True)
        outputs = model.generate(
            tokenized_data.to(device),
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=50,
            early_stopping=True,
        )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    ### END YOUR CODE ###

    return generated_text

In [None]:
# Pad with the EOS token id when generating
tokenizer.pad_token_id = tokenizer.eos_token_id

# Prompts used to spot-check the fine-tuned model
sample_texts = [
    'Tell me about your day.',
    'Hi, how are you?',
    'We have to stop him before he blows up the village!',
    "It's a matter of life and death.",
    'We really should get going.',
]

# Print each generation followed by a separator line
for prompt in sample_texts:
    print(gpt_inference(model, tokenizer, prompt))
    print('-----------------')

In [None]:
# Directory that receives the saved weights of the trained model
adapter_dir = "my-new-dialogue-model"
trainer.model.save_pretrained(adapter_dir)

# Reload the original checkpoint without quantization, in float16, so the
# saved weights can be applied to a non-quantized model
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto',
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Load the saved adapter on top of the base model, then merge and unload it
peft_model = PeftModel.from_pretrained(base_model, adapter_dir)
model = peft_model.merge_and_unload()

# Save the merged model to the current directory
model.save_pretrained('.')