In [1]:
#IMPORTANT - to ensure package loading, first add the path of the utils folder to your system path
import os
import sys

module_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(module_dir, os.pardir, "utils")))

In [2]:
import os
import gc
import time

import tqdm
from transformers import (AutoTokenizer, DataCollatorForSeq2Seq,
                          LlamaForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig)
import pandas as pd
import torch
import numpy as np
import random
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
import wandb
from trl import SFTTrainer

from dataload_utils import load_full_dataset, load_dataset_task_prompt_mappings

  from .autonotebook import tqdm as notebook_tqdm


[2024-04-19 17:21:17,478] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
WANDB_PROJECT_NAME = "llama2_annotations_llm_comparison"
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf" #Name of the model to finetune (this script was tested on LLAMA-2 70b, LLAMA-2 13b, and OASST-LLAMA 30b)

In order to run LLAMA-2 models, you need to register yourself at the HuggingFace model page (https://huggingface.co/meta-llama/Llama-2-70b-chat-hf). Then, you can either insert the token here (not recommended if sharing a repository on GitHub), or input it in the hf_token.txt as done here and ensure it is included in the .gitignore.

In [5]:
with open(os.path.join(module_dir, "hf_token.txt"), "r") as file:
    hf_token = file.read().strip()

FileNotFoundError: [Errno 2] No such file or directory: '/scratch/mkorob/OpenSource-LLM-Practitioner-Guide/src/LLAMA/hf_token.txt'

This is an optional parameter to run if your default transformers cache location does not contain enough storage to load the LLAMA models. Otherwise, you can keep it as is.

In [None]:
#cache_location = os.environ['HF_HOME']
cache_location = "your/path/to/large/storage"

os.environ['TRANSFORMERS_CACHE'] = cache_location
os.environ['HF_HOME'] = cache_location

### Setup Arguments and Data

In [None]:
# Configuration Variables

# Type of task to run inference on
task = 1  # Choices: [1,2,3,4,5,6]

# Dataset to run inference on
dataset = 1  # Choices: [1, 2, 3, 4]

# Size of the sample to generate
sample_size = '250'  # Choices: ['50','100','250','500','1000','1500']

# Path to the directory to store the generated predictions
output_dir = '../../data'

# Path to the directory to store the models (make sure this location is included in the .gitignore if using GitHub)
model_dir = '../../data'

# Name of the model to finetune (this script was tested on LLAMA-2 70b, LLAMA-2 13b, and OASST-LLAMA 30b)
MODEL_NAME = "meta-llama/Llama-2-13b-chat-hf"

#If using LLAMA2, keep True. Set False only for OASST-LLAMA.
use_llama2_prompt = True

#This is relevant for LLAMA2, where the system and user message are separated by context tokens. You should count how many lines your user message takes (in this case, 3)
system_user_prompt_division_line = 3

# Random seed to use
seed = 2019

# Path to the directory containing the datasets
data_dir = '../../data'

# Whether to use the full label
not_use_full_labels = False

# Path to the dataset-task mappings file
dataset_task_mappings_fp = os.path.normpath(os.path.join(module_dir, '..', 'dataset_task_mappings.csv'))

#Maximum length of prompt to be taken by the model as input (check documentation for current maximum length)
max_prompt_len = 4096

# Number of epochs to train the model
n_epochs = 3

# Batch size
batch_size = 4

#Gradient accumulation steps
gradient_accumulation_steps = 2

#run name - Optional Argument if you want it to be called something else than the default way (defined below)
run_name = ""

#maximum length of sequence to produce
max_new_tokens = 100

### LoRA specific parameters
temp = 0.05
top_p = 0.75
top_k = 40
lora_config_alpha = 32
lora_config_dropout = 0.05
lora_config_r = 16
lora_config_no_target_modules = False

### Define Utility Functions

In [None]:
def tokenize(element, tokenizer, dataset_text_field, max_seq_len, batch_size):
    outputs = tokenizer(
        element[dataset_text_field],
        truncation=True,
        padding=False,
        max_length=max_seq_len,
        return_overflowing_tokens=False,
        return_length=False,
        return_tensors="pt"
    )

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        padding='longest',
        pad_to_multiple_of=batch_size
    )

    data_collator.collate_batch([outputs])

    return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]}


In [None]:
def set_all_seeds(seed: int = 123):
    # tf.random.set_seed(123)
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Set seed with the `transformers` library
    # set_seed(seed)

## Main Implementation

In [None]:
exp_name = run_name if run_name != '' else f'{MODEL_NAME}_ds_{dataset}_task_{int(task)}_sample_{sample_size}_epochs_{n_epochs}_prompt_max_len_{max_prompt_len}_batch_size_{batch_size}_grad_acc_{gradient_accumulation_steps}'
exp_name = exp_name.replace('.', '_')

# Initialize the Weights and Biases run
wandb.init(
    # set the wandb project where this run will be logged
    project=WANDB_PROJECT_NAME,
    name=exp_name,
    # track hyperparameters and run metadata
    config={
        "model": MODEL_NAME,
        "dataset": dataset,
        "task": task,
        "epochs": n_epochs,
        "max_prompt_len": max_prompt_len,
        "batch_size": batch_size,
        "gradient_accumulation_steps": gradient_accumulation_steps
    }
)

In [None]:
if not_use_full_labels:
    exp_name += '_label_abbreviation'
    labelset_col = 'labelset'
else:
    labelset_col = 'labelset_fullword'

In [None]:
print('Running exp:', exp_name)

### Load Data and the prompt

In [None]:
prompt_col = 'zero_shot_prompt'
# Load the prompt
dataset_idx, dataset_task_mappings = load_dataset_task_prompt_mappings(
    dataset_num=dataset, task_num=task, dataset_task_mappings_fp=dataset_task_mappings_fp)

# Get information specific to the dataset
label_column = dataset_task_mappings.loc[dataset_idx, "label_column"]
labelset = dataset_task_mappings.loc[dataset_idx, labelset_col].split("; ")
labelset = [label.strip() for label in labelset]
prompt = dataset_task_mappings.loc[dataset_idx, prompt_col]

# Get the system or instruction prompt and the user prompt format
system_prompt = ('\n'.join(prompt.split('\n')[:-system_user_prompt_division_line])).strip()
user_prompt_format = ('\n'.join(prompt.split('\n')[-system_user_prompt_division_line:])).strip()

# Log the system prompt and user_prompt_format as files in wandb
prompts_artifact = wandb.Artifact('prompts', type='prompts')
with prompts_artifact.new_file('system_prompt.txt', mode='w') as f:
    f.write(system_prompt)
with prompts_artifact.new_file('user_prompt_format.txt', mode='w') as f:
    f.write(user_prompt_format)
wandb.run.log_artifact(prompts_artifact)

# Load the train and eval datasets with the full prompt format
print(f'label_columns: {label_column}')
print(f'labelset: {labelset}')

datasets = load_full_dataset(
    data_dir=data_dir, dataset_num=dataset, task_num=task,
    label_column=label_column, labelset=labelset, full_label=not not_use_full_labels,
    sample_size=sample_size, system_prompt=system_prompt, user_prompt_format=user_prompt_format,
    llama_2=use_llama2_prompt, zero_shot_full_dataset=False
)

In [None]:
# Print examples from train set, eval set and evalset without completion
print(f"Train set example with completion ({len(datasets['train'])} rows): ")
print("-" * 50 + '\n')
print(datasets["train"]["text"][0])
print('\n\n')

print(f"Eval set example with completion ({len(datasets['eval'])} rows): ")
print("-" * 50 + '\n')
print(datasets["eval"]["text"][0])
print('\n\n')

print(f"Eval set without completion ({len(datasets['eval_wo_completion'])} rows): ")
print("-" * 50 + '\n')
print(datasets["eval_wo_completion"]["text"][0])
print('\n\n')

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, truncation_side="left", use_fast=False, token=hf_token, cache_dir = "../cache")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the model
use_4bit = True                         # Activate 4-bit precision base model loading
bnb_4bit_compute_dtype = "float16"      # Compute dtype for 4-bit base models
bnb_4bit_quant_type = "nf4"             # Quantization type (fp4 or nf4)
use_nested_quant = False                # Activate nested quantization for 4-bit base models (double quantization)

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

model = LlamaForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config,
                                            device_map="auto", token=hf_token, cache_dir = "../cache")

In [None]:
########################################################################
# Set LoRA configuration and add the adapters to  the model
########################################################################

# Add the adapter with the PeFT library
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=lora_config_r, lora_alpha=lora_config_alpha, lora_dropout=lora_config_dropout,
    bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"] if not lora_config_no_target_modules else None
)
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

########################################################################
# Train the model and persist it
########################################################################
path_model_dir = os.path.join(model_dir, exp_name)

# Define training args
fp16 = False
bf16 = True  # set bf16 to True with an A100
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=n_epochs,
    load_best_model_at_end=False,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy="epoch",
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=fp16,
    bf16=bf16,
    report_to=["wandb"]
)

# Create Trainer instance
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    max_seq_length=max_prompt_len,
    train_dataset=datasets["train"],
    dataset_text_field="text",
)

print("Training the model...")
trainer.train()
trainer.save_model(path_model_dir)

########################################################################
# Merge LoRA weights with the model
########################################################################
# Free up memory to reload the model and merge the weight
del trainer
del model

# Run garbage collector and empty cache manually
torch.cuda.empty_cache()
gc.collect()
time.sleep(10)

print("Merging LoRA weights...")

# Load base LLM model
llama_model = LlamaForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto',
    token=hf_token,
    cache_dir = "../cache"
)

# It needs enough memory to load it in 16 bit, a little over 80GB, so more than 1 A100
model = PeftModel.from_pretrained(
    llama_model,
    path_model_dir,
    torch_dtype=torch.float16,
)

# Merge LoRA and base model
merged_model_path = os.path.join(path_model_dir, "merged_model")
print(f"saving model merged with adapters in: {merged_model_path}")
model = model.merge_and_unload()
model.save_pretrained(merged_model_path, safe_serialization=True)

# Free up memory
del llama_model
del model
torch.cuda.empty_cache()
gc.collect()
time.sleep(10)

# Reload the model in its quantized version
model = LlamaForCausalLM.from_pretrained(
    merged_model_path, quantization_config=bnb_config, device_map="auto", token=hf_token)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, truncation_side="left", use_fast=False, token=hf_token)
# tokenizer.pad_token = tokenizer.unk_token         # Uncomment if using UNK as padding token for batched inference
# tokenizer.padding_side = "left"                   # Uncomment for batched inference

# Default params from alpaca-lora generate script (commonly used)
generation_config = GenerationConfig(
    temperature=temp,
    top_p=top_p,
    top_k=top_k,
    do_sample=True,
    max_new_tokens=max_new_tokens
)

with torch.no_grad():
    predictions_out = []
    for i, input_text_i in tqdm.tqdm(enumerate(datasets["eval_wo_completion"]["text"])):
        # Tokenize the text
        tokenized_text_i = tokenizer(
            text_target=input_text_i,
            padding=False,
            max_length=max_prompt_len,
            truncation=True,
            return_tensors="pt"
        )

        # Generate the completions
        outputs = model.generate(
            input_ids=tokenized_text_i["input_ids"].cuda(),
            generation_config=generation_config
        )

        generated_text_minibatch = tokenizer.batch_decode(
            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )

        predictions_out += generated_text_minibatch

        if i == 0:
            print("Sample prediction: ")
            print(predictions_out[0])

In [None]:
predictions_dir = os.path.join(output_dir, 'predictions', os.path.basename(MODEL_NAME),
                                exp_name.replace("/", "_"))
os.makedirs(predictions_dir, exist_ok=True)
file_name = f'ds_{dataset}__task_{task}_eval_set.csv'
print(f'The filename that will be used is: {file_name}')
eval_df = pd.read_csv(os.path.join(data_dir, file_name))
eval_df['prediction_ds'] = predictions_out

In [None]:
eval_df.to_csv(os.path.join(
    predictions_dir,
    f'ds_{dataset}__task_{task}__sample_size_{sample_size}_eval_set.csv'))

### Terminate WandB

In [None]:
wandb.finish()