<a href="https://colab.research.google.com/github/minji-yoo-accounting/llm-trust-issue/blob/main/Finetuning_generative_models_(setting_5)ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset Preparation

In [None]:
import pandas as pd

# Read the labelled sentences from Drive and name the two columns in one step:
# the text becomes `prompt`, the sentiment label becomes `completion`.
df = pd.read_csv('/content/drive/MyDrive/llm_trust/bank.csv').set_axis(['prompt','completion'], axis=1)
df.head()

Unnamed: 0,prompt,completion
0,"According to Gran , the company has no plans t...",neutral
1,With the new production plant the company woul...,positive
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,"In the third quarter of 2010 , net sales incre...",positive
4,Operating profit rose to EUR 13.1 mn from EUR ...,positive


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    df['prompt'], df['completion'], test_size=0.2, random_state=42
)

# Take 25% of the remaining 80% as validation (20% of the full data),
# which leaves 60% of the full data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)

# Report how many rows ended up in each split.
for split_name, split in (("Training", X_train), ("Validation", X_val), ("Testing", X_test)):
    print(f"{split_name} set size: {split.shape[0]}")


Training set size: 2071
Validation set size: 691
Testing set size: 691


In [None]:
# Re-assemble each (text, label) pair of Series into a two-column DataFrame.
df_train, df_val, df_test = (
    pd.DataFrame({'prompt': texts, 'completion': labels})
    for texts, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
# Class balance of the training split before any down-sampling.
df_train.value_counts('completion')

completion
neutral     1300
positive     513
negative     258
Name: count, dtype: int64

In [None]:
# Down-sample the majority class (neutral) of the training split to 500 rows;
# the positive and negative rows are kept in full.
neutral = df_train[df_train['completion'] == 'neutral'].sample(n=500, random_state=42)
positive = df_train[df_train['completion'] == 'positive']
negative = df_train[df_train['completion'] == 'negative']

# Shuffle with a fixed seed so the row order (and therefore the exported
# training files) is identical on every run.
df_train = (
    pd.concat([neutral, positive, negative])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_train

Unnamed: 0,prompt,completion
0,"After the acquisition , Basware 's preliminary...",neutral
1,We are very pleased with the fine co-operation...,positive
2,We went to the market with yield guidance of t...,neutral
3,In the Baltic states the company reports net s...,negative
4,The respondents praised Finnair 's reliability...,positive
...,...,...
1266,"Profitability ( EBIT % ) was 13.6 % , compared...",negative
1267,"Thus , SysOpen Digia has , in accordance with ...",neutral
1268,This resulted in improved sales figures in Swe...,positive
1269,Net sales surged by 18.5 % to EUR167 .8 m. Tel...,positive


In [None]:
# Class balance of the training split after down-sampling the neutral class.
df_train.value_counts('completion')

completion
positive    513
neutral     500
negative    258
Name: count, dtype: int64

In [None]:
# Down-sample the majority class (neutral) of the validation split to 180 rows;
# the positive and negative rows are kept in full.
neutral = df_val[df_val['completion'] == 'neutral'].sample(n=180, random_state=42)
positive = df_val[df_val['completion'] == 'positive']
negative = df_val[df_val['completion'] == 'negative']

# Shuffle with a fixed seed so the row order (and therefore the exported
# validation files) is identical on every run.
df_val = (
    pd.concat([neutral, positive, negative])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df_val

Unnamed: 0,prompt,completion
0,Protalix closed at $ 10.71 on Friday on the Am...,neutral
1,The availability of the Internet services is h...,neutral
2,Rivals say Qualcomm has fewer patents on 3G ph...,negative
3,"The Baltimore Police and Fire Pension , which ...",negative
4,"For the first nine months of 2010 , the compan...",positive
...,...,...
453,"( ADP News ) - Dec 11 , 2008 - Finnish constru...",positive
454,The Insolvency Act regulates the amount of deb...,neutral
455,"Furthermore , our fully electrically driven cr...",positive
456,Cost savings will then rise to some 20 mln eur...,positive


In [None]:
# Class balance of the validation split after down-sampling the neutral class.
df_val.value_counts('completion')

completion
positive    188
neutral     180
negative     90
Name: count, dtype: int64

In [None]:
# NOTE(review): this repeats the previous cell. The output recorded for it
# (413 neutral rows) differs from the one recorded above (180 neutral rows),
# so it was presumably produced before the down-sampling cell ran.
df_val.value_counts('completion')

completion
neutral     413
positive    188
negative     90
Name: count, dtype: int64

In [None]:
# Class balance of the held-out test split (no down-sampling is applied to it above).
df_test.value_counts('completion')

completion
neutral     433
positive    186
negative     72
Name: count, dtype: int64

In [None]:
import json

def convert_to_new_format(old_data):
    """
    Turn a prompt/completion DataFrame into chat-style fine-tuning records.

    :param old_data: DataFrame with a "prompt" and a "completion" column
    :return: List with one dict per row, each of the form {"messages": [...]}
             holding a fixed system question, the row's prompt as the user
             turn and the row's completion as the assistant turn
    """
    return [
        {
            "messages": [
                {"role": "system", "content": 'Would the news positively, negatively, or neutrally influence the stock price?'},
                {"role": "user", "content": record["prompt"]},
                {"role": "assistant", "content": record["completion"]},
            ]
        }
        for _, record in old_data.iterrows()
    ]

In [None]:
# Export every split to its own JSONL file in a single run. Previously only
# one split was written per execution (the others had to be un-commented by
# hand), while the copy cell below expects all three files to exist.
jsonl_exports = {
    'train_gpt_trust_issue.jsonl': df_train,
    'val_gpt_trust_issue.jsonl': df_val,
    'test_gpt_trust_issue.jsonl': df_test,
}

for file_path, split_frame in jsonl_exports.items():
    # Convert the split to the chat "messages" format
    converted_data = convert_to_new_format(split_frame)

    # One JSON document per line (JSONL)
    with open(file_path, 'w') as file:
        for record in converted_data:
            file.write(json.dumps(record) + '\n')



In [None]:
# Copy the three exported JSONL files from the working directory to the Drive project folder.
# All three files must already exist in the working directory for these copies to succeed.
!cp train_gpt_trust_issue.jsonl '/content/drive/MyDrive/llm_trust'
!cp val_gpt_trust_issue.jsonl '/content/drive/MyDrive/llm_trust'
!cp test_gpt_trust_issue.jsonl '/content/drive/MyDrive/llm_trust'

In [None]:
# Shared pieces of the Llama-2 instruction format: the column mapping and the
# fixed instruction text that is attached to every row.
LLAMA2_COLUMNS = {'prompt':'input','completion':'output'}
LLAMA2_INSTRUCTION = 'Would the news positively, negatively, or neutrally influence the stock price?'

# Each split is renamed to input/output, gets the instruction column appended,
# and is written to Drive as CSV without the index.
df_train = df_train.rename(columns=LLAMA2_COLUMNS).assign(instruction=LLAMA2_INSTRUCTION)
df_train.to_csv('/content/drive/MyDrive/llm_trust/train_llama2_trust_issue.csv', sep=',', index=False)

df_val = df_val.rename(columns=LLAMA2_COLUMNS).assign(instruction=LLAMA2_INSTRUCTION)
df_val.to_csv('/content/drive/MyDrive/llm_trust/val_llama2_trust_issue.csv', sep=',', index=False)

df_test = df_test.rename(columns=LLAMA2_COLUMNS).assign(instruction=LLAMA2_INSTRUCTION)
df_test.to_csv('/content/drive/MyDrive/llm_trust/test_llama2_trust_issue.csv', sep=',', index=False)

# Finetuning GPT 3.5 turbo

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): the version is not pinned, so re-runs may pick up a newer client.
%pip install -Uq openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/337.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.0/337.0 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from getpass import getpass
from pathlib import Path
from tqdm import tqdm
import os
from openai import OpenAI

# Keep the key out of the notebook: reuse OPENAI_API_KEY when it is already
# set, otherwise ask for it interactively. The previous version overwrote the
# variable with a placeholder string, clobbering any real key in the environment.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)


Upload the training and validation files to OpenAI.

In [None]:
# Upload the training JSONL for fine-tuning. The returned FileObject is only
# displayed, not stored; its `id` is what the fine-tuning job cell below
# refers to as `training_file`.
client.files.create(
    file=Path("/content/drive/MyDrive/llm_trust/train_gpt_trust_issue.jsonl"),
    purpose="fine-tune",
)


FileObject(id='file-1b3dF7K4HkwuGoewPW7krV1I', bytes=422006, created_at=1722116385, filename='train_gpt_trust_issue.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
# Upload the validation JSONL for fine-tuning. The returned FileObject is only
# displayed, not stored; its `id` is what the fine-tuning job cell below
# refers to as `validation_file`.
client.files.create(
    file=Path("/content/drive/MyDrive/llm_trust/val_gpt_trust_issue.jsonl"),
    purpose="fine-tune",
)

FileObject(id='file-SqiUIi9jaWiowp7hq3oADuUY', bytes=152282, created_at=1722116387, filename='val_gpt_trust_issue.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
import openai

# Ids of the files uploaded in the two cells above.
# NOTE(review): these are copied from the recorded outputs of an earlier run;
# re-running the upload cells returns new ids that must be pasted here.
TRAINING_FILE_ID = "file-1b3dF7K4HkwuGoewPW7krV1I"
VALIDATION_FILE_ID = "file-SqiUIi9jaWiowp7hq3oADuUY"

try:
    job = client.fine_tuning.jobs.create(
        model="gpt-3.5-turbo-1106",
        training_file=TRAINING_FILE_ID,
        validation_file=VALIDATION_FILE_ID
    )
    # Show the job id so the run can be looked up later; the job object was
    # previously discarded and nothing was displayed on success.
    print(f"Created fine-tuning job {job.id} (status: {job.status})")
except openai.APIConnectionError as e:
    print("The server could not be reached")
    print(e.__cause__)  # an underlying Exception, likely raised within httpx.
except openai.RateLimitError as e:
    print("A 429 status code was received; we should back off a bit.")
except openai.APIStatusError as e:
    print("Another non-200-range status code was received")
    print(e.status_code)
    print(e.response)

In [None]:
# now you can check the status on openai website

# Finetuning Llama2

**This part of the script builds on the guide by [Kshitiz Sahay](https://www.linkedin.com/in/k-kshitiz26/)**


**Part 1: Setting up and Preparing for Fine-Tuning**
1. Installing and loading the required modules
2. Steps to get approval for Meta's Llama 2 family of models
3. Setting up Hugging Face CLI and user authentication
4. Loading a pre-trained model and its associated tokenizer
5. Loading the training dataset
6. Preprocessing the training dataset for model fine-tuning

**Part 2: Fine-Tuning and Open-Sourcing**
1. Configuring PEFT (Parameter Efficient Fine-Tuning) method QLoRA for efficient fine-tuning
2. Fine-tuning the pre-trained model
3. Saving the fine-tuned model and its associated tokenizer
4. Pushing the fine-tuned model to the Hugging Face Hub for public usage


### Installing Required Libraries


`transformers`: for loading a large language model and fine-tuning it.

`bitsandbytes`: for loading the model in 4-bit precision.

`accelerate`: for training models and performing inference at scale.

`peft`: for fine-tuning a small number of parameters.

`trl`: for training transformer language models using Reinforcement Learning.


In [None]:
# Pinned versions of the fine-tuning stack. %pip (rather than !pip) installs
# into the environment of the running kernel.
%pip install -q accelerate==0.21.0 --progress-bar off
%pip install -q peft==0.4.0 --progress-bar off
%pip install -q bitsandbytes==0.40.2 --progress-bar off
%pip install -q transformers==4.31.0 --progress-bar off
%pip install -q trl==0.4.7 --progress-bar off

In [None]:
# Upgrade pip itself in the kernel's environment.
# NOTE(review): this runs after the pinned installs above, so it does not affect them.
%pip install --upgrade pip



### Loading Required Libraries

Next, we will load the required libraries for fine-tuning a Large Language Model (LLM) like Llama 2. We will look at each imported class in greater detail in subsequent sections.

In [None]:
# Install scipy into the kernel's environment.
# NOTE(review): the version is not pinned, unlike the other fine-tuning dependencies.
%pip install scipy



In [None]:
import os
from random import randrange
from functools import partial
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          HfArgumentParser,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForLanguageModeling,
                          EarlyStoppingCallback,
                          pipeline,
                          logging,
                          set_seed)

import bitsandbytes as bnb
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer


In [None]:
torch.cuda.empty_cache()

# Pick the device and report it (the runs recorded in this notebook used an A100).
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    # Define `device` on CPU-only runtimes too, so later references to it do
    # not fail with a NameError, and make the missing GPU visible.
    device = torch.device("cpu")

    print('No GPU available; falling back to the CPU.')


There are 1 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-40GB


In [None]:
import csv
import pandas as pd

def check_gpu_mem():
    '''
    Uses Nvidia's SMI tool to check the current GPU memory usage.
    Reported values are in "MiB". 1 MiB = 2^20 bytes = 1,048,576 bytes.

    :return: pandas DataFrame with one row per line reported by nvidia-smi
             and the first output line used as the column headers
    '''

    # Run the command line tool and parse its CSV output. The `with` block
    # closes the pipe once all rows are read (it was previously left open).
    with os.popen('nvidia-smi --query-gpu=memory.total,memory.used --format=csv') as buf:
        rows = list(csv.reader(buf, delimiter=','))

    # Use a pandas table just for nice formatting.
    df = pd.DataFrame(rows)

    # Promote the first row to column headers and keep the remaining rows.
    new_header = df.iloc[0]
    df = df[1:]
    df.columns = new_header

    return df


In [None]:
# Display total and used GPU memory before the model is loaded.
check_gpu_mem()

Unnamed: 0,memory.total [MiB],memory.used [MiB]
1,40960 MiB,5 MiB


### Hugging Face Hub Login


In [None]:
# Log in to the Hugging Face Hub; `hf_your_token` is a placeholder to replace with a real access token.
# NOTE(review): a token typed into this command is saved with the notebook - remove it before sharing.
!huggingface-cli login --token hf_your_token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Creating Bitsandbytes Configuration

Quantization is a technique used to compress deep learning models by reducing the number of bits used to represent their weights and activations. This compression allows for faster inference and reduced memory consumption, making it possible to deploy these models on edge devices with limited resources.

Parameters:

`load_in_4bit`: Load the model in 4-bit precision, i.e., divide memory usage by 4.

`bnb_4bit_use_double_quant`: Use nested quantization techniques for more memory-efficient inference at no additional cost.

`bnb_4bit_quant_type`: Set quantization data type. The options are either FP4 (4-bit precision), which is the default quantization data type, or NF4 (Normal Float 4), a new 4-bit data type adapted for weights that have been initialized using a normal distribution.

`bnb_4bit_compute_dtype`: Set the computational data type for 4-bit models. Default value: torch.float32

In [None]:
def create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype):
    """
    Builds the bitsandbytes quantization configuration used when loading the model

    :param load_in_4bit: Load model in 4-bit precision mode
    :param bnb_4bit_use_double_quant: Nested quantization for 4-bit model
    :param bnb_4bit_quant_type: Quantization data type for 4-bit model
    :param bnb_4bit_compute_dtype: Computation data type for 4-bit model
    :return: BitsAndBytesConfig populated with the four settings above
    """

    # The arguments are forwarded unchanged under the same keyword names
    return BitsAndBytesConfig(
        load_in_4bit = load_in_4bit,
        bnb_4bit_use_double_quant = bnb_4bit_use_double_quant,
        bnb_4bit_quant_type = bnb_4bit_quant_type,
        bnb_4bit_compute_dtype = bnb_4bit_compute_dtype,
    )

### Loading Hugging Face Model and Tokenizer



In [None]:
def load_model(model_name, bnb_config):
    """
    Loads the quantized causal language model and its tokenizer

    :param model_name: Hugging Face model name
    :param bnb_config: Bitsandbytes configuration
    :return: Tuple (model, tokenizer)
    """
    # Load the model with a language-modeling head. The notebook fine-tunes a
    # generative model on prompt/response text, and AutoModelForCausalLM is the
    # class the notebook imports; the sequence-classification class used here
    # before was never imported and was configured for only two labels.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config = bnb_config,
        device_map = "auto",
        offload_folder = "offload",
        trust_remote_code = True
    )
    # For Llama 2, reuse the EOS token id as the padding token id in the model config
    model.config.pad_token_id = model.config.eos_token_id

    # Load model tokenizer with the user authentication token
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token = True)

    # Set padding token as EOS token
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return model, tokenizer

### Initializing Transformers and Bitsandbytes Parameters


In [None]:
################################################################################
# transformers parameters
################################################################################

# The pre-trained model from the Hugging Face Hub to load and fine-tune
model_name = "meta-llama/Llama-2-7b-hf"

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
load_in_4bit = True

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute data type for 4-bit base models
# (bfloat16 here; the runs recorded in this notebook used an A100 GPU)
bnb_4bit_compute_dtype = torch.bfloat16

Finally, we will call the above functions to get `model` and `tokenizer` objects.

In [None]:
# Build the bitsandbytes configuration from the parameters defined above,
# then load the model and tokenizer from the Hugging Face Hub with it

bnb_config = create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype)

model, tokenizer = load_model(model_name, bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Loading Dataset


In [None]:
# Load dataset

# Read both CSV exports. A CSV loaded without named splits lands in a single
# split called "train", so request that split directly to get the Dataset.
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept because the preprocessing cell further down refers to it.
train = load_dataset("csv", data_files = "/content/drive/MyDrive/llm_trust/train_llama2_trust_issue.csv", split = "train")
eval = load_dataset("csv", data_files = "/content/drive/MyDrive/llm_trust/val_llama2_trust_issue.csv", split = "train")


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
def create_prompt_formats(sample):
    """
    Builds the training prompt for one sample and stores it under the "text" key

    The prompt consists of an instruction section, an optional input section
    (left out when the sample's "input" is empty), a response section and an
    end marker, separated by blank lines.

    :param sample: Dict-like sample with "instruction", "input" and "output" keys
    :return: The same sample object, with the formatted prompt added as "text"
    """

    # Instruction section is always present
    sections = [f"### Instruction:\n{sample['instruction']}"]

    # Input section only when there is input text
    if sample["input"]:
        sections.append(f"Input:\n{sample['input']}")

    # Response section followed by the end marker
    sections.append(f"### Response:\n{sample['output']}")
    sections.append("### End")

    # Sections are separated by one blank line
    sample["text"] = "\n\n".join(sections)

    return sample

In [None]:
# Preview the prompt template on one randomly chosen training sample.
create_prompt_formats(train[randrange(len(train))])

{'input': 'The proportion of Estonian and Lithuanian passengers on the Tallinn-Helsinki route also grew in July .',
 'output': 'positive',
 'instruction': 'Would the news positively, negatively, or neutrally influence the stock price?',
 'text': '### Instruction:\nWould the news positively, negatively, or neutrally influence the stock price?\n\nInput:\nThe proportion of Estonian and Lithuanian passengers on the Tallinn-Helsinki route also grew in July .\n\n### Response:\npositive\n\n### End'}

### Tokenizing Dataset Batch

The user-defined `preprocess_batch` function tokenizes a batch of the input dataset (`batch`) using the `tokenizer` object. The `max_length` parameter sets the maximum sequence length, and `truncation = True` truncates any input longer than that. No `padding` argument is passed, so sequences are not padded at this stage. This function is called by the `preprocess_dataset` function defined below.

In [None]:
def get_max_length(model):
    """
    Looks up the maximum sequence length in the model configuration

    The attributes "n_positions", "max_position_embeddings" and "seq_length"
    are tried in that order; the first non-empty value is used. If none is
    set, 1024 is returned as the default.

    :param model: Hugging Face model (only its `config` attribute is read)
    :return: Maximum sequence length
    """

    config = model.config

    # Return the first length setting that is present and non-zero
    for setting in ("n_positions", "max_position_embeddings", "seq_length"):
        found = getattr(config, setting, None)
        if found:
            print(f"Found max lenth: {found}")
            return found

    # None of the settings was found: fall back to the default
    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback

In [None]:
def preprocess_batch(batch, tokenizer, max_length):
    """
    Runs the tokenizer over the "text" entries of a dataset batch

    :param batch: Dataset batch with a "text" entry
    :param tokenizer: Model tokenizer (called with the texts)
    :param max_length: Maximum number of tokens to emit from the tokenizer
    :return: Whatever the tokenizer returns for the batch
    """

    texts = batch["text"]

    # Truncate to max_length; no padding argument is passed
    encoded = tokenizer(texts, max_length = max_length, truncation = True)

    return encoded

In [None]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: "Dataset") -> "Dataset":
    """
    Formats, tokenizes, filters and shuffles a dataset for fine-tuning

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Random seed for reproducibility of the shuffle
    :param dataset: Instruction dataset object (not a string); it must provide
                    `map`, `filter` and `shuffle` and contain the
                    "instruction", "input" and "output" columns
    :return: The processed dataset, without the four text columns removed below
    """

    # Add the formatted prompt ("text") to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize in batches and remove the "instruction", "input", "output", and "text" fields
    _preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched = True,
        remove_columns = ["instruction", "input", "output", "text"],
    )

    # Keep only samples strictly shorter than "max_length". Tokenization already
    # truncates to "max_length", so this drops samples that reached that limit.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed = seed)

    return dataset

### Preprocessing Dataset

To preprocess the complete dataset for fine-tuning, we use the `preprocess_dataset` function defined above, which performs the following operations:

1. Create the formatted prompts against each prompt in the instruction dataset using the `create_prompt_formats` function.
2. Tokenize the dataset in batches using the `preprocess_batch` function and removing the original dictionary keys (instruction, input, output, and text).
3. Filter out prompts with input token sizes exceeding the maximum length.
4. Shuffle the dataset using a random seed.

In [None]:
# Random seed used for shuffling both datasets
seed = 42
# Maximum sequence length, read from the loaded model's configuration
max_length = get_max_length(model)
# NOTE(review): the progress output recorded below shows 2071 and 691 rows,
# i.e. the split sizes before the neutral class was down-sampled - confirm the
# CSV files on Drive were regenerated after down-sampling.
train_dataset = preprocess_dataset(tokenizer, max_length, seed, train)
eval_dataset = preprocess_dataset(tokenizer, max_length, seed, eval)

Found max lenth: 4096
Preprocessing dataset...


Map:   0%|          | 0/2071 [00:00<?, ? examples/s]

Map:   0%|          | 0/2071 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2071 [00:00<?, ? examples/s]

Preprocessing dataset...


Map:   0%|          | 0/691 [00:00<?, ? examples/s]

Map:   0%|          | 0/691 [00:00<?, ? examples/s]

Filter:   0%|          | 0/691 [00:00<?, ? examples/s]

We can now look at the preprocessed dataset, which contains tokens or IDs.

In [None]:
# Summary of the tokenized validation set: remaining features and row count.
print(eval_dataset)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 691
})


In [None]:
# First tokenized validation sample: token ids and the matching attention mask.
print(eval_dataset[0])

{'input_ids': [1, 835, 2799, 4080, 29901, 13, 29956, 483, 278, 9763, 13686, 3598, 29892, 3480, 6703, 29892, 470, 11553, 29878, 635, 9949, 278, 10961, 8666, 29973, 13, 13, 4290, 29901, 13, 1576, 373, 17696, 2060, 988, 323, 1416, 433, 28771, 1973, 338, 1641, 1304, 338, 278, 478, 18321, 1222, 6335, 654, 11319, 1641, 8906, 491, 512, 1655, 295, 10863, 414, 349, 21908, 19806, 29899, 2687, 29963, 29934, 6154, 512, 14867, 5313, 1973, 669, 8010, 29879, 19806, 669, 315, 1367, 3217, 869, 13, 13, 2277, 29937, 13291, 29901, 13, 17821, 1705, 13, 13, 2277, 29937, 2796], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


### Creating PEFT Configuration


Fine-tuning pretrained LLMs on downstream datasets results in huge performance gains when compared to using the pretrained LLMs out-of-the-box. However, as models get larger and larger, full fine-tuning becomes infeasible to train on consumer hardware. In addition, storing and deploying fine-tuned models independently for each downstream task becomes very expensive, because fine-tuned models are the same size as the original pretrained model. Parameter-Efficient Fine-tuning (PEFT) approaches are meant to address both problems!


PEFT approaches only fine-tune a small number of (extra) model parameters while freezing most parameters of the pretrained LLMs, thereby greatly decreasing the computational and storage costs. It also helps in portability, wherein users can tune models using PEFT methods to get tiny checkpoints worth a few MB compared to the large checkpoints of full fine-tuning.


**In short, PEFT approaches enable you to get performance comparable to full fine-tuning while only having a small number of trainable parameters.**


Hugging Face provides the PEFT library, which provides the latest Parameter-Efficient Fine-tuning techniques seamlessly integrated with Hugging Face Transformers and Hugging Face Accelerate.


There are several PEFT methods. In the next cell, we will use QLoRA, one of the latest methods that reduces the memory usage of LLM finetuning without performance tradeoffs, using the `LoraConfig` class from the `peft` library.


QLoRA uses 4-bit quantization to compress a pretrained language model. The LM parameters are then frozen, and a relatively small number of trainable parameters are added to the model in the form of Low-Rank Adapters. During finetuning, QLoRA backpropagates gradients through the frozen 4-bit quantized pretrained language model into the Low-Rank Adapters. The LoRA layers are the only parameters being updated during training.

In [None]:
def create_peft_config(r, lora_alpha, target_modules, lora_dropout, bias, task_type):
    """
    Builds the LoRA (Parameter-Efficient Fine-Tuning) configuration for the model

    :param r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param target_modules: Names of the modules to apply LoRA to
    :param lora_dropout: Dropout Probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    :param task_type: Task type forwarded to LoraConfig
    :return: LoraConfig populated with the settings above
    """
    # The arguments are forwarded unchanged under the same keyword names
    lora_settings = dict(
        r = r,
        lora_alpha = lora_alpha,
        target_modules = target_modules,
        lora_dropout = lora_dropout,
        bias = bias,
        task_type = task_type,
    )

    return LoraConfig(**lora_settings)

### Fine-tuning the Pre-trained Model

We will create `fine_tune`, our final function, to wrap everything we have done so far and initiate the fine-tuning process. This function will perform the following model preprocessing operations to prepare it for training:


1. Enable gradient checkpointing to reduce memory usage during fine-tuning.
2. Use the `prepare_model_for_kbit_training` function from PEFT to prepare the model for fine-tuning.
3. Call `find_all_linear_names` to get the module names to apply LoRA to.
4. Create LoRA configuration by calling the `create_peft_config` function.
5. Wrap the base Hugging Face model for fine-tuning to PEFT using the `get_peft_model` function.
6. Print the trainable parameters.


For training, we will instantiate a `Trainer()` object within the `fine_tune` function. This class requires the model, preprocessed dataset, and training arguments, listed below.


`per_device_train_batch_size`: The batch size per GPU/TPU/CPU for training.


`gradient_accumulation_steps`: Number of update steps to accumulate the gradients for, before performing a backward/update pass.


`warmup_steps`: Number of steps used for a linear warmup from 0 to `learning_rate`.


`max_steps`: If set to a positive number, the total number of training steps to perform.


`learning_rate`: The initial learning rate for Adam.


`bf16`: Whether to use bfloat16 (mixed) precision training instead of 32-bit training. Set it to true when training on an A100 GPU.


`logging_steps`: Number of update steps between two logs.


`output_dir`: The output directory where the model predictions and checkpoints will be written.


`optim`: The optimizer to use for training.


Next, we will use the `train` method on the `trainer` object to start the training and log and save the model metrics on the training dataset. Finally, we will save the model checkpoint (model weights, configuration file, and tokenizer) in the output directory and delete the model to free up memory. You can load the model for inference later using its saved checkpoint.

In [None]:
def find_all_linear_names(model):
    """
    Collects the names of the 4-bit linear layers that LoRA should be applied to

    :param model: Model whose `named_modules()` are inspected
    :return: List of unique layer names (last component of each qualified
             module name), with 'lm_head' left out
    """

    linear_4bit = bnb.nn.Linear4bit

    # Last dotted component of every module that is a 4-bit linear layer
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, linear_4bit)
    }

    # 'lm_head' is never returned as a LoRA target
    leaf_names.discard('lm_head')

    target_names = list(leaf_names)
    print(f"LoRA module names: {target_names}")
    return target_names

In [None]:
def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the total and trainable parameter counts of the model.

    :param model: PEFT model (anything exposing `named_parameters()`)
    :param use_4bit: When true, the trainable count is halved before printing
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        # Parameters reporting zero elements may carry their size in `ds_numel`
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the count an int: true division produced a
        # float, which the ",d" format below rejects with a ValueError.
        trainable_params //= 2

    # Avoid dividing by zero for a model without parameters
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_percent}"
    )

In [None]:
import os
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_utils import IntervalStrategy

def fine_tune(model,
              tokenizer,
              train_dataset,
              eval_dataset,
              lora_r,
              lora_alpha,
              lora_dropout,
              bias,
              task_type,
              per_device_train_batch_size,
              gradient_accumulation_steps,
              warmup_steps,
              max_steps,
              learning_rate,
              logging_steps,
              output_dir,
              optim,
              bf16 #if A100 set true
              ):
    """
    Prepares and fine-tune the pre-trained model.

    Wraps the model with a LoRA/PEFT configuration, trains it with a Hugging
    Face ``Trainer`` (evaluating and checkpointing every ``logging_steps``
    steps, with early stopping), runs a final evaluation, and saves the
    resulting model to ``output_dir``.

    NOTE(review): ``prepare_model_for_kbit_training``, ``create_peft_config``,
    ``get_peft_model`` and ``EarlyStoppingCallback`` are not imported in this
    cell; they must already be in scope from earlier cells.

    :param model: Pre-trained Hugging Face model
    :param tokenizer: Model tokenizer
    :param train_dataset: Preprocessed training dataset
    :param eval_dataset: Preprocessed validation dataset
    :param lora_r: LoRA rank, forwarded to ``create_peft_config``
    :param lora_alpha: LoRA alpha, forwarded to ``create_peft_config``
    :param lora_dropout: LoRA dropout, forwarded to ``create_peft_config``
    :param bias: LoRA bias setting, forwarded to ``create_peft_config``
    :param task_type: PEFT task type, forwarded to ``create_peft_config``
    :param per_device_train_batch_size: Forwarded to ``TrainingArguments``
    :param gradient_accumulation_steps: Forwarded to ``TrainingArguments``
    :param warmup_steps: Forwarded to ``TrainingArguments``
    :param max_steps: Forwarded to ``TrainingArguments``
    :param learning_rate: Forwarded to ``TrainingArguments``
    :param logging_steps: Used as the logging, evaluation and checkpoint-save
        interval (``logging_steps``, ``eval_steps`` and ``save_steps``)
    :param output_dir: Directory for checkpoints and the final saved model;
        logs go to ``<output_dir>/logs``
    :param optim: Optimizer name, forwarded to ``TrainingArguments``
    :param bf16: Whether to enable BF16 training, forwarded to ``TrainingArguments``
    :return: None
    """

    # Enable gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # Prepare the model for training
    model = prepare_model_for_kbit_training(model)

    # Get LoRA module names
    target_modules = find_all_linear_names(model)

    # Create PEFT configuration for these modules and wrap the model to PEFT
    peft_config = create_peft_config(lora_r, lora_alpha, target_modules, lora_dropout, bias, task_type)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    # (called with the default use_4bit=False, so the count is not halved)
    print_trainable_parameters(model)

    # Training parameters
    training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,
        max_steps=max_steps,
        learning_rate=learning_rate,
        bf16=bf16,  # Enable BF16 training
        logging_steps=logging_steps,
        output_dir=output_dir,
        optim=optim,
        evaluation_strategy=IntervalStrategy.STEPS,  # Evaluate every logging_steps
        save_steps=logging_steps,
        eval_steps=logging_steps,
        logging_dir=f'{output_dir}/logs',
        save_total_limit=2,  # Save only the 2 most recent checkpoints
        load_best_model_at_end=True,  # Load the best model at the end of training
        metric_for_best_model="eval_loss",  # Use validation loss to determine the best model
        greater_is_better=False,  # Lower validation loss is better
        lr_scheduler_type="reduce_lr_on_plateau",  # Reduce learning rate on plateau
    )

    # Causal-LM collator (mlm=False); training stops early after 3
    # evaluations without improvement (early_stopping_patience=3).
    trainer = Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Early stopping
    )

    # Disable the generation KV cache on the model config for training
    model.config.use_cache = False

    # Always True here; the flag only exists so training can be toggled off by hand
    do_train = True

    # Launch training and log metrics
    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    # Perform validation
    print("Validating...")

    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)
    trainer.save_metrics("eval", eval_metrics)
    print(eval_metrics)

    # Save model
    # NOTE(review): with load_best_model_at_end=True the model saved here is
    # presumably the best checkpoint rather than the last one, despite the
    # message below -- confirm.
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    # NOTE(review): `del` only drops this function's local references; any
    # reference the caller still holds to the model keeps it alive -- verify
    # that memory is actually released.
    del model
    del trainer
    torch.cuda.empty_cache()


Initializing QLoRA and TrainingArguments parameters below for training.

In [None]:
################################################################################
# QLoRA parameters
################################################################################

# Rank of the LoRA update matrices (the LoRA attention dimension).
# Multiples of 8 are preferred for memory efficiency. A larger rank means more
# trainable parameters and more GPU RAM, which is the binding constraint in a
# single-GPU environment (an NVIDIA A100 40G is the best available here).
lora_r = 16

# LoRA scaling factor (alpha).
# A larger alpha puts more weight on what is learned during fine-tuning
# relative to what the model already knows from pre-training. It does not
# affect GPU RAM and need not be a multiple of 8 or of lora_r.
lora_alpha = 256

# Dropout probability applied inside the LoRA layers
lora_dropout = 0.05

# Bias handling for LoRA
bias = "none"

# PEFT task type
task_type = "CAUSAL_LM"

In [None]:
################################################################################
# TrainingArguments parameters
################################################################################
# Hyperparameter choices follow the QLoRA experiments reported at:
# https://lightning.ai/pages/community/lora-insights/

# Where model predictions and checkpoints are written
output_dir = "/content/drive/MyDrive/llm_trust/llama2_finetuned"

# Effective batch size (bounded by GPU memory)
#   = per_device_train_batch_size x gradient_accumulation_steps

# Per-GPU training batch size
per_device_train_batch_size = 4

# Number of update steps over which gradients are accumulated.
# Raising it lets a smaller per-device batch reach the same target batch size,
# which eases the memory constraint.
gradient_accumulation_steps = 1

# Maximum number of training steps
# (for reference: default steps for GPT 3.5 finetuning = 1500)
max_steps = 1000

# Starting learning rate (AdamW optimizer)
learning_rate = 1e-5

# Optimizer name
optim = "paged_adamw_8bit"

# Steps of linear warmup from 0 up to learning_rate
warmup_steps = 10

# Mixed-precision switch: use bf16 = True on an A100
# fp16 = True
bf16 = True

# Interval, in update steps, between log entries
logging_steps = 20

Calling the `fine_tune` function below to fine-tune the model with the parameters defined above.

In [None]:
# Quick look at the tokenized training split (feature names and row count) before training.
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2071
})

In [None]:
# Disabled: integer label encoding of an 'output' column.
# NOTE(review): the datasets shown above only expose 'input_ids' and
# 'attention_mask', so this mapping would fail if re-enabled as written;
# presumably left over from a classification variant of this notebook.
# label_to_int = {'negative':0, 'neutral':1, 'positive':2}
# def label_to_int_function(example):
#     example['output'] = label_to_int[example['output']]
#     return example

# train_dataset = train_dataset.map(label_to_int_function)
# eval_dataset = eval_dataset.map(label_to_int_function)


In [None]:
# Launch QLoRA fine-tuning with the hyperparameters configured in the cells above.
# Every argument is passed by keyword: with this many parameters a positional
# call silently mis-binds values if two of them are swapped. `bf16` now comes
# from the configuration cell instead of being hard-coded here.
fine_tune(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    lora_r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias,
    task_type=task_type,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=warmup_steps,
    max_steps=max_steps,
    learning_rate=learning_rate,
    logging_steps=logging_steps,
    output_dir=output_dir,
    optim=optim,
    bf16=bf16,
)


LoRA module names: ['q_proj', 'down_proj', 'gate_proj', 'up_proj', 'o_proj', 'k_proj', 'v_proj']
All Parameters: 3,540,389,888 || Trainable Parameters: 39,976,960 || Trainable Parameters %: 1.1291682911958425
Training...


Step,Training Loss,Validation Loss
20,1.7893,1.100594
40,1.0387,0.99457
60,1.0153,0.975876
80,0.938,0.962299
100,0.9703,0.959168
120,0.8842,0.954918
140,0.9246,0.94856
160,0.9295,0.945496
180,0.9845,0.943274
200,1.0062,0.935764




KeyboardInterrupt: 

### Merging Weights & Pushing to Hugging Face

After saving the fine-tuned weights, we can create our fine-tuned model by merging the fine-tuned weights and saving it to a new directory with its tokenizer. By performing this step, we can have a memory-efficient, fine-tuned model and tokenizer for inference. We will also push the fine-tuned model and its associated tokenizer to Hugging Face Hub for public usage.


In [None]:
# Imported here so this cell does not depend on which Auto* classes earlier
# cells happened to import.
from transformers import AutoModelForCausalLM

# Reload the base model and tokenizer.
# The adapter was trained as a causal language model (task_type 'CAUSAL_LM'
# with a language-modelling data collator), so the base model must be loaded
# with the causal-LM head. A sequence-classification head would be freshly
# initialised and would not match the trained adapter.
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map = "auto",
        offload_folder="offload",
        torch_dtype = torch.bfloat16,
        trust_remote_code=True
    )

# The tokenizer below reuses EOS as its padding token; mirror that on the model config.
model.config.pad_token_id = model.config.eos_token_id

tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token = True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Fetch the finetuned LORA Layers
adapter = "/content/drive/MyDrive/llm_trust/llama2_finetuned/checkpoint-260" #check trainer_state.json and use the best model
output_merged_dir = "/content/drive/MyDrive/llm_trust/llama2_finetuned/final_merged_checkpoint"

# Attach the LoRA adapter to the base model ...
model = PeftModel.from_pretrained(model, adapter)
# ... and fold the LoRA weights into the base weights. Without this step only
# the adapter weights are written by save_pretrained, not a merged model.
model = model.merge_and_unload()

# Save fine-tuned model at a new location
os.makedirs(output_merged_dir, exist_ok = True)
model.save_pretrained(output_merged_dir, safe_serialization = True)

# Save tokenizer for easy inference
tokenizer.save_pretrained(output_merged_dir)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



('/content/drive/MyDrive/llm_trust/llama2_finetuned/final_merged_checkpoint/tokenizer_config.json',
 '/content/drive/MyDrive/llm_trust/llama2_finetuned/final_merged_checkpoint/special_tokens_map.json',
 '/content/drive/MyDrive/llm_trust/llama2_finetuned/final_merged_checkpoint/tokenizer.model',
 '/content/drive/MyDrive/llm_trust/llama2_finetuned/final_merged_checkpoint/added_tokens.json',
 '/content/drive/MyDrive/llm_trust/llama2_finetuned/final_merged_checkpoint/tokenizer.json')

In [None]:
# Fine-tuned model name on Hugging Face Hub
# (repository id that the model and tokenizer are pushed to in the next cell)
new_model = "minjiyoo/llama-2-7b-hf-trust-issue-causal-val_90"

In [None]:
# Push fine-tuned model and tokenizer to Hugging Face Hub
# Both uploads target the same repository (`new_model`); the tokenizer push is
# the cell's last expression, so its CommitInfo is what gets displayed.
# NOTE(review): the recorded upload log shows `adapter_model.bin`, which
# suggests the pushed artifact was the LoRA adapter rather than merged
# weights -- confirm this is what should be published.
model.push_to_hub(new_model, use_auth_token = True)
tokenizer.push_to_hub(new_model, use_auth_token = True)

adapter_model.bin:   0%|          | 0.00/640M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/minjiyoo/llama-2-7b-hf-trust-issue-causal-val_90/commit/989c44f89b356948cfe5e6df31c85e9cdbacce13', commit_message='Upload tokenizer', commit_description='', oid='989c44f89b356948cfe5e6df31c85e9cdbacce13', pr_url=None, pr_revision=None, pr_num=None)