#### Load Dataset

Dataset is created in the 'create_dataset.ipynb' notebook

In [1]:
import pandas as pd

In [2]:
train_path = r'training_samples.xlsx'
test_path = r'testing_samples.xlsx'

train_df = pd.read_excel(train_path)
test_df = pd.read_excel(test_path)

# Save the dataframes to .jsonl files
train_df.to_json('train.jsonl', orient='records', lines=True)
test_df.to_json('test.jsonl', orient='records', lines=True)

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,prompt,response
0,0,You are the CEO of a tech startup. Your compan...,"Given the situation, it's crucial to consider ..."
1,1,You are a city mayor and your city has been ex...,Addressing a rising crime rate requires a bala...
2,2,You are a school principal and your school's s...,Addressing low standardized test scores requir...
3,3,You are the manager of a manufacturing plant a...,Addressing production delays requires a carefu...
4,4,You are the director of a hospital and you're ...,Addressing the issue of an overwhelmed hospita...


### Install necessary libraries

In [4]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [5]:
import os
import torch
import json
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

2024-04-28 14:09:39.205022: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 14:09:39.205045: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 14:09:39.205050: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-28 14:09:39.209025: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _warn(("h5py is running agains

#### Define Hyperparameters

In [17]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct" # use this if you have access to the official LLaMA 2 model "meta-llama/Llama-2-7b-chat-hf", though keep in mind you'll need to pass a Hugging Face key argument
dataset_name = "train.jsonl"
finetuned_model_name = "llama-3-8b-lora"
raw_model_name = "llama_3_8b_raw"

# LoRA configuration
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# 4 bit quantization parameters
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

num_train_epochs = 5
fp16 = False
bf16 = False

per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True

max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"

max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 5
max_seq_length = None
packing = False
output_dir = "./results"   # save tensorboard logs
# device_map = {"": 0}
device_map = "auto"


#### Load Datasets and Train

In [7]:
# Load datasets
train_dataset = load_dataset('json', data_files='train.jsonl', split="train")
valid_dataset = load_dataset('json', data_files='test.jsonl', split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# 'system_message' is created from the 'create_dataset.ipynb' notebook
system_message = "Given the complex scenario and multiple factors, provide a well-reasoned recommendation or action plan."

In [9]:
# 1: Load Model from huggingface and save it in the local folder

### UNCOMMENT FOR FIRST RUN ###
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
#     force_download=True, 
#     resume_download=False,
#     use_auth_token=True,
# )


# # Save in the first run, then comment out the saving line.
# model.save_pretrained(raw_model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(raw_model_name, device_map=device_map)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Test the model before trining
results = []
for index, row in test_df.iterrows():
    instruction_prompt =  row['prompt']

    prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{instruction_prompt}[/INST]"
    print(f"=== {index} ===")
    print(f'{prompt}\n')
    gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=1000)
    result = gen(prompt)
    generated_text = result[0]['generated_text'].replace(prompt, '')


    print(generated_text)
    
    result_dict = {
        'instruction': prompt,
        'generated_response': generated_text,
        'GPT4_response': row['response']
    }

    # Append the result dictionary to the list of results
    results.append(result_dict)
    
# Save the results as a JSON file
with open('LLAMA3_raw_output.json', 'w') as json_file:
    json.dump(results, json_file)

## **Training**

In [12]:
# Preprocess datasets
train_dataset_mapped = train_dataset.map(lambda examples: {'text': [f'[INST] <<SYS>>\n{system_message.strip()}\n<</SYS>>\n\n' + prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)
valid_dataset_mapped = valid_dataset.map(lambda examples: {'text': [f'[INST] <<SYS>>\n{system_message.strip()}\n<</SYS>>\n\n' + prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [13]:
'''
4-bit quantization with 16-bit floating point precision for computations (training)

Configuration Used
- Using 4-bit Quantization (use_4bit = True) : This setting activates the 4-bit quantization, meaning the model's weights are stored using only 4 bits. 

- Compute Precision (bnb_4bit_compute_dtype = "float16"): Even though the model weights are quantized to 4 bits, computations (inference/training) 
                                                          are carried out using 16-bit floating point precision. This is a compromise between the 
                                                          computational efficiency of lower bit widths and the numerical stability and precision offered by higher bit widths.

- Quantization Type (bnb_4bit_quant_type = "nf4") : Normal Float 4

- Nested Quantization (use_nested_quant = False) : This option, when set to 'True', allow for a more complex quantization scheme that might involve multiple stages or layers of quantization. 
                                                    Setting this to 'False' simplifies the process, focusing on straightforward quantization without additional layers.

https://huggingface.co/docs/transformers/main/en/quantization
'''
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [14]:
# 2: Model Quantization
model = AutoModelForCausalLM.from_pretrained(
    raw_model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [18]:
'''
### LoRA (Low-Rank Adaptation) Parameters

1. lora_r = 64: rank of the low-rank matrices to 64.
            The rank (r) determines the size of these matrices. A higher rank means a more expressive model (capable of capturing more complex modifications) 
            but also increases computational requirements.

2. lora_alpha = 16: This parameter controls the learning rate multiplier for the LoRA parameters. 
                    It adjusts how much the LoRA parameters should learn relative to the base model parameters during adaptation.

3. lora_dropout = 0.1:  Dropout is a regularization technique used to prevent overfitting

'''
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [19]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="all",
    evaluation_strategy="steps",
    eval_steps=5  # Evaluate every 20 steps
)
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_mapped,
    eval_dataset=valid_dataset_mapped,  # Pass validation dataset here
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# 3: Train the model 
trainer.train()
trainer.model.save_pretrained(finetuned_model_name)



Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
5,1.7613,1.626225
10,1.4513,1.347709
15,1.1814,1.058061
20,0.8873,0.811848
25,0.6915,0.718004


In [20]:
# 4: Test the model
instruction_prompt = test_df['prompt'][0]
logging.set_verbosity(logging.CRITICAL)

# system_message followed by instruction
prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{instruction_prompt}. [/INST]"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])



[INST] <<SYS>>
Given the complex scenario and multiple factors, provide a well-reasoned recommendation or action plan.
<</SYS>>

You are the mayor of a city that is struggling with traffic congestion. You have two options: invest in improving public transportation or implement congestion charging. What is your decision?. [/INST] <<SYS>>

As the mayor, my primary goal is to reduce traffic congestion while also ensuring the city remains accessible and affordable for its citizens.

After careful consideration, I would propose a combination of both measures. First, I would invest in improving public transportation by increasing the frequency and coverage of buses and trains. This would encourage more people to use public transport, reducing the number of private vehicles on the road.

At the same time, I would introduce a congestion charging system, but with a twist. Instead of charging a flat fee per vehicle, I would implement a dynamic pricing system that adjusts the charge based on the 

In [23]:
# Test the model after trining
results = []
for index, row in test_df.iterrows():
    instruction_prompt =  row['prompt']

    prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{instruction_prompt}[/INST]"
    print(f"=== {index} ===")
    print(f'{prompt}\n')
    gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=1000)
    result = gen(prompt)
    generated_text = result[0]['generated_text'].replace(prompt, '')


    print(generated_text)
    
    result_dict = {
        'instruction': prompt,
        'generated_response': generated_text,
        'GPT4_response': row['response']
    }

    # Append the result dictionary to the list of results
    results.append(result_dict)
    
# Save the results as a JSON file
with open('LLAMA3_finetuned_output.json', 'w') as json_file:
    json.dump(results, json_file)

=== 0 ===
[INST] <<SYS>>
Given the complex scenario and multiple factors, provide a well-reasoned recommendation or action plan.
<</SYS>>

You are the mayor of a city that is struggling with traffic congestion. You have two options: invest in improving public transportation or implement congestion charging. What is your decision?[/INST]

 Addressing traffic congestion requires a comprehensive approach that considers the root causes of the problem. Improving public transportation can increase the number of people using public transport, reducing the number of private vehicles on the road. However, it may not be effective in areas where public transport is not well-connected or reliable. Congestion charging, on the other hand, can be an effective way to reduce traffic congestion by making it more expensive for people to drive in congested areas. However, it may be unpopular with drivers and could disproportionately affect low-income households.

As the mayor, my decision would be based o

### Merge the model and store

In [24]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    raw_model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, finetuned_model_name)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
model_path = "llama-3-8b-finetuned" 

# Save the merged model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('llama-3-8b-finetuned/tokenizer_config.json',
 'llama-3-8b-finetuned/special_tokens_map.json',
 'llama-3-8b-finetuned/tokenizer.json')

#### Load a fine-tuned model and run inference

In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "llama-3-8b-finetuned"  # change to the path where your model is saved

model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
from transformers import pipeline

prompt = "What is 2 + 2?"  # change to your desired prompt
gen = pipeline('text-generation', model=model, tokenizer=tokenizer)
result = gen(prompt)
print(result[0]['generated_text'])

What is 2 + 2?  # This is a comment, it won't be executed


In [29]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Sun Apr 28 15:03:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A4500               On  | 00000000:01:00.0 Off |                  Off |
| 30%   29C    P8              15W / 200W |   6364MiB /