# Fine-tune Llama2 without LoRA for QA


reference code: https://deci.ai/blog/fine-tune-llama-2-with-lora-for-question-answering/

In [1]:
# import warnings
# warnings.filterwarnings('error', category=DeprecationWarning)

import warnings
warnings.filterwarnings('ignore')

In [2]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
import os, torch, logging
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

Error [NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported] resolving:

1. pip install -U datasets
2. pip install fsspec==2023.9.2
3. restart the kernel of this jupyter notebook

https://stackoverflow.com/questions/77433096/notimplementederror-loading-a-dataset-cached-in-a-localfilesystem-is-not-suppor

In [16]:
# Dataset
data_name = "mlabonne/guanaco-llama2-1k"
training_data = load_dataset(data_name, split="train")

# Model and tokenizer names
# base_model_name = "/home/fanhuan/cache/llama-2-13b-chat-hf"
# refined_model = "/home/fanhuan/cache/llama-2-13b-chat-hf-TF"
base_model_name = "/home/fanhuan/cache/llama-2-7b-hf"
refined_model = "/home/fanhuan/cache/llama-2-7b-chat-hf-TF"
cache_dir = "/data/fanhuan/cache/temp/7b"

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config
# If do not use the LoRA, it is better to turn the 'load_in_4bit' into False
#, since the source code of this part is not perfect right now, Apr-30-2024
quant_config = BitsAndBytesConfig(
    load_in_4bit=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
#     device_map={"": 0}
    device_map="auto"
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /home/fanhuan/cache/llama-2-7b-hf and are newly initialized: ['model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.ro

How to use the neptune features in Transformers:

https://docs.neptune.ai/integrations/transformers/#__tabbed_2_1

In [18]:
# from transformers.integrations import NeptuneCallback
# import neptune

# run = neptune.init_run(
#     project="fhuang181/LoRA", 
#     api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIxZmI2ZTA2OC00ZGIxLTQ2NDktYTU4YS1jOWUyNWIwYmU3YWUifQ==", # your credentials
# )

# neptune_callback = NeptuneCallback(run=run)

# # LoRA Config
# peft_parameters = LoraConfig(
#     lora_alpha=16,
#     lora_dropout=0.1,
#     r=8,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# Training Params
train_params = TrainingArguments(
    output_dir=cache_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    # very important setting for keep the disk space enough for further training
    save_total_limit = 1,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False, # NotImplementedError: You are calling `save_pretrained` on a 4-bit converted model. This is currently not supported
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    # use the report_to parameter to avoid error in neptune stuff
    # report_to="none"
    report_to="tensorboard"
    # report_to="neptune"
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    # model = check_point_model,
    train_dataset=training_data,
    # peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params,
    # callbacks=[neptune_callback]
)

fine_tuning.train()

Step,Training Loss
25,3.8549
50,4.416
75,4.0523
100,3.9901
125,3.9033
150,3.5664
175,3.7742
200,3.9281
225,3.95
250,4.046


TrainOutput(global_step=250, training_loss=3.9481307678222657, metrics={'train_runtime': 977.0586, 'train_samples_per_second': 1.023, 'train_steps_per_second': 0.256, 'total_flos': 1.7036321920745472e+16, 'train_loss': 3.9481307678222657, 'epoch': 1.0})

In [20]:
fine_tuning.model.save_pretrained(refined_model)
fine_tuning.tokenizer.save_pretrained(refined_model)

('/home/fanhuan/cache/llama-2-7b-chat-hf-TF/tokenizer_config.json',
 '/home/fanhuan/cache/llama-2-7b-chat-hf-TF/special_tokens_map.json',
 '/home/fanhuan/cache/llama-2-7b-chat-hf-TF/tokenizer.model',
 '/home/fanhuan/cache/llama-2-7b-chat-hf-TF/added_tokens.json',
 '/home/fanhuan/cache/llama-2-7b-chat-hf-TF/tokenizer.json')

In [21]:
# Fine-tuned model
prompt = "How do I use the OpenAI API?"
pipe = pipeline(task="text-generation", model=fine_tuning.model, tokenizer=llama_tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] How do I use the OpenAI API? [/INST] I can help you use the OpenAI, or I can help you use the OpenAI, using various methods. 

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including:

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including: 

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including: 

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including: 

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including: 

I can help you use the OpenAI, or I can help you use the OpenAI


In [22]:
# Original model
prompt = "How do I use the OpenAI API?"
pipe = pipeline(task="text-generation", model=base_model, tokenizer=llama_tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] How do I use the OpenAI API? [/INST] I can help you use the OpenAI, or I can help you use the OpenAI, using various methods. 

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including:

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including: 

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including: 

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including: 

I can help you use the OpenAI, or I can help you use the OpenAI, using various methods, including: 

I can help you use the OpenAI, or I can help you use the OpenAI


### Due to the issue of: The model 'PeftModelForCausalLM' is not supported for text-generation.

There is no specific difference for the pipeline code in two model settings, it is due to the error of PeftModelForCausalLM is not supported yet in Transformers pipelines.

According to (https://huggingface.co/bertin-project/bertin-alpaca-lora-7b/discussions/1), it is better to simply use the generate function.

## Controlled generation, via generate function

In [23]:
# Generate Text - before fine-tune

cuda_name = 'cuda:0'
model = base_model
tokenizer = llama_tokenizer

text = "How do I use the OpenAI API?"
inputs = tokenizer([text], return_tensors="pt").to(cuda_name)
outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, min_length=1, do_sample=True,
                                           pad_token_id=tokenizer.eos_token_id,
                                           eos_token_id=tokenizer.eos_token_id,
                                           return_dict_in_generate=True, output_scores=True)

input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
temp_output = tokenizer.decode(generated_tokens[0])

print(temp_output)

[/INST] I can ask you to provide what you need, to use, or what you would like to use, which will be helpful for you, if you have more help or use needed. 
I can also ask you about various tasks or questions you would not like, 
It's worth not what you want me, 

I can also help or use what you want, if you have any other area or if you need,, that'd be helpful, I might have, the use you'd be helpful has you like, the use you might want, I'd be helpful may be helpful. 
En què, 
En el área of what you want, I would like, I'd be helpful, you might have, I have neednir help or use, let me need, I might have, you'd be helpful, I have asked, I have asked, 
Sentence - Open.  - - Explainable, 

The phrase "Claroise what you did.  - Give me - I'd be helpful."  =  

We would like to help or use, I have asked, ask me, if I have any other area or inquiry you you need or, if - - or - - - I have asked, I have asked, you have asked, what you have asked, or what I have asked, I have asked, it's been 

In [24]:
# Generate Text - after fine-tune

cuda_name = 'cuda:0'
model = fine_tuning.model
tokenizer = llama_tokenizer

text = "How do I use the OpenAI API?"
inputs = tokenizer([text], return_tensors="pt").to(cuda_name)
outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, min_length=1, do_sample=True,
                                           pad_token_id=tokenizer.eos_token_id,
                                           eos_token_id=tokenizer.eos_token_id,
                                           return_dict_in_generate=True, output_scores=True)

input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
temp_output = tokenizer.decode(generated_tokens[0])

print(temp_output)

I don't have any people with the United States? I doesn' have many people I'd like to connect with. Please include people you know with your [S]. I'd also like to connect at [The moment, I'd like to connect." 
Do you have more of what you'd like to connect or, just consider the moment or consider? 
Do you have further inquiry or query, please, I'd or consider, I'd or connect, I would like, I'd 
Do you have any further inquiry or or inquiring, I'd, or if no you'd, you'd, you'd, or it doesn', I'd, I, or 
Do you have any other inquiry or inquiry, please, I would or connect? I'd or connect, please,, I, or what other of my [day] I'd or connect, please, I have, I would, and the [nicht quite] person I need it?" 
Note that I cannot connect as well as other. 
Note that I may have an average total length of minutes or minutes, as well as an average total length of hours or moment, as to what a person is or is being do, or to what a [nicht average]. 
"Sincroniza with your "nicht average", we will

# Fine-grained performance analysis

In [25]:
import pandas as pd

df = pd.read_csv('dataset/implicit-hate-speech.csv')
df

Unnamed: 0,post,human_answer,llama2-13b-chat,llama2-13b-chat-FT-LoRA,llama2-7b-chat,llama2-7b-FT-LoRA
0,and i will point it out here when u call whit...,Blacks blame whites,"\n\nSo for example, if you would answer Yes, p...",\n\nYes: Explain that this text is an example ...,\n\nAnswer: I do not think implicit hate speec...,"\nThank you, I greatly appreciate any and all ..."
1,antiwhites expose their hatred of white kids ...,Anti-whites hate white people,"\n\nOf course, I won't be offended by your ans...",\n\nNote that if you do not see any implicit h...,\n### No \nThis text was not containing any ha...,\n\n\n\nHere is the text: \n\n\nHey buddy what...
2,interior sec y salazar pushing for hispanic n...,Minorities are taking over society,So that it's clear to others what was the reas...,\n\nHere is the short text: and i will point i...,\n- It's clear this text is containing implici...,Thank you.\n\nI hope that isn't too long 👌🏼💖\n...


In [26]:
from tqdm import tqdm

cuda_name = 'cuda:0'

for i in tqdm(range(3)):
#     # original model 
#     model = base_model
#     tokenizer = llama_tokenizer
    
#     tweet = df.loc[0,'post']

#     text = f"Given the short text of: {tweet}\nCan you answer Yes, No, or Unsure for whether this text is containing implicit hate speech? And then explain why in few setences."
#     inputs = tokenizer([text], return_tensors="pt").to(cuda_name)
#     outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, min_length=1, do_sample=True,
#                                                pad_token_id=tokenizer.eos_token_id,
#                                                eos_token_id=tokenizer.eos_token_id,
#                                                return_dict_in_generate=True, output_scores=True)

#     input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
#     generated_tokens = outputs.sequences[:, input_length:]
#     temp_output = tokenizer.decode(generated_tokens[0])

#     df.loc[i, 'llama2-13b-chat'] = temp_output
    
    # fine-tuned model, using LoRA
    model = fine_tuning.model
    tokenizer = llama_tokenizer
    
    tweet = df.loc[0,'post']

    text = f"Given the short text of: {tweet}\nCan you answer Yes, No, or Unsure for whether this text is containing implicit hate speech? And then explain why in few setences."
    inputs = tokenizer([text], return_tensors="pt").to(cuda_name)
    outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, min_length=1, do_sample=True,
                                               pad_token_id=tokenizer.eos_token_id,
                                               eos_token_id=tokenizer.eos_token_id,
                                               return_dict_in_generate=True, output_scores=True)

    input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
    generated_tokens = outputs.sequences[:, input_length:]
    temp_output = tokenizer.decode(generated_tokens[0])

    df.loc[i, 'llama2-7b-FT'] = temp_output

100%|████████████████████████████████████████████████████████████████████| 3/3 [01:48<00:00, 36.22s/it]


In [27]:
df.to_csv('dataset/implicit-hate-speech.csv', index=False)