##### Fine-tune Llama2 without LoRA for QA


reference code: https://deci.ai/blog/fine-tune-llama-2-with-lora-for-question-answering/

In [4]:
# import warnings
# warnings.filterwarnings('error', category=DeprecationWarning)

import warnings
warnings.filterwarnings('ignore')

In [5]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7
import os, torch, logging
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

Error [NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported] resolving:

1. pip install -U datasets
2. pip install fsspec==2023.9.2
3. restart the kernel of this jupyter notebook

https://stackoverflow.com/questions/77433096/notimplementederror-loading-a-dataset-cached-in-a-localfilesystem-is-not-suppor

In [6]:
# Dataset
data_name = "mlabonne/guanaco-llama2-1k"
training_data = load_dataset(data_name, split="train")

# Model and tokenizer names
base_model_name = "/home/fanhuan/cache/llama-2-13b-chat-hf"
refined_model = "/home/fanhuan/cache/llama-2-13b-chat-hf-TF"
# base_model_name = "/home/fanhuan/cache/llama-2-7b-hf"
# refined_model = "/home/fanhuan/cache/llama-2-7b-chat-hf-TF"
cache_dir = "/data/fanhuan/cache/temp/13b"

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config
# If do not use the LoRA, it is better to turn the 'load_in_4bit' into False
#, since the source code of this part is not perfect right now, Apr-30-2024
quant_config = BitsAndBytesConfig(
    load_in_4bit=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
#     device_map={"": 0}
    device_map="auto"
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /home/fanhuan/cache/llama-2-13b-chat-hf and are newly initialized: ['model.layers.35.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.34.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.29.sel

How to use the neptune features in Transformers:

https://docs.neptune.ai/integrations/transformers/#__tabbed_2_1

In [9]:
# from transformers.integrations import NeptuneCallback
# import neptune

# run = neptune.init_run(
#     project="fhuang181/LoRA", 
#     api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIxZmI2ZTA2OC00ZGIxLTQ2NDktYTU4YS1jOWUyNWIwYmU3YWUifQ==", # your credentials
# )

# neptune_callback = NeptuneCallback(run=run)

# # LoRA Config
# peft_parameters = LoraConfig(
#     lora_alpha=16,
#     lora_dropout=0.1,
#     r=8,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# Training Params
train_params = TrainingArguments(
    output_dir=cache_dir,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    # very important setting for keep the disk space enough for further training
    save_total_limit = 1,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    # use the report_to parameter to avoid error in neptune stuff
    # report_to="none"
    report_to="tensorboard"
    # report_to="neptune"
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    # peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params,
    # callbacks=[neptune_callback]
)

fine_tuning.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,3.8425
50,4.3427
75,8.3495
100,8.3043
125,8.1116
150,7.6407
175,7.8652
200,7.3525
225,7.5921
250,7.2867


TrainOutput(global_step=250, training_loss=7.068787139892578, metrics={'train_runtime': 3573.8231, 'train_samples_per_second': 0.28, 'train_steps_per_second': 0.07, 'total_flos': 3.313755669049344e+16, 'train_loss': 7.068787139892578, 'epoch': 1.0})

In [10]:
fine_tuning.model.save_pretrained(refined_model)
fine_tuning.tokenizer.save_pretrained(refined_model)

('/home/fanhuan/cache/llama-2-13b-chat-hf-TF/tokenizer_config.json',
 '/home/fanhuan/cache/llama-2-13b-chat-hf-TF/special_tokens_map.json',
 '/home/fanhuan/cache/llama-2-13b-chat-hf-TF/tokenizer.model',
 '/home/fanhuan/cache/llama-2-13b-chat-hf-TF/added_tokens.json',
 '/home/fanhuan/cache/llama-2-13b-chat-hf-TF/tokenizer.json')

In [11]:
# Fine-tuned model
prompt = "How do I use the OpenAI API?"
pipe = pipeline(task="text-generation", model=fine_tuning.model, tokenizer=llama_tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] How do I use the OpenAI API? [/INST] is is the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


In [12]:
# Original model
prompt = "How do I use the OpenAI API?"
pipe = pipeline(task="text-generation", model=base_model, tokenizer=llama_tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] How do I use the OpenAI API? [/INST] is is the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


### Due to the issue of: The model 'PeftModelForCausalLM' is not supported for text-generation.

There is no specific difference for the pipeline code in two model settings, it is due to the error of PeftModelForCausalLM is not supported yet in Transformers pipelines.

According to (https://huggingface.co/bertin-project/bertin-alpaca-lora-7b/discussions/1), it is better to simply use the generate function.

## Controlled generation, via generate function

In [19]:
# Generate Text - before fine-tune

cuda_name = 'cuda:0'
model = base_model
tokenizer = llama_tokenizer

text = "How do I use the OpenAI API?"
inputs = tokenizer([text], return_tensors="pt").to(cuda_name)
outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, min_length=1, do_sample=True,
                                           pad_token_id=tokenizer.eos_token_id,
                                           eos_token_id=tokenizer.eos_token_id,
                                           return_dict_in_generate=True, output_scores=True)

input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
temp_output = tokenizer.decode(generated_tokens[0])

print(temp_output)

.. to of to
 in and the, the/ and3 the  an.,. the,.

 be the and, of.,. is
 and the a a
 and. " un the in from ( to'.". the,en
..ic for, de The  " I-:. can"  or as and of . no., " of is the of'./ a  as  
 and. ,, ", as [ the. and  as. to

' of is  the " within s. "3. for  and, the  " to .s is of, as the, the. to/ andes,ing, of
 and. is the. notic' or which not can./.
 and-'s and
 has  of withs I the.INST] The is is of is and is The as to " I which you the the is the Earth
 of are the is the to a, [ of and is not the the the " of the about the the' is is, The the not a the The, of the is ' the a " to of the the the the  and the is is the the The the is an in of and and the and the is the the and with for the the is the the the the the information the if some a the the the of  the the the as. the is The that about which
, the the. the a the is the " the the all the the a is also. that of the as and the the is of is and
 the the the the a the the' The the The to the which the  the, and of

In [20]:
# Generate Text - after fine-tune

cuda_name = 'cuda:0'
model = fine_tuning.model
tokenizer = llama_tokenizer

text = "How do I use the OpenAI API?"
inputs = tokenizer([text], return_tensors="pt").to(cuda_name)
outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, min_length=1, do_sample=True,
                                           pad_token_id=tokenizer.eos_token_id,
                                           eos_token_id=tokenizer.eos_token_id,
                                           return_dict_in_generate=True, output_scores=True)

input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
generated_tokens = outputs.sequences[:, input_length:]
temp_output = tokenizer.decode(generated_tokens[0])

print(temp_output)


 of)1 and that the as ",,, a of not, from , for not no-. the, and a ",  ",, and a
. is is "? be- in- foring. it no,ing an I,
 and.  the  is is the' the, de the in of a, :- has the which/.en to in,,.',, the is3,  the,/INST] What several is is is " and that the has and can and the all to the of and like  is the thes or an the [ the which the is as the is to the is is  does iss the in the to of  or not are. I is the the of of is  a the this, between with or to a  froms the the as The is- to Earth the about of the, this the the the would with is has is the as. is the The ising' the and which not its the the is with The about is it is'
, the to is and of the in of to which the the not to the a known the the not are the the is  the of is the in the, the the the is is the the about  that the is the,  and the and the, the he which a the the of this the or  or the and and is of the a in the for [ of is the the to
 the of  is ands which all and the is that the  the which The [ and " is a the th

# Fine-grained performance analysis

In [18]:
import pandas as pd

df = pd.read_csv('dataset/implicit-hate-speech.csv')
df

Unnamed: 0,post,human_answer,llama2-13b-chat,llama2-13b-chat-FT-LoRA,llama2-7b,llama2-7b-FT-LoRA,llama2-7b-FT,llama2-13b-chat-FT
0,and i will point it out here when u call whit...,Blacks blame whites,"\n\nSo for example, if you would answer Yes, p...",\n\nYes: Explain that this text is an example ...,\n\nAnswer: I do not think implicit hate speec...,"\nThank you, I greatly appreciate any and all ...",[/INST] You have any further or other question...,",. -. a-,,\n as of and to is the, and-. of. th..."
1,antiwhites expose their hatred of white kids ...,Anti-whites hate white people,"\n\nOf course, I won't be offended by your ans...",\n\nNote that if you do not see any implicit h...,\n### No \nThis text was not containing any ha...,\n\n\n\nHere is the text: \n\n\nHey buddy what...,Can you explain? - - - - - - - - - - - - - - ...,"and for the,en of ( as, is the"":, and en and o..."
2,interior sec y salazar pushing for hispanic n...,Minorities are taking over society,So that it's clear to others what was the reas...,\n\nHere is the short text: and i will point i...,\n- It's clear this text is containing implici...,Thank you.\n\nI hope that isn't too long 👌🏼💖\n...,It Is also deliberately deliberately misreali...,", that a which as,, a,,\n./3. that the and of,..."


In [16]:
from tqdm import tqdm

cuda_name = 'cuda:0'

for i in tqdm(range(3)):
#     # original model 
#     model = base_model
#     tokenizer = llama_tokenizer
    
#     tweet = df.loc[0,'post']

#     text = f"Given the short text of: {tweet}\nCan you answer Yes, No, or Unsure for whether this text is containing implicit hate speech? And then explain why in few setences."
#     inputs = tokenizer([text], return_tensors="pt").to(cuda_name)
#     outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, min_length=1, do_sample=True,
#                                                pad_token_id=tokenizer.eos_token_id,
#                                                eos_token_id=tokenizer.eos_token_id,
#                                                return_dict_in_generate=True, output_scores=True)

#     input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
#     generated_tokens = outputs.sequences[:, input_length:]
#     temp_output = tokenizer.decode(generated_tokens[0])

#     df.loc[i, 'llama2-13b-chat'] = temp_output
    
    # fine-tuned model, using LoRA
    model = fine_tuning.model
    tokenizer = llama_tokenizer
    
    tweet = df.loc[0,'post']

    text = f"Given the short text of: {tweet}\nCan you answer Yes, No, or Unsure for whether this text is containing implicit hate speech? And then explain why in few setences."
    inputs = tokenizer([text], return_tensors="pt").to(cuda_name)
    outputs = model.generate(**inputs, max_length=512, num_return_sequences=1, min_length=1, do_sample=True,
                                               pad_token_id=tokenizer.eos_token_id,
                                               eos_token_id=tokenizer.eos_token_id,
                                               return_dict_in_generate=True, output_scores=True)

    input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
    generated_tokens = outputs.sequences[:, input_length:]
    temp_output = tokenizer.decode(generated_tokens[0])

    df.loc[i, 'llama2-13b-chat-FT'] = temp_output

100%|███████████████████████████████████████████████████████████████████| 3/3 [11:08<00:00, 222.84s/it]


In [17]:
df.to_csv('dataset/implicit-hate-speech.csv', index=False)