In [20]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, LlamaTokenizerFast, LlamaForCausalLM
from datasets import load_dataset

In [3]:
import pandas as pd
import json

# Read data from the JSON file into a list of dictionaries

with open("kbase_all.json", "r") as json_file:
    data = json.load(json_file)

In [4]:
#data['data']

In [5]:
df = pd.DataFrame(data['data'])
df.head()

Unnamed: 0,question,answer
0,What is the citation for KBase?,The citation for KBase is as follows:\n\nArkin...
1,Who funds the Genomic Sciences Program DOE Sys...,The Genomic Sciences Program DOE Systems Biolo...
2,What are the four Award Numbers associated wit...,The four Award Numbers associated with the fun...
3,When was the documentation last modified?,The documentation was last modified 2 years ago.
4,"What topics are covered in the ""Getting Starte...","The ""Getting Started"" section of the documenta..."


In [6]:
print(df['question'][0],df['answer'][0])

What is the citation for KBase? The citation for KBase is as follows:

Arkin AP, Cottingham RW, Henry CS, Harris NL, Stevens RL, Maslov S, et al. KBase: The United States Department of Energy Systems Biology Knowledgebase. Nature Biotechnology. 2018;36: 566. doi: 10.1038/nbt.4163


In [7]:
# Save the data to a JSON file
with open("data.json", "w") as json_file:
    json.dump(data['data'], json_file, indent=4)

print("JSON data has been saved to output.json")

JSON data has been saved to output.json


In [8]:
ds = load_dataset("json",data_files="kbase_all.json",field='data')

In [9]:
ds['train']

Dataset({
    features: ['answer', 'question'],
    num_rows: 3040
})

In [10]:
split_ds = ds['train'].train_test_split(test_size=0.2)

In [11]:
split_ds['test']

Dataset({
    features: ['answer', 'question'],
    num_rows: 608
})

In [12]:
def generate_prompt(data_point):
    return f"""
    <human>: {data_point['question']}
    <assistant>: {data_point['answer']}
    """.strip()

In [13]:
model_name = "llama-2-7b"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model_path = '/scratch/ac.pgupta/convLLM/llama2_model/llama-2-7b-chat-tok/' #"/scratch/ac.pgupta/finetune_llama/Nemo/llama2-7b-hf/"
tokenizer = LlamaTokenizerFast.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path,
                                             device_map='auto',
                                             torch_dtype=torch.float16,quantization_config=bnb_config,
                                             )

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map = "auto",
#     quantization_config=bnb_config,
#     trust_remote_code=True
# )
model.config.use_cache = False

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [15]:
#tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [16]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenizer(full_prompt,padding=True,truncation=True)
    return tokenized_full_prompt

In [17]:
train_data = split_ds['train'].map(generate_and_tokenize_prompt)
val_data = split_ds['test'].map(generate_and_tokenize_prompt)

Map:   0%|          | 0/2432 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/608 [00:00<?, ? examples/s]

In [18]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [19]:
from peft import LoraConfig, get_peft_model


lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
    "q_proj",
    "v_proj",
    ]
)
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 33554432 || all params: 3533967360 || trainable%: 0.9494833591219133


In [22]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 3e-5
max_grad_norm = 0.3
max_steps = 1000
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    #group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

In [23]:
from trl import SFTTrainer

# max_seq_length = 512

# trainer = SFTTrainer(
#     model=model,
#     train_dataset=train_data,
#     peft_config=config,
#     dataset_text_field="text",
#     max_seq_length=max_seq_length,
#     tokenizer=tokenizer,
#     args=training_arguments,
# )
trainer = transformers.Trainer(
    model=model,
    train_dataset = train_data,
    eval_dataset = val_data,
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer,mlm=False),
    args=training_arguments,
)

In [24]:
model.config.use_cache = False

In [25]:
import wandb
wandb.init(project="finetune llama",
           config={
               "batch_size": None,
               "learning_rate": 3e-5,
               "dataset": "kbase_docs",
           })

[34m[1mwandb[0m: Currently logged in as: [33mprachigupta[0m ([33mprachi_nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [26]:
%%wandb
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.5379
20,2.5899
30,2.5053
40,2.372
50,2.2527
60,2.0758
70,1.9969
80,1.95
90,1.8373
100,1.8541




TrainOutput(global_step=1000, training_loss=1.3898888969421386, metrics={'train_runtime': 1695.5359, 'train_samples_per_second': 2.359, 'train_steps_per_second': 0.59, 'total_flos': 2.038155363552461e+16, 'train_loss': 1.3898888969421386, 'epoch': 1.64})

Exception in thread FileStreamThread:
Traceback (most recent call last):
  File "/scratch/ac.pgupta/finetune_Falcon7b/env/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
  File "/scratch/ac.pgupta/finetune_Falcon7b/env/lib/python3.11/threading.py", line 975, in run
  File "/scratch/ac.pgupta/finetune_Falcon7b/env/lib/python3.11/site-packages/wandb/sdk/internal/file_stream.py", line 494, in _thread_except_body
  File "/scratch/ac.pgupta/finetune_Falcon7b/env/lib/python3.11/site-packages/wandb/sdk/internal/file_stream.py", line 488, in _thread_except_body
  File "/scratch/ac.pgupta/finetune_Falcon7b/env/lib/python3.11/site-packages/wandb/sdk/internal/file_stream.py", line 459, in _thread_body
  File "/scratch/ac.pgupta/finetune_Falcon7b/env/lib/python3.11/site-packages/wandb/sdk/internal/file_stream.py", line 628, in request_with_retry
  File "/scratch/ac.pgupta/finetune_Falcon7b/env/lib/python3.11/site-packages/requests/sessions.py", line 637, in post
  File "/scratch/ac.pg

In [74]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mprachigupta[0m ([33mprachi_nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.3102
20,2.1043
30,1.6588
40,1.5841
50,1.5213
60,1.3749
70,1.3033
80,1.2432
90,1.2564
100,1.2023




TrainOutput(global_step=500, training_loss=1.2379826793670654, metrics={'train_runtime': 1612.8498, 'train_samples_per_second': 1.24, 'train_steps_per_second': 0.31, 'total_flos': 2.000497359584256e+16, 'train_loss': 1.2379826793670654, 'epoch': 0.82})

In [33]:
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): RWForCausalLM(
          (transformer): RWModel(
            (word_embeddings): Embedding(65024, 4544)
            (h): ModuleList(
              (0-31): 32 x DecoderLayer(
                (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
                (self_attention): Attention(
                  (maybe_rotary): RotaryEmbedding()
                  (query_key_value): Linear4bit(
                    in_features=4544, out_features=4672, bias=False
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=4544, out_features=64, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=64, out_featu

In [35]:
model.save_pretrained('trained_model')

In [43]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [44]:
model.push_to_hub("pgupta2023/falcon-7b-ft-prac")

CommitInfo(commit_url='https://huggingface.co/pgupta2023/falcon-7b-ft-prac/commit/1f58dd084740474afd94c31d888da0da81b33ca6', commit_message='Upload model', commit_description='', oid='1f58dd084740474afd94c31d888da0da81b33ca6', pr_url=None, pr_revision=None, pr_num=None)