In [None]:
!pip install "transformers==4.31.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2"

# Dataset Creation

In [6]:
import pandas as pd    
# Read the three raw splits; each file is JSON Lines (one JSON object per line).
train, val, test = (
    pd.read_json(split_path, lines=True)
    for split_path in (
        "data/a/train_en.jsonl",
        "data/a/val_val_en.jsonl",
        "data/a/test_test_en.jsonl",
    )
)

In [7]:
from datasets import Dataset
# Convert each pandas split into a Hugging Face `Dataset`, keeping the train/val/test order.
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(pd.DataFrame(data=split_frame))
    for split_frame in (train, val, test)
)

In [8]:
import json
def create_text_row(instruction, output):
    """Render one example in the ``<s>[INST] ... [/INST]`` prompt layout.

    Args:
        instruction: Value placed between the ``[INST]`` and ``[/INST]`` tags.
        output: Value placed after the tags, before the closing ``</s>``.

    Returns:
        The formatted string ``<s>[INST] {instruction} [/INST] \\n {output} </s>``.
    """
    # NOTE(review): "\\n" produces a literal backslash followed by "n", not a
    # newline character — confirm this is the intended separator.
    template = "<s>[INST] {} [/INST] \\n {} </s>"
    return template.format(instruction, output)

def process_jsonl_file(dataset, output_file_path):
    """Format every row of ``dataset`` and write the result as JSON Lines.

    Args:
        dataset: Iterable of mappings, each providing a "sentence" and a "ner" entry.
        output_file_path: Destination file; it is opened in "w" mode, so existing
            content is replaced.

    Each written line is a JSON object with the prompt-formatted "text" plus the
    original values under "instruction" and "output".
    """
    def to_json_line(row):
        # One source row -> one serialized JSON line (newline-terminated).
        sentence, labels = row["sentence"], row["ner"]
        record = {
            "text": create_text_row(sentence, labels),
            "instruction": sentence,
            "output": labels,
        }
        return json.dumps(record) + "\n"

    with open(output_file_path, "w") as sink:
        sink.writelines(to_json_line(row) for row in dataset)

# Write one formatted JSONL file per split; adjust the destination paths here if needed.
for split_dataset, formatted_path in (
    (dataset_train, "data/a/train.jsonl"),
    (dataset_val, "data/a/val.jsonl"),
    (dataset_test, "data/a/test.jsonl"),
):
    process_jsonl_file(split_dataset, formatted_path)

In [None]:
from datasets import load_dataset

# Reload the formatted files. Both calls request split="train"
# (presumably because a single `data_files` path is exposed under that split name — see the `datasets` docs).
train_dataset = load_dataset(
    "json",
    data_files="data/a/train.jsonl",
    split="train",
)
eval_dataset = load_dataset(
    "json",
    data_files="data/a/val.jsonl",
    split="train",
)

# Load the base model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Checkpoint to fine-tune: a Hugging Face model id (gated or non-gated).
# NOTE(review): the value below looks like a relative path into a local cache snapshot — confirm it resolves from the notebook's working directory.
model_id = "models--NousResearch--Llama-2-7b-hf/snapshots/dacdfcde31297e34b19ee0e7532f29586d2c17bc"

# bitsandbytes 4-bit setup: NF4 quantization type, double quantization, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model with automatic device placement and use_cache turned off.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=False,
)

# Load the matching tokenizer; EOS doubles as the padding token and padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# PEFT config

In [11]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
# LoRA config based on the QLoRA paper.
# NOTE(review): rank 256 / alpha 512 and the inclusion of lm_head are set here — confirm they are deliberate choices.
peft_config = LoraConfig(
    r=256,
    lora_alpha=512,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "o_proj",
        "lm_head",
        "gate_proj",
        "down_proj",
        "up_proj",
    ],
)

# Prepare the quantized model for k-bit training, then attach the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

# Report how many parameters the adapters make trainable.
model.print_trainable_parameters()

trainable params: 648,871,936 || all params: 4,149,284,864 || trainable%: 15.63816313576681


# Training Parameters

In [12]:
from transformers import TrainingArguments

args = TrainingArguments(
    # Checkpoint location and retention: save once per epoch, keep at most two.
    output_dir="llama-7-int4-ner-a",
    save_strategy="epoch",
    save_total_limit=2,
    # Run length and batch shape: 8 examples per device, accumulated over 4 steps.
    num_train_epochs=2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    # NOTE(review): confirm warmup_ratio has any effect under the plain "constant"
    # schedule; "constant_with_warmup" is the variant that applies warmup.
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # Numeric precision.
    bf16=True,
    tf32=True,
    # Evaluate and log every 10 steps; report to TensorBoard.
    evaluation_strategy="steps",
    eval_steps=10,
    logging_steps=10,
    report_to="tensorboard",
    disable_tqdm=True,  # disable tqdm since with packing the displayed values are incorrect
)

# Create the trainer

In [13]:
%%time
from trl import SFTTrainer

# Maximum sequence length used by the model and when packing the dataset.
max_seq_length = 2048

# NOTE(review): `peft_config` is passed here although `model` was already wrapped
# with `get_peft_model` in the PEFT cell — confirm the trainer does not wrap it again.
trainer = SFTTrainer(
    # What to train and how.
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=args,
    # Data: the "text" column of each dataset, packed up to max_seq_length.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    packing=True,
    max_seq_length=max_seq_length,
)

CPU times: user 73.9 ms, sys: 4.33 ms, total: 78.3 ms
Wall time: 77.4 ms


# Training

In [None]:
# Run fine-tuning with the trainer built in the previous cell.
trainer.train() 
# Save the model through the trainer; no explicit path is passed here.
trainer.save_model()