In [23]:
import os

import pandas as pd
import torch
from datasets import load_dataset
from torch import cuda, bfloat16
import transformers
from peft import (
    LoraConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_int8_training,
    prepare_model_for_kbit_training,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from torch.optim import AdamW

In [2]:
## Directory holding your training files, e.g. the Alpaca or Dolly 2.0 datasets
## downloaded from GitHub. Point this at your own copy.
datasets = (
    "/home/ubuntu/projects/ravi/Llama2/datasets/"
)

In [21]:
# Build the "train" split from the files found under the `datasets` directory.
# NOTE(review): the trainer cell reads a "text" column (dataset_text_field="text");
# confirm the data files actually provide one.
train_dataset = load_dataset(path=datasets, split="train")

Downloading and preparing dataset csv/datasets to /home/ubuntu/.cache/huggingface/datasets/csv/datasets-a155d0aece92945d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/csv/datasets-a155d0aece92945d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


In [1]:
## To get the access of these models you need to request from hugging face it takes just few minutes to get it approved.: Try your hugging face emailid only 
## to request on meta site:
%%time
model_id = 'meta-llama/Llama-2-13b-chat-hf'
hf_auth = '<your hugging face API token>'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, need auth token for these

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)
model.eval()
print(f"Model loaded on {device}")

In [None]:
%%time
## Initializing the tokenizer for `llama-2-13b-chat-hf` model:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
tokenizer.pad_token=tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

In [None]:
%%time
## impliment peft:
model = prepare_model_for_int8_training(model)
peft_config = LoraConfig(r=8, lora_alpha =32, lora_dropout=0.05, bias="none", task_type ="CAUSAL_LM")
model = get_peft_model(model, peft_config)

training_args = TrainingArguments(
    output_dir = "meta-llama-2-13b-chat-hf",
    per_device_train_batch_size = 4,
    optim = 'adamw_torch',
    logging_steps = 100,
    learning_rate=2e-4,
    fp16 =True,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    num_train_epochs=100,
    save_strategy ="epoch",
)

trainer = SFTTrainer(
    model=model,
    train_dataset = train_dataset,
    dataset_text_field="text",
    tokenizer= tokenizer,
    max_seq_length=1024,
    args = training_args,
    packing =True,
    peft_config = peft_config
)

for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)
        
trainer.train()

In [None]:
## Attach the fine-tuned LoRA weights saved by the trainer.
# The directory name must match the trainer's output_dir ("meta-llama-2-13b-chat-hf");
# adjust the checkpoint number to one that exists on disk.
adapter_dir = './meta-llama-2-13b-chat-hf/checkpoint-1300'

# Kept so the adapter hyper-parameters can be inspected; it is not used to build the model.
lora_config = LoraConfig.from_pretrained(adapter_dir)

# get_peft_model(model, lora_config) would only create fresh, untrained LoRA layers.
# The trained weights have to be loaded from the checkpoint itself.
if isinstance(model, PeftModel):
    # `model` is already adapter-wrapped (e.g. training ran in this kernel):
    # load the checkpoint as a named adapter and make it the active one.
    model.load_adapter(adapter_dir, adapter_name="checkpoint")
    model.set_adapter("checkpoint")
else:
    # Fresh kernel: `model` is the plain quantized base model.
    model = PeftModel.from_pretrained(model, adapter_dir)

# Inference mode: switches off dropout, including the LoRA dropout layers.
model.eval()

In [None]:
%%time 
text =" <Ask your questions here:>"
device = "cuda:0"

prompt_text = " <Write your prompt here'>: " + text

inputs = tokenizer(prompt_text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_length=300) 
print(tokenizer.decode(outputs[0], skip_special_tokens=True))