In [None]:
!pip install -q torch datasets
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
import transformers

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

2023-10-12 22:11:42.510010: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-12 22:11:42.720690: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import getpass
import locale; locale.getpreferredencoding = lambda: "UTF-8"
import logging
import os
import torch
import yaml

In [3]:
def generate_text(model_var, tokenizer_var, text, generation_config):
    """Generate a completion for ``text`` and return it as a decoded string.

    Args:
        model_var: Model exposing ``generate(...)``. Its ``device`` attribute,
            when present, decides where the inputs are placed; otherwise the
            inputs go to ``'cuda'`` as before.
        tokenizer_var: Callable tokenizer returning a mapping with
            ``input_ids`` (and optionally ``attention_mask``); it must also
            provide ``decode``.
        text: Prompt to tokenize and feed to the model.
        generation_config: Passed through to ``model_var.generate``.

    Returns:
        The first sequence returned by ``generate``, decoded with special
        tokens skipped.
    """
    inputs = tokenizer_var(text, return_tensors='pt')

    # Follow the model's own device instead of assuming a CUDA device.
    device = getattr(model_var, "device", "cuda")

    generate_kwargs = {"generation_config": generation_config}

    # Forward the attention mask when the tokenizer supplies one, so the model
    # does not have to guess which positions are padding.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask.to(device)

    model_output = model_var.generate(
        inputs["input_ids"].to(device),
        **generate_kwargs,
    )[0]

    return tokenizer_var.decode(model_output, skip_special_tokens=True)

In [4]:
# Sampling settings passed to generate_text: at most 100 new tokens, sampled
# at temperature 0.7.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    max_new_tokens=100,
)

In [None]:
# Ask for the Hugging Face access token only when the environment does not
# already hold one, so re-running the notebook does not prompt again.
if not os.environ.get("HUGGING_FACE_HUB_TOKEN"):
    os.environ["HUGGING_FACE_HUB_TOKEN"] = getpass.getpass("Token:")

# Explicit check instead of `assert`, which is skipped when Python runs with -O.
if not os.environ["HUGGING_FACE_HUB_TOKEN"]:
    raise ValueError("HUGGING_FACE_HUB_TOKEN is empty; a Hugging Face access token is required.")

In [None]:
# Hub identifier of the checkpoint used in this notebook.
model_name = 'meta-llama/Llama-2-7b-chat-hf'

# Configuration object for that checkpoint.
model_config = transformers.AutoConfig.from_pretrained(model_name)

# Matching tokenizer; the EOS token is reused as the padding token and padding
# is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Round-trip a sample sentence through the tokenizer: show the token ids and
# the text obtained by decoding them again (special tokens skipped).
sentence = "Where am I?"
sentence_encoded = tokenizer(sentence, return_tensors='pt')
sentence_decoded = tokenizer.decode(sentence_encoded["input_ids"][0], skip_special_tokens=True)

print(
    'ENCODED SENTENCE:',
    sentence_encoded["input_ids"][0],
    '\nDECODED SENTENCE:',
    sentence_decoded,
    sep='\n',
)

In [None]:

# --- QLoRA parameters ---------------------------------------------------------
lora_r = 64          # LoRA attention dimension
lora_alpha = 16      # alpha parameter for LoRA scaling
lora_dropout = 0.1   # dropout probability for LoRA layers

# --- bitsandbytes parameters --------------------------------------------------
use_4bit = True                      # activate 4-bit precision base model loading
bnb_4bit_quant_type = "nf4"          # quantization type (fp4 or nf4)
bnb_4bit_compute_dtype = "float16"   # compute dtype for 4-bit base models
use_nested_quant = False             # nested (double) quantization for 4-bit base models

In [None]:
# Build the bitsandbytes quantization config from the parameters defined above.
# The dtype is stored as a string, so resolve it to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16. The capability query is only made when
# a CUDA device is actually present, so the cell does not fail on a machine
# without one.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [None]:
# Load the chat checkpoint with the bitsandbytes quantization config built in
# the previous cell.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf", quantization_config=bnb_config
)

In [None]:
%%time

# Ask the loaded model the same question under three personas.
# Separator printed after each answer (99 dashes, as before).
dash_line = '-' * 99

for i in ["shakespeare", "plato", "moses"]:
    print(f"I am {i}")
    print("Where are we going?")

    text = f"""
    Ok you are {i}, can you answer the following question in 2 sentences

    Where are we going?
    """

    # Use `model` (loaded in the cell above); `base_model` is not defined until
    # much later in the notebook, so referencing it here raised a NameError.
    print(generate_text(model, tokenizer, text, generation_config))

    print(dash_line)

### Finetuning

In [None]:
# Torch dtype named by bnb_4bit_compute_dtype.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Train split of the "pqa_labeled" configuration of pubmed_qa.
# Alternative kept for reference:
#   huggingface_dataset_name = "mlabonne/guanaco-llama2-1k"
#   dataset = load_dataset(huggingface_dataset_name, split="train")
huggingface_dataset_name = "pubmed_qa"
dataset = load_dataset(
    huggingface_dataset_name,
    "pqa_labeled",
    split="train",
)

In [None]:
# Show the first ten rows as a pandas DataFrame.
(
    dataset
    .to_pandas()
    .head(10)
)

In [None]:
def process_text(row):
    """Add a ``prompt`` entry to ``row`` and return that same row.

    The prompt wraps the row's ``question`` in ``[INST] ... [/INST]`` and
    follows it with the ``final_decision`` (short answer) and the
    ``long_answer``.
    """
    question = row['question']
    decision = row['final_decision']
    explanation = row['long_answer']

    row['prompt'] = (
        f"[INST] {question} [/INST] "
        f"Short answer; {decision}. "
        f"Long Answer; {explanation}"
    )
    return row

In [None]:
# NOTE(review): scratch cell — prints the literal 2 and touches no notebook
# state; candidate for removal.
print(2)

In [None]:
# Run process_text over the dataset so each row gains the 'prompt' field.
dataset = dataset.map(process_text)

In [None]:
# Print the prompt at position 90 of the first 100 rows.
print(
    dataset.to_pandas()
    .head(100)["prompt"]
    .values[90]
)

In [None]:
# Hold out 10% of the rows for evaluation. A fixed seed keeps the split
# identical across kernel restarts so runs are comparable.
train_test_datasets = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Display the result of the split (the 'train' and 'test' entries used below).
train_test_datasets

In [None]:
# LoRA hyper-parameters (same values as in the QLoRA parameter cell earlier).
lora_r = 64          # LoRA attention dimension
lora_alpha = 16      # alpha parameter for LoRA scaling
lora_dropout = 0.1   # dropout probability for LoRA layers

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of ``model``'s parameters are trainable.

    Despite the name, nothing is printed: the summary is returned as a string
    for the caller to print.

    Args:
        model: Object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` has ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage (two decimals). A model
        with no parameters reports ``0.00%``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the ratio: a model without parameters would divide by zero.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# The helper returns its summary as a string, so print it here.
print(print_number_of_trainable_model_parameters(model))

In [None]:
# LoRA adapter configuration for causal language modelling, built from the
# lora_* hyper-parameters; bias terms are left out of the adapter.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    task_type="CAUSAL_LM",
    bias="none",
)

In [None]:
# Trainer settings; checkpoints are written under "weights_pubmed".
# NOTE(review): both num_train_epochs and max_steps are set — per the
# TrainingArguments docs a positive max_steps takes precedence; confirm which
# limit is intended.
training_arguments = TrainingArguments(
    output_dir="weights_pubmed",
    # optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    fp16=True,
    # duration
    num_train_epochs=5,
    max_steps=1000,
    # logging / checkpointing cadence (in steps)
    logging_steps=1,
    save_steps=100,
)

In [None]:
# Supervised fine-tuning trainer: the loaded model plus the LoRA config,
# trained on the 'prompt' field with sequences capped at 500 tokens.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=train_test_datasets['train'],
    eval_dataset=train_test_datasets['test'],
    dataset_text_field="prompt",
    max_seq_length=500,
)

In [None]:
# Drop the notebook-level names `base_model` and `model` if they exist.
# A plain `del base_model` raised NameError when the notebook was run top to
# bottom, because `base_model` is only assigned in a later cell.
# NOTE(review): `trainer` was built with model=model, so removing the name
# alone presumably does not release the model's memory — confirm intent.
for _name in ("base_model", "model"):
    globals().pop(_name, None)

In [None]:
# Run the training loop of the SFTTrainer configured above.
trainer.train()

In [None]:
# Save the trained model to the "pubmed_trial_1k" directory; the same path is
# loaded again by PeftModel.from_pretrained further down.
trainer.model.save_pretrained("pubmed_trial_1k")

In [5]:
# Load the chat checkpoint in FP16 onto device 0. No LoRA merge happens in this
# cell; the adapter is attached to this instance in a later cell.
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
    cache_dir="testing_new_dir",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Load a second FP16 instance of the same checkpoint onto device 0.
# NOTE(review): the output recorded for this cell is a CUDA out-of-memory
# error; `base_model` from the previous cell is already on the same device.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
    cache_dir="testing_new_dir",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB (GPU 0; 21.99 GiB total capacity; 21.24 GiB already allocated; 13.69 MiB free; 21.24 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Reload the tokenizer for the generation cells below.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf', trust_remote_code=True)

# Pad with the EOS token on the right, matching the tokenizer setup used before
# training. The earlier add_special_tokens({'pad_token': '[PAD]'}) call is
# dropped: the pad token it set was overwritten on the very next line, while
# the extra '[PAD]' entry it registered stayed in the tokenizer's vocabulary.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Sampling settings for the comparison below: at most 100 new tokens, sampled
# at temperature 0.7.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.7,
    max_new_tokens=100,
)

In [None]:
# Answer from base_model for one question, printed under a labelled separator.
prompt = "Is HIV/STD control in Jamaica making a difference?"
dash_line = '-' * 99

print("base model", dash_line, sep="\n")
print(generate_text(base_model, tokenizer, f"<s>[INST] {prompt} [/INST]", generation_config))

In [None]:
# Attach what was saved under "pubmed_trial_1k" to base_model, then merge and
# unload it, keeping the result as tuned_model.
# NOTE(review): the merge may modify base_model's weights in place — confirm
# before treating base_model as an untouched baseline after this cell.
peft_model = PeftModel.from_pretrained(base_model, "pubmed_trial_1k")
tuned_model = peft_model.merge_and_unload()

In [None]:
print("tuned model")
dash_line = '-' * 99
print(dash_line)
# Generate with `tuned_model` (the merged result of the previous cell). The
# cell previously passed `model`, which is not the fine-tuned model.
print(generate_text(tuned_model, tokenizer, f"<s>[INST] {prompt} [/INST]", generation_config))

In [None]:
# Display the object bound to `model`.
# NOTE(review): the merged fine-tuned model is bound to `tuned_model`, and the
# recorded load of `model` ended in an out-of-memory error — confirm which
# object is meant to be inspected here.
model

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

# Text-generation pipeline over the merged fine-tuned model. It previously
# received `model`, which is not the object produced by merge_and_unload().
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=tuned_model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
# Repeat of the pipeline query, run against the merged fine-tuned model
# (`tuned_model`) rather than `model`.
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=tuned_model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])