<a href="https://colab.research.google.com/github/mitchelljphayes/COMP9444-group-assignment/blob/main/notebooks/mitch_fine_tune_llama2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade accelerate evaluate transformers datasets tiktoken torch rouge_score trl peft bitsandbytes



In [2]:
!nvidia-smi
!nvcc --version

Sat Nov 11 00:05:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    25W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import math
import numpy as np
import torch
import evaluate
import transformers
from io import open
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoModelForCausalLM, Trainer, AutoTokenizer, TrainingArguments, logging, BitsAndBytesConfig
from transformers.integrations import TensorBoardCallback
from datasets import load_dataset

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
if torch.backends.mps.is_available():
    device = 'mps'



In [4]:
print(device)
print(transformers.__version__)


cuda
4.35.0


In [5]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [6]:
project_root = '..'
dataset_name = 'vblagoje/lfqa_support_docs'
data_dir = os.path.join(project_root, 'data/lfqa/')
model_dir = os.path.join(project_root, 'models/')
# model_checkpoint = 'gpt2'
# model_checkpoint = 'distilgpt2'
# model_checkpoint = 'mistralai/Mistral-7B-v0.1'
model_checkpoint = 'NousResearch/Llama-2-7b-chat-hf'
# model_checkpoint = 'Salesforce/xgen-7b-8k-base'

In [8]:
compute_dtype = getattr(torch, 'float16')

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)



In [9]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint, cache_dir=model_dir, quantization_config=quant_config, device_map={"":0})
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir=model_dir, trust_remote_code=True)
tokenizer.padding_side = 'right'

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [10]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM"
)

In [11]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [12]:
special_tokens_dict = {'eos_token': '<|endoftext|>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

Embedding(32002, 4096)

In [13]:
# if not os.path.exists(data_dir):
#     os.mkdir(data_dir)
data = load_dataset(dataset_name, cache_dir=data_dir)

Repo card metadata block was not found. Setting CardData to empty.


In [14]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'input', 'output', 'meta'],
        num_rows: 226147
    })
    validation: Dataset({
        features: ['id', 'input', 'output', 'meta'],
        num_rows: 3020
    })
})

In [15]:
train_data = data['train']
val_data = data['validation']
# test_data = data['test']

In [16]:
train_data

Dataset({
    features: ['id', 'input', 'output', 'meta'],
    num_rows: 226147
})

In [17]:
train_data = train_data.shard(24, 0)
train_data

Dataset({
    features: ['id', 'input', 'output', 'meta'],
    num_rows: 9423
})

In [18]:
def transform(example):
    qa = f"Question: {example['input']} \n Answer: {example['output'][0]['answer']}"
    max_length = 1024
    id_tensor = tokenizer.encode(qa, truncation=True, padding="max_length", max_length=max_length, return_tensors='np').squeeze(0)
    # id_tensor = tokenizer.encode(qa, return_tensors='np')
    return {"input_ids": id_tensor, "text": qa} ##"attention_mask": torch.ones(max_length)}

In [19]:
train_data_tokz = train_data.map(transform, num_proc=4, remove_columns=["input", "output", "meta", "id"])
val_data_tokz = train_data.map(transform, num_proc=4, remove_columns=["input", "output", "meta", "id"])

In [20]:
train_data_tokz

Dataset({
    features: ['input_ids', 'text'],
    num_rows: 9423
})

In [21]:
example = train_data_tokz[5]['input_ids']
len(example)

1024

In [22]:
model.resize_token_embeddings(len(tokenizer))
# model = prepare_model_for_int8_training(model)
# model = get_peft_model(model, lora_peft_config)

Embedding(32002, 4096)

In [23]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    f"../{model_dir}/{model_name}-finetuned-{dataset_name}",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # per_device_eval_batch_size=4,
    optim="paged_adamw_32bit",
    save_steps=100,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=3.0,
    max_steps=-1,
    warmup_ratio=0.3,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    # metric_for_best_model="eval_loss",
    # greater_is_better=False,
    # evaluation_strategy="epoch",
    # prediction_loss_only=True,
    # save_strategy="epoch",
)

In [24]:
metric = evaluate.load("rouge")

In [25]:
def compute_metrics(eval_pred):
    logits, past_keys = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=past_keys)

In [26]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data_tokz,
    peft_config=peft_params,
    dataset_text_field='text',
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
    # eval_dataset=val_data_tokz,
    # compute_metrics=compute_metrics,
)



In [27]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
import gc
del variables
gc.collect()

In [28]:
trainer.train()
trainer.save_model(model_dir)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: ignored