In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from pynvml import *
from pathlib import Path

curdir = Path(os.getcwd())
sys.path.append(str(curdir.parent.absolute()))

from src.utils.data import read_pickle

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

print_gpu_utilization()

  from .autonotebook import tqdm as notebook_tqdm


GPU memory occupied: 51 MB.


In [2]:
print_gpu_utilization()

checkpoint = 'microsoft/biogpt'

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    # torch_dtype=torch.float16,
).cuda()

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

print_gpu_utilization()

GPU memory occupied: 51 MB.
GPU memory occupied: 2292 MB.


In [3]:
from peft import LoraConfig, get_peft_model 
import copy

# original_model = copy.deepcopy(model)
original_model = model

print_gpu_utilization()

# config = LoraConfig(
#     r=8,
#     lora_alpha=16,
#     # target_modules=["query_key_value"],
#     target_modules=["k_proj", "v_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# original_model = get_peft_model(model, config)

print_gpu_utilization()

print_trainable_parameters(original_model)
print_trainable_parameters(model)

GPU memory occupied: 2292 MB.
GPU memory occupied: 2292 MB.
trainable params: 346763264 || all params: 346763264 || trainable%: 100.00
trainable params: 346763264 || all params: 346763264 || trainable%: 100.00


In [4]:
from datasets import load_dataset

qa_dataset = load_dataset("squad_v2")

In [5]:
def create_prompt(context, question, answer):
  if len(answer["text"]) < 1:
    answer = "Cannot Find Answer"
  else:
    answer = answer["text"][0]
  prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{answer}</s>"
  return prompt_template

mapped_qa_dataset = qa_dataset.map(lambda samples: tokenizer(create_prompt(samples['context'], samples['question'], samples['answers'])))

In [6]:
import transformers
import os
os.environ['WANDB_DISABLED'] = "true"

torch.cuda.reset_peak_memory_stats()

trainer = transformers.Trainer(
    model=original_model, 
    train_dataset=mapped_qa_dataset["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=20, 
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=10,
        learning_rate=1e-3, 
        fp16=True,
        logging_steps=1, 
        output_dir='outputs',
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
original_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
results = trainer.train()
print_summary(results)
print_gpu_utilization()
print_trainable_parameters(original_model)

# BLOOM
# GPU memory occupied: 73010 MB (No LoRA)
# GPU memory occupied: 22054 MB (With LoRA)

# BioGPT
# GPU memory occupied: 53908 MB (With LoRA)
# GPU memory occupied: 55292 MB (No LoRA)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
1,4.1171
2,4.0766
3,3.9105
4,3.8097
5,3.6164
6,3.3912
7,3.4754
8,3.5074
9,3.394
10,3.3963


Attempted to log scalar metric loss:
4.1171
Attempted to log scalar metric learning_rate:
1e-05
Attempted to log scalar metric epoch:
0.0
Attempted to log scalar metric loss:
4.0766
Attempted to log scalar metric learning_rate:
2e-05
Attempted to log scalar metric epoch:
0.0
Attempted to log scalar metric loss:
3.9105
Attempted to log scalar metric learning_rate:
3e-05
Attempted to log scalar metric epoch:
0.0
Attempted to log scalar metric loss:
3.8097
Attempted to log scalar metric learning_rate:
4e-05
Attempted to log scalar metric epoch:
0.0
Attempted to log scalar metric loss:
3.6164
Attempted to log scalar metric learning_rate:
5e-05
Attempted to log scalar metric epoch:
0.0
Attempted to log scalar metric loss:
3.3912
Attempted to log scalar metric learning_rate:
6e-05
Attempted to log scalar metric epoch:
0.0
Attempted to log scalar metric loss:
3.4754
Attempted to log scalar metric learning_rate:
7.000000000000001e-05
Attempted to log scalar metric epoch:
0.0
Attempted to log s

: 