In [1]:
# Install the fine-tuning stack (-q: quiet output, -U: upgrade if already installed).
# transformers, peft and accelerate are installed from their GitHub repositories rather
# than from a pinned release, so a fresh run may pull different code than an earlier run.
!pip install -q -U git+https://github.com/huggingface/transformers.git
# bitsandbytes is imported further down as `bnb` and backs the 4-bit quantisation config.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
# huggingface_hub is the only package pinned to an exact version in this cell.
# NOTE(review): the unpinned install on the next line may upgrade it again if a dependency requires a newer release — confirm the pin still holds.
!pip install -q -U huggingface_hub==0.16.4
!pip install -q -U datasets trl einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproje

In [2]:
# Authenticate this Colab runtime with a Google account (presumably so gcsfuse can access the buckets mounted below — confirm).
from google.colab import auth
auth.authenticate_user()
# 'PROJECT NAME' is a placeholder: replace it with your own GCP project id before running.
# The braces make IPython substitute the value of the Python expression into the shell command.
!gcloud config set project {'PROJECT NAME'}

Updated property [core/project].


In [3]:
# Register the gcsfuse apt repository for this Ubuntu release (`lsb_release -c -s` prints the release codename).
!echo "deb https://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
# Download the repository signing key and add it to apt's trusted keys.
# NOTE(review): apt warns that this key ends up in the legacy trusted.gpg keyring (apt-key is deprecated) — consider a dedicated keyring instead.
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
# Refresh the package index, then install gcsfuse.
!apt -qq update && apt -qq install gcsfuse

deb https://packages.cloud.google.com/apt gcsfuse-jammy main
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2659  100  2659    0     0  30576      0 --:--:-- --:--:-- --:--:-- 30918
OK
21 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mhttps://packages.cloud.google.com/apt/dists/gcsfuse-jammy/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.[0m
The following NEW packages will be installed:
  gcsfuse
0 upgraded, 1 newly installed, 0 to remove and 21 not upgraded.
Need to get 5,558 kB of archives.
After this operation, 0 B of additional disk space will be used.
Selecting previously unselected package gcsfuse.
(Reading database ... 120874 files and directories currently installed.)
Preparing to unpack .../gcsfuse_1.2.0_amd64.deb ...
Unpacking gcsfuse (1.2.0) 

In [4]:
# GCS buckets that get mounted under /mnt/gs; replace the placeholders with real bucket names.
# Order matters: index 0 is read as the training-data bucket, index 1 as the checkpoint/output bucket.
bucket_names = ['DATA_BUCKET', 'CHECKPOINTS_BUCKET']

In [5]:
# Mount every bucket listed in bucket_names at /mnt/gs/<bucket name> with gcsfuse.
for bn in bucket_names:
    local_path = f'/mnt/gs/{bn}'

    # Create the mount point if it does not exist yet, then mount the bucket onto it.
    # {bn} and {local_path} are substituted by IPython before the shell command runs.
    !mkdir -p {local_path}
    !gcsfuse --implicit-dirs {bn} {local_path}

{"time":"03/11/2023 06:07:00.189269","severity":"INFO","msg":"Start gcsfuse/1.2.0 (Go version go1.21.0) for app \"\" using mount point: /mnt/gs/book_data_c4574d131983d222a2c0e72ed8faa842\n"}
{"time":"03/11/2023 06:07:01.599911","severity":"INFO","msg":"Start gcsfuse/1.2.0 (Go version go1.21.0) for app \"\" using mount point: /mnt/gs/checkpoints_e023c33fb601976be9365f7285881db1\n"}


In [6]:
# Log in to the Hugging Face Hub so the model files can be downloaded.
# The token is deliberately NOT written into the notebook: a literal token would be
# saved with the file and leak as soon as the notebook is shared or committed.
# It is read from the HF_TOKEN environment variable, falling back to a hidden prompt.
import os
from getpass import getpass

from huggingface_hub import login

login(token=os.environ.get('HF_TOKEN') or getpass('Hugging Face token: '))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [7]:
import bitsandbytes as bnb
import os
import torch
from datasets import Dataset
from peft import (
    AutoPeftModelForCausalLM,
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments
)
from trl import SFTTrainer

# Maximum sequence length; passed to SFTTrainer as max_seq_length further down.
MAX_LEN = 1024
# Root directory for training output (checkpoints, final adapter, merged model):
# the mount point of the second bucket in bucket_names.
OUTPUT_DIR = f'/mnt/gs/{bucket_names[1]}'

In [8]:
# 4-bit quantisation settings used when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    # huge drop in vram requirements and compute time for a minimal drop in accuracy
    load_in_4bit=True,
    # 'fp4' or 'nf4'. nf4 ("NormalFloat 4") spaces its 16 quantisation levels according to
    # a normal distribution, which matches how network weights are typically distributed;
    # it is the usual choice for QLoRA-style fine-tuning
    bnb_4bit_quant_type='nf4',
    # weights are stored in 4 bits but de-quantised to fp16 for the actual computations;
    # the larger compute dtype limits overflows and precision loss
    bnb_4bit_compute_dtype=torch.float16,
)

# LoRA adapter settings. target_modules is filled in later, once the model is loaded.
peft_config = LoraConfig(
    # rank of the low-rank adapter matrices; the number of trainable parameters
    # (and the training vram they need) grows with r
    r=32,
    # scaling factor: the adapter output is scaled by lora_alpha / r (here 128 / 32 = 4).
    # It changes how strongly the adapter influences the output, not memory use
    lora_alpha=128,
    # dropout probability applied on the LoRA branch during training
    lora_dropout=.1,
    # do not train any bias terms
    bias='none',
    task_type='CAUSAL_LM',
)

training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # didn't investigate batching much as the data was preprocessed to tightly fit 1024-token windows
    per_device_train_batch_size=1,
    # gradients of 4 consecutive batches are accumulated before each optimizer step
    gradient_accumulation_steps=4,
    # this is relatively low, but lora is unstable with higher: https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2#sensitivity-of-lora-to-learning-rate
    learning_rate=3e-5,
    # Constant learning rate after a warmup phase. The plain 'constant' schedule ignores
    # warmup_ratio entirely, so the warmup requested below would silently never happen;
    # 'constant_with_warmup' is the variant that honours it.
    lr_scheduler_type='constant_with_warmup',
    # max_steps=20, # caps the total number of optimizer steps and overrides num_train_epochs when set
    # number of full passes over all training data; usually higher than 1, but this was a large dataset that converged quickly
    num_train_epochs=1,
    # gradient clipping: gradients are rescaled so that their norm never exceeds this value
    max_grad_norm=.3,
    # mixed-precision training with 16-bit floats instead of 32-bit
    fp16=True,
    # bf16 trades mantissa bits for exponent bits (wider range, fewer overflows than fp16),
    # but only newer gpus support it
    bf16=False,
    # 'paged' lets optimizer state spill to cpu memory during vram spikes;
    # the optimizer state itself is kept in 32-bit to limit overflows and precision loss
    optim='paged_adamw_32bit',
    # save a checkpoint every n training steps; consider lowering on a less-than-stable system
    save_steps=1000,
    # auto-deletes checkpoints such that only the most recent N are kept on file
    save_total_limit=20,
    # log training metrics every 3 steps
    logging_steps=3,
    # fraction of the training steps spent ramping the learning rate up from 0;
    # prevents erratic weight shifts in the first training iterations
    warmup_ratio=.03,
    # batches samples of similar length together, which reduces padding and slightly speeds up training
    group_by_length=True,
)

In [9]:
# Load the training data from the mounted data bucket, then drop the 'genre' and 'title'
# columns (the trainer further down reads the 'text' field).
# Dataset.remove_columns drops the columns directly; the previous identity map() walked
# over every example only to make use of its remove_columns argument.
dataset = Dataset.from_json(f'/mnt/gs/{bucket_names[0]}/books_960.ds.json')
dataset = dataset.remove_columns(['genre', 'title'])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/12082 [00:00<?, ? examples/s]

In [10]:
def find_all_linear_names(model):
    """Collect the names of the model's 4-bit linear layers, for use as LoRA targets.

    Walks ``model.named_modules()`` and, for every ``bnb.nn.Linear4bit`` instance,
    keeps the last component of its dotted module name (e.g. ``'q_proj'`` from
    ``'a.b.q_proj'``). ``'lm_head'`` is left out of the result.

    Args:
        model: object exposing ``named_modules()`` yielding ``(name, module)`` pairs.

    Returns:
        list: the unique layer names, in no particular order.
    """
    target_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, target_cls)
    }
    # the output head is never returned as a target
    leaf_names.discard('lm_head')
    return list(leaf_names)

In [11]:
def print_trainable_parameters(model, use_4bit=False):
    """Print the total and trainable parameter counts of ``model``.

    Args:
        model: object exposing ``named_parameters()`` yielding ``(name, param)``
            pairs, where each ``param`` has ``numel()`` and ``requires_grad``.
        use_4bit: when True, the trainable count is halved before it is reported.

    Prints one line with the total count, the trainable count and the trainable
    percentage. A model without any parameters is reported as 0% trainable.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # Parameters reporting zero elements may carry their real size in `ds_numel`
        # (presumably DeepSpeed-partitioned parameters — confirm).
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Integer division keeps the count an int: true division produced a float,
        # which the `:,d` format below rejects with a ValueError.
        trainable_params //= 2
    # Guard against a model with no parameters instead of dividing by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [12]:
# Hub id of the base model to fine-tune.
model_name = 'meta-llama/Llama-2-13b-chat-hf'

# Load the base model, quantised according to bnb_config.
# use_cache=False turns off the generation key/value cache for training.
# NOTE(review): trust_remote_code=True allows code shipped in the model repository to be
# executed locally — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    use_cache=False,
    device_map='auto',
    trust_remote_code=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [13]:
# Tokenizer matching the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # reuse the end-of-sequence token as the padding token (presumably because this tokenizer defines no pad token — verify for your model)
tokenizer.padding_side = 'right' # pad on the right; NOTE(review): the author reports that 'left' padding is faster but may introduce bugs in quantized training — unverified

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [14]:
# Prepare the quantised model for training and wrap it with LoRA adapters.
# NOTE: this cell rebinds `model` from its own previous value, so re-running it on its
# own would wrap an already wrapped model; re-run the model loading cell first.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
# Use every 4-bit linear layer name found in the model as a LoRA target (the helper leaves out 'lm_head').
modules = find_all_linear_names(model)
peft_config.target_modules = modules
model = get_peft_model(model, peft_config)

print_trainable_parameters(model) # only a small share of the parameters is trainable (see the output): presumably just the LoRA adapter weights, with the base model left frozen

all params: 6,797,153,280 || trainable params: 125,173,760 || trainable%: 1.8415615308884132


In [15]:
# Build the supervised fine-tuning trainer: it trains `model` on the 'text' field of
# `dataset`, with sequences limited to MAX_LEN, using the training arguments defined earlier.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    # tokenizer=tokenizer, # author's note: use this or data_collator; data_collator adds padding dynamically, will lower average length of training samples by maybe 5% over tokenizer
    # The collator is built with mlm=False, i.e. without masked-language-model masking.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=dataset,
    dataset_text_field='text',
    max_seq_length=MAX_LEN,
    args=training_arguments,
)

Map:   0%|          | 0/12082 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning. training_arguments sets output_dir=OUTPUT_DIR and save_steps=1000
# for checkpointing.
trainer.train()

In [17]:
import gc

# Destinations inside OUTPUT_DIR: the trained adapter, and the merged stand-alone model.
final_checkpoint_path = os.path.join(OUTPUT_DIR, 'final_checkpoint')
final_merged_path = os.path.join(OUTPUT_DIR, 'final_merged')

os.makedirs(final_checkpoint_path, exist_ok=True)
os.makedirs(final_merged_path, exist_ok=True)

# Save the trained model held by the trainer.
trainer.model.save_pretrained(final_checkpoint_path)

# free memory for merging weights
del model
del trainer
# Dropping the names is not enough when the objects are still held through reference
# cycles: collect them first, otherwise empty_cache() has nothing to give back.
gc.collect()
torch.cuda.empty_cache()

In [18]:
# merge model with lora adapter
model = AutoPeftModelForCausalLM.from_pretrained(
    final_checkpoint_path,
    device_map='auto',
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

model = model.merge_and_unload()
model.save_pretrained(final_merged_path, safe_serialization=True)

# we didn't change the tokenizer at all, this is just to save locally for quicker load time when using the model later
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(final_merged_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

('/mnt/gs/checkpoints_e023c33fb601976be9365f7285881db1/final_merged/tokenizer_config.json',
 '/mnt/gs/checkpoints_e023c33fb601976be9365f7285881db1/final_merged/special_tokens_map.json',
 '/mnt/gs/checkpoints_e023c33fb601976be9365f7285881db1/final_merged/tokenizer.json')