# Intro

Work captured here: https://github.com/cytora/cytora-llm-testing/issues/8




In [1]:
# Hugging Face Hub id of the base checkpoint that gets fine-tuned.
model_id: str = "EleutherAI/pythia-12b"

# Hugging Face Hub id of the instruction dataset.
dataset_name: str = "databricks/databricks-dolly-15k"

# Seed shared by the dataset shuffle and the train/test split.
seed: int = 42
# Passed to both load_model() and TrainingArguments.
gradient_checkpointing: bool = False

In [2]:
import torch

def get_free_memory():
    """Return a memory budget string for every visible CUDA device.

    Each device is queried individually. ``torch.cuda.mem_get_info(device)``
    returns a ``(free_bytes, total_bytes)`` pair for that single device, so
    indexing one such pair by GPU number (as this function used to do) reports
    GPU 0's free and total memory instead of the free memory of GPU 0 and GPU 1.

    Returns:
        list[str]: One entry per device, formatted like ``"43GB"``: the free
        memory in whole GiB minus 2 GiB of headroom, never below ``"0GB"``.
        Empty when no CUDA device is visible.
    """
    headroom_gb = 2
    budgets = []
    for device_index in range(torch.cuda.device_count()):
        free_bytes, _total_bytes = torch.cuda.mem_get_info(device_index)
        free_in_gb = free_bytes // 1024**3
        # Clamp so a nearly-full device yields "0GB" rather than a negative size.
        budgets.append(f"{max(free_in_gb - headroom_gb, 0)}GB")
    return budgets


get_free_memory()

['43GB', '45GB']

In [3]:
from datetime import datetime

# Captured once per notebook run; later used to name the run's output folder.
timestamp = format(datetime.now(), "%Y-%m-%dT%H:%M:%S")

In [4]:
import os

# Upper bound on numexpr's worker threads; set via the environment before the library is imported.
os.environ.update({"NUMEXPR_MAX_THREADS": "24"})

In [5]:
# import os

# huggingface_token = os.getenv("HF_TOKEN")
# !huggingface-cli login --token $huggingface_token

In [6]:
import logging

# Root logger: INFO level, timestamped records that include the logger name.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)

# Per-logger thresholds that override the INFO default for two named loggers.
for _logger_name, _threshold in (("py4j", logging.WARNING), ("sh.command", logging.ERROR)):
    logging.getLogger(_logger_name).setLevel(_threshold)

# Logger used by the helper functions defined in this notebook.
logger = logging.getLogger(__name__)

In [7]:
![ -e databricks_consts.py ] && echo "databricks_consts.py already downloaded" ||  wget -O databricks_consts.py https://raw.githubusercontent.com/databrickslabs/dolly/master/training/consts.py

databricks_consts.py already downloaded


# Load tokenizer

In [8]:
from transformers import PreTrainedTokenizer, AutoTokenizer
from databricks_consts import END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL

def load_tokenizer(pretrained_model_name_or_path: str) -> PreTrainedTokenizer:
    """Build the tokenizer for a checkpoint and register the prompt marker tokens.

    Args:
        pretrained_model_name_or_path: Identifier handed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer, with its pad token set to its EOS token and with
        ``END_KEY``, ``INSTRUCTION_KEY`` and ``RESPONSE_KEY_NL`` added as
        additional special tokens.
    """
    logger.info(f"Loading tokenizer for {pretrained_model_name_or_path}")
    prompt_markers = [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL]

    tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    # Padding reuses the end-of-sequence token.
    tok.pad_token = tok.eos_token
    tok.add_special_tokens({"additional_special_tokens": prompt_markers})
    return tok

tokenizer = load_tokenizer(model_id)

  from .autonotebook import tqdm as notebook_tqdm
2023-06-13 22:45:41 INFO [__main__] Loading tokenizer for EleutherAI/pythia-12b


# Load base model

In [9]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType


def load_model(
    pretrained_model_name_or_path: str,
    *,
    gradient_checkpointing: bool = False,
    quantize_4bit: bool = False,
) -> AutoModelForCausalLM:
    """Load a causal LM and wrap it with LoRA adapters on ``query_key_value``.

    Args:
        pretrained_model_name_or_path: Identifier handed to
            ``AutoModelForCausalLM.from_pretrained``.
        gradient_checkpointing: When True, the KV cache is disabled at load time
            and gradient checkpointing is enabled on the returned model.
        quantize_4bit: When True, load the base weights with a 4-bit NF4
            bitsandbytes config (QLoRA) and run PEFT's k-bit preparation step.
            The default (False) is the plain-LoRA path, which loads the model
            unquantized and skips that step because it only applies to
            quantized models.

    Returns:
        The PEFT-wrapped model. Its trainable-parameter summary is printed as a
        side effect.
    """
    logger.info(f"Loading model for {pretrained_model_name_or_path}")

    load_kwargs = {
        "trust_remote_code": True,
        "use_cache": not gradient_checkpointing,
        "device_map": "auto",
    }
    if quantize_4bit:
        # QLoRA: build the quantization config only when it is actually used.
        load_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )

    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **load_kwargs)

    if quantize_4bit:
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)
    elif gradient_checkpointing:
        # NOTE(review): with a frozen base model, checkpointed blocks need inputs
        # that require grad for the adapter weights to receive gradients; confirm
        # against the installed transformers/peft versions.
        model.enable_input_require_grads()

    # The dimension (rank) used by the LoRA update matrices
    LORA_R = 8
    # Scaling factor
    LORA_ALPHA = 32
    LORA_DROPOUT = 0.05

    # r sets the rank of the update matrices and therefore the number of trainable
    # parameters; alpha only scales the LoRA update and adds no parameters.
    config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",  # Specifies if the bias parameters should be trained
        task_type=TaskType.CAUSAL_LM,
        target_modules=["query_key_value"],
        # inference_mode=False  # default is False
    )
    model = get_peft_model(model, config)

    if gradient_checkpointing:
        model.gradient_checkpointing_enable()

    model.print_trainable_parameters()
    return model


model = load_model(model_id, gradient_checkpointing=gradient_checkpointing)

# If you see the warning below:
# UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files
# it can be ignored - bitsandbytes just doesn't know about Ubuntu's alternatives system.

2023-06-13 22:45:41 INFO [torch.distributed.nn.jit.instantiator] Created a temporary directory at /tmp/tmpers2ns4x
2023-06-13 22:45:41 INFO [torch.distributed.nn.jit.instantiator] Writing /tmp/tmpers2ns4x/_remote_module_non_scriptable.py


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/opyate/anaconda3/envs/pythia-dolly-lora-py39/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/opyate/anaconda3/envs/pythia-dolly-lora-py39/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


2023-06-13 22:45:42 INFO [__main__] Loading model for EleutherAI/pythia-12b
Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  4.13s/it]


trainable params: 5,898,240 || all params: 11,851,970,560 || trainable%: 0.04976590154473013


# Load dataset

In [10]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from databricks_consts import PROMPT_WITH_INPUT_FORMAT, PROMPT_NO_INPUT_FORMAT
from functools import partial
from typing import Any, Dict, List, Tuple, Union
import numpy as np


def preprocess_batch(batch: Dict[str, List], tokenizer: AutoTokenizer, max_length: int) -> dict:
    """Tokenize the ``"text"`` entries of a batch, truncating at ``max_length``.

    Args:
        batch: Column-name to values mapping; only ``batch["text"]`` is read.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=..., truncation=True)``.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = batch["text"]
    tokenizer_options = {"max_length": max_length, "truncation": True}
    return tokenizer(texts, **tokenizer_options)


def load_training_dataset(path_or_dataset) -> Dataset:
    """Load the ``train`` split of a dataset and add a formatted ``text`` field.

    Args:
        path_or_dataset: Passed straight to ``load_dataset``.

    Returns:
        Dataset: The train split, where every record gained a ``text`` field
        rendered from one of the two Dolly prompt templates.

    Raises:
        ValueError: From the per-record mapping function when a record has an
            empty ``instruction`` or ``response``.
    """
    logger.info(f"Loading dataset: {path_or_dataset}")
    train_split = load_dataset(path_or_dataset)["train"]
    logger.info("Found %d rows", train_split.num_rows)

    def _render_prompt(rec):
        instruction, response = rec["instruction"], rec["response"]

        if not instruction:
            raise ValueError(f"Expected an instruction in: {rec}")
        if not response:
            raise ValueError(f"Expected a response in: {rec}")

        # Some instructions come with an input that provides context (for example a
        # Wikipedia passage to extract information from); others, such as open QA,
        # have none.  The template is chosen according to whether context is present.
        context = rec.get("context")
        fields = {"instruction": instruction, "response": response}
        if context:
            template = PROMPT_WITH_INPUT_FORMAT
            fields["input"] = context
        else:
            template = PROMPT_NO_INPUT_FORMAT

        rec["text"] = template.format(**fields)
        return rec

    return train_split.map(_render_prompt)

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, training_dataset: str, seed=42) -> Dataset:
    """Load, tokenize, length-filter and shuffle the training dataset.

    Args:
        tokenizer (AutoTokenizer): Tokenizer tied to the model.
        max_length (int): Maximum number of tokens to emit from tokenizer.
        training_dataset (str): Dataset identifier forwarded to ``load_training_dataset``.
        seed: Seed for the final shuffle.

    Returns:
        Dataset: HuggingFace dataset holding only the tokenizer outputs, without
        any record whose token count reached ``max_length``.
    """
    raw_dataset = load_training_dataset(training_dataset)

    logger.info("Preprocessing dataset")

    def _tokenize(batch):
        return preprocess_batch(batch, max_length=max_length, tokenizer=tokenizer)

    source_columns = ["instruction", "context", "response", "text", "category"]
    tokenized = raw_dataset.map(_tokenize, batched=True, remove_columns=source_columns)

    # A record that fills max_length may have been truncated and so may be missing
    # the end keyword; keep only records strictly shorter than the limit.
    logger.info("Processed dataset has %d rows", tokenized.num_rows)

    def _is_untruncated(rec):
        return len(rec["input_ids"]) < max_length

    untruncated = tokenized.filter(_is_untruncated)
    logger.info("Processed dataset has %d rows after filtering for truncated records", untruncated.num_rows)

    logger.info("Shuffling dataset")
    shuffled = untruncated.shuffle(seed=seed)

    logger.info("Done preprocessing")

    return shuffled


class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that masks the labels up to and including the response key."""

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate ``examples`` and mask every label up to and including the response key.

        Raises:
            RuntimeError: If the response key token is absent from an example's labels.
        """
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline.  Encoding it should
        # yield a single token; only the first encoded id is searched for.
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)
        response_key_id = response_token_ids[0]

        collated_labels = batch["labels"]
        masked_labels = collated_labels.clone()

        for row in range(len(examples)):
            hits = np.where(collated_labels[row] == response_key_id)[0]
            if len(hits) == 0:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {batch["labels"][row]}'
                )

            # First occurrence wins; the mask extends one position past it.
            prompt_end = hits[0] + 1

            # -100 makes the pytorch loss function ignore these positions.
            masked_labels[row, :prompt_end] = -100

        batch["labels"] = masked_labels

        return batch

In [11]:
# Use the first of these model-config attributes that is present and truthy as the
# maximum sequence length; fall back to 1024 when none of them is.
max_length = None
for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
    max_length = getattr(model.config, length_setting, None)
    if max_length:
        logger.info(f"Found max lenth: {max_length}")
        break
if not max_length:
    max_length = 1024
    logger.info(f"Using default max length: {max_length}")

print(f"max_length: {max_length}")

processed_dataset = preprocess_dataset(tokenizer=tokenizer, max_length=max_length, seed=seed, training_dataset=dataset_name)

# Hold out 20% of the rows for evaluation.  num_rows gives the row count directly;
# len(processed_dataset["input_ids"]) would first materialise the whole token column.
test_size = int(processed_dataset.num_rows * 0.2)
print(f"test_size: {test_size}")

split_dataset = processed_dataset.train_test_split(test_size=test_size, seed=seed)

logger.info("Train data size: %d", split_dataset["train"].num_rows)
logger.info("Test data size: %d", split_dataset["test"].num_rows)

data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
)

2023-06-13 22:46:16 INFO [__main__] Found max lenth: 2048
2023-06-13 22:46:16 INFO [__main__] Loading dataset: databricks/databricks-dolly-15k


max_length: 2048


100%|██████████| 1/1 [00:00<00:00, 972.03it/s]
2023-06-13 22:46:17 INFO [__main__] Found 15011 rows
2023-06-13 22:46:18 INFO [__main__] Preprocessing dataset
2023-06-13 22:46:18 INFO [__main__] Processed dataset has 15011 rows
2023-06-13 22:46:18 INFO [__main__] Processed dataset has 14977 rows after filtering for truncated records
2023-06-13 22:46:18 INFO [__main__] Shuffling dataset
2023-06-13 22:46:18 INFO [__main__] Done preprocessing
2023-06-13 22:46:18 INFO [__main__] Train data size: 11982
2023-06-13 22:46:18 INFO [__main__] Test data size: 2995


test_size: 2995


# Train

In [12]:
import pathlib

# Per-run folder, named after the notebook's start timestamp.
base_folder = "dolly_training/" + f"dolly_{timestamp}"

# Create it (and any missing parents); an already-existing folder is fine.
_run_dir = pathlib.Path(base_folder)
_run_dir.mkdir(exist_ok=True, parents=True)

In [13]:
from transformers import TrainingArguments, Trainer


# Hyper-parameters for the run; each is assigned exactly once.
bf16 = True  # RTX
save_steps = 200  # from dolly databricks notebook
save_total_limit = 20  # from dolly databricks notebook
logging_steps = 10  # from dolly databricks notebook (also the dolly @click default)
eval_steps = 50  # from dolly databricks notebook
epochs = 2
per_device_train_batch_size = 1  # OK for A10 GPUs
per_device_eval_batch_size = per_device_train_batch_size
lr = 5e-6  # from dolly databricks notebook


training_args = TrainingArguments(
    output_dir=f"{base_folder}/output_dir",  # see below
    gradient_accumulation_steps=4,
    warmup_steps=2,
    # max_steps=10,  # replaced with num_train_epochs ???

    # logging_steps=1,  # see below
    optim="paged_adamw_8bit",

    # copied from dolly
    remove_unused_columns=False,  # https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.remove_unused_columns
    report_to="tensorboard",
    load_best_model_at_end=False,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=save_total_limit,

    logging_dir=f"{base_folder}/runs",
    logging_strategy="steps",
    logging_steps=logging_steps,
    evaluation_strategy="steps",
    eval_steps=eval_steps,

    num_train_epochs=epochs,
    gradient_checkpointing=gradient_checkpointing,

    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,

    fp16=False,
    bf16=bf16,
    learning_rate=lr,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    # Reuse the collator built right after the train/test split (it pads to a
    # multiple of 8) instead of constructing a second, differently-configured one.
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/5990 [00:00<?, ?it/s]You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 10/5990 [00:09<1:39:42,  1.00s/it]

{'loss': 3.744, 'learning_rate': 4.993319973279894e-06, 'epoch': 0.0}


  0%|          | 20/5990 [00:18<1:23:57,  1.19it/s]

{'loss': 3.6911, 'learning_rate': 4.98496993987976e-06, 'epoch': 0.01}


  1%|          | 30/5990 [00:26<1:22:55,  1.20it/s]

{'loss': 3.1721, 'learning_rate': 4.976619906479626e-06, 'epoch': 0.01}


  1%|          | 40/5990 [00:35<1:23:02,  1.19it/s]

{'loss': 3.1695, 'learning_rate': 4.968269873079493e-06, 'epoch': 0.01}


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB (GPU 0; 47.50 GiB total capacity; 45.60 GiB already allocated; 34.38 MiB free; 45.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

I'm getting an `OutOfMemoryError` at this point, so I will convert this notebook into a script that I can launch with `accelerate`.

# Save the model

In [None]:
trainer.save_model(output_dir=f"{base_folder}/model_output_dir")

In [None]:
import os

dataset_name_simple = os.path.basename(dataset_name)

# The size is the second dash-separated field of the model name, e.g.
# "EleutherAI/pythia-12b" -> "pythia-12b" -> "12b".  Indexing field [2] raised
# IndexError for such two-field names; a name without any dash is used whole.
_model_name_fields = os.path.basename(model_id).split("-")
model_size = _model_name_fields[1] if len(_model_name_fields) > 1 else _model_name_fields[0]

hf_model_name = f"{dataset_name_simple}_lora_{model_size}_ep{epochs}_lr{lr}_batch{per_device_train_batch_size}"
hf_model_name

In [None]:
# private first, so we can test
model.push_to_hub(f"opyate/{hf_model_name}", private=True)

In [None]:
!ls -la $base_folder/model_output_dir