# Download all the Python Libraries

In [1]:
# Check if the copmuter is on google colab
import sys
if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    !pip install rich
    !pip install "accelerate>=0.16.0,<1" 
    !pip install "torch>=1.13.1"
    !pip install "transformers[torch]>=4.28.1,<5" 
    !pip install "datasets>=1.14.0,<2"
    !pip install bitsandbytes
    !pip install sentencepiece
    !pip install triton
    !pip install einops
    !pip install safetensors
    !pip install langchain
    !pip install gradio
    !pip install -q -U git+https://github.com/huggingface/peft.git
    !pip install -q datasets
else:
    print("Not running on Google Colab")
from rich import print
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
ROOT_PATH = Path(__file__).parent.parent

Not running on Google Colab


# Check the GPU env
1. You can check which GPU is attached to the Google Colab runtime and how efficient it is.
2. Check whether the GPU supports bfloat16 efficiently, since most models are pre-trained in bfloat16.

In [2]:
import torch
from rich import print
if torch.cuda.is_available():
    !nvidia-smi
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print("Cuda capability: ", torch.cuda.get_device_capability(0))
    '''
    On pre-ampere hardware bf16 works, but doesn't provide speed-ups compared to fp32 matmul operations, and some matmul operations are failing outright, so this check is more like "guaranteed to work and be performant" than "works somehow".  https://github.com/pytorch/pytorch/issues/75427
    '''
    print(f"bfloat16 support: { torch.cuda.is_bf16_supported()}") 

# Set the Notebook's Random Seed to Ensure Reproducibility

In [3]:
from transformers import set_seed

# One fixed seed for the whole notebook so that repeated runs are comparable.
DEFAULT_SEED = 42
set_seed(seed=DEFAULT_SEED)

# Download the Dataset from the Hugging Face Hub

In [5]:
from datasets import load_dataset

# Load the "Rami/prompts" dataset (served from the local cache when present)
# and show a summary of what was loaded.
dataset = load_dataset("Rami/prompts")
print(dataset)

Found cached dataset parquet (/home/null/.cache/huggingface/datasets/Rami___parquet/Rami--prompts-a4a4f069a7addc53/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

# Split the dataset 

# Download the Tokenizers
1. We are using the Dolly model, which was trained on top of the Pythia model. Instead of using the Dolly tokenizer directly, we recreate the Dolly tokenizer from the Pythia tokenizer.

In [11]:
from transformers import AutoTokenizer

# Markers that delimit the sections of a prompt.
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
RESPONSE_KEY_NL = RESPONSE_KEY + "\n"
DEFAULT_SEED = 42

# Hub ids of the Dolly checkpoint and the Pythia checkpoint.
PRETRAINED_MODEL_NAME_OR_PATH = "databricks/dolly-v2-3b"
eleutherai_python_3b = "EleutherAI/pythia-2.8b"

# Load the Dolly tokenizer and print it for reference.
dolly_v2_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
print(dolly_v2_tokenizer)

# Load the Pythia tokenizer and print it before any modification.
pythia_tokenizer = AutoTokenizer.from_pretrained(eleutherai_python_3b)
print(pythia_tokenizer)

# Reuse the end-of-sequence token as the padding token.
pythia_tokenizer.pad_token = pythia_tokenizer.eos_token

# Register the end / instruction / response markers as special tokens.
pythia_tokenizer.add_special_tokens(
    {"additional_special_tokens": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY]}
)

# Print again so the added special tokens are visible.
print(pythia_tokenizer)


# Download the Model
1. Torch Datat

## Set Up the BitsAndBytes Config

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# NOTE(review): `model_id` is not referenced anywhere else in the visible
# cells; kept because other cells may rely on it.
model_id = "EleutherAI/gpt-neox-20b"

# bitsandbytes quantization settings used when loading the model.
bnb_config = BitsAndBytesConfig(
    # 4-bit settings: NF4 quantization type, double quantization enabled,
    # bfloat16 as the compute dtype.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    # int8-specific settings, passed through unchanged.
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)


## Download the LM Models
Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [None]:
from transformers import AutoModelForCausalLM

# Number of visible CUDA devices.
n_gpus = torch.cuda.device_count()

# Free memory (whole GB) on the current device.
free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)

# Per-device memory cap handed to `device_map="auto"`: each GPU's currently
# free memory minus 2 GB of headroom, keyed by device index.
max_memory = {
    i: f"{int(torch.cuda.mem_get_info(i)[0] / 1024**3) - 2}GB"
    for i in range(n_gpus)
}

# Load the causal LM quantized according to `bnb_config`.
# - The keyword is `quantization_config`; under the previous name
#   (`quantize_config`) the bitsandbytes settings were never applied.
# - `load_in_4bit` / `load_in_8bit` are not passed separately: the
#   quantization config already carries them, and newer transformers
#   releases reject passing both at once.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=PRETRAINED_MODEL_NAME_OR_PATH,
    trust_remote_code=True,
    use_cache=False,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
    max_memory=max_memory,
    quantization_config=bnb_config,
)

print(model)

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing first, then let PEFT apply its k-bit
# training preprocessing. `model` is rebound to the prepared model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model=model)

## Set Up the LoRA Model

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank-8 adapters on the `query_key_value` modules,
# scaling alpha 32, 5% dropout, no bias training, causal-LM task type.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with the LoRA adapters.
model = get_peft_model(model, config)

# Report trainable vs. total parameters. This is a method on the PEFT model;
# no free function `print_trainable_parameters` is defined or imported in
# this notebook, so the previous bare call raised NameError.
model.print_trainable_parameters()

# References
[1] [Dolly Github](https://github.com/databrickslabs/dolly/blob/5021d941d95dddcf1f00d978d7f944709873f419/training/trainer.py#L138)
[2] https://gist.github.com/Birch-san/57878c4a27cf34f57d3e861865a7d0a2
[3]