In [2]:
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [15]:

def load_model(model_name, bnb_config, max_memory_mb=24000):
    """
    Load a causal language model and its tokenizer.

    :param model_name: Model identifier handed to ``from_pretrained``
    :param bnb_config: Quantization config applied while loading the weights;
        pass ``None`` to load the checkpoint without quantization
    :param max_memory_mb: Per-GPU memory budget (in MB) used when the quantized
        model is dispatched across the visible GPUs
    :return: Tuple ``(model, tokenizer)``; the tokenizer's pad token is set to
        its EOS token
    """
    load_kwargs = {}
    if bnb_config is not None:
        # The LoRA target discovery later in this notebook only matches
        # bnb.nn.Linear4bit layers, so the quantization config must actually
        # reach from_pretrained.
        load_kwargs["quantization_config"] = bnb_config
        # dispatch efficiently the model on the available resources
        load_kwargs["device_map"] = "auto"
        n_gpus = torch.cuda.device_count()
        if n_gpus:
            # Only cap memory when at least one GPU is visible, so an empty
            # mapping is never passed along.
            load_kwargs["max_memory"] = {i: f"{max_memory_mb}MB" for i in range(n_gpus)}

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Needed for LLaMA tokenizer
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


In [16]:
# Identifier of the base checkpoint that load_model() is called with below.
base_model_name = "meta-llama/Llama-2-7b-hf"


In [17]:

def create_bnb_config():
    """
    Build the bitsandbytes quantization settings used when loading the model.

    :return: BitsAndBytesConfig requesting 4-bit loading with the "nf4" quant
        type, double quantization and bfloat16 as compute dtype
    """
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**quantization_options)

In [18]:
# Quantization settings that are handed to load_model() in a later cell.
bnb_config = create_bnb_config()

In [19]:
# Was bound to the misspelled name "bbnb_config", which nothing ever read;
# the config consumed by load_model() is called bnb_config.
bnb_config = create_bnb_config()

In [20]:
# Fetch the base model and tokenizer; the recorded run of this cell ended in a
# download timeout (see the ConnectionError in the output below).
model, tokenizer = load_model(base_model_name, bnb_config)

Downloading (…)fetensors.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 14.8MB/s]
Downloading (…)of-00002.safetensors: 100%|██████████| 9.98G/9.98G [28:11<00:00, 5.90MB/s]
Downloading shards:  50%|█████     | 1/2 [40:01<40:01, 2401.68s/it]


ConnectionError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out.

In [None]:
def create_peft_config(modules):
    """
    Build the LoRA configuration used to wrap the model for fine-tuning.

    :param modules: Names of the modules to apply LoRA to
    :return: LoraConfig for a causal-LM task with rank 16, alpha 64,
        dropout 0.1 and no bias training
    """
    lora_options = dict(
        r=16,  # dimension of the updated matrices
        lora_alpha=64,  # parameter for scaling
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )
    return LoraConfig(target_modules=modules, **lora_options)

In [None]:
def preprocess_dataset(tokenizer: "AutoTokenizer", max_length: int, seed: int, dataset: "Dataset"):
    """Format & tokenize the dataset so it is ready for training.

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed (int): Seed passed to the final shuffle
    :param dataset: Dataset object exposing ``map``, ``filter`` and ``shuffle``
        (it is not a string); its rows must carry the 'instruction', 'context',
        'response' and 'category' columns
    :return: The tokenized, filtered and shuffled dataset
    """
    print("Preprocessing dataset...")

    # Add the formatted prompt ("text" column) to each sample
    dataset = dataset.map(create_prompt_formats)

    # Tokenize batch-wise and drop the raw text columns afterwards
    tokenize_batch = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        tokenize_batch,
        batched=True,
        remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # Keep only samples strictly shorter than max_length. preprocess_batch
    # truncates to max_length, so this drops the samples that hit the limit.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    return dataset.shuffle(seed=seed)


In [None]:
def get_max_length(model):
    """
    Look up the maximum sequence length advertised by the model config.

    The config attributes "n_positions", "max_position_embeddings" and
    "seq_length" are probed in that order; the first truthy value wins.
    When none of them is set, 1024 is used.

    :param model: Object exposing a ``config`` attribute
    :return: The maximum length found, or the 1024 default
    """
    config = model.config
    for attribute in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(config, attribute, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    else:
        # No candidate attribute carried a usable value
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the "text" entries of a batch.

    :param batch: Mapping with a "text" key holding the prompts to tokenize
    :param tokenizer: Callable tokenizer
    :param max_length: Token limit; longer inputs are truncated to it
    :return: Whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    tokenizer_options = {"max_length": max_length, "truncation": True}
    return tokenizer(texts, **tokenizer_options)


In [None]:
def create_prompt_formats(sample):
    """
    Assemble the training prompt for one sample and store it under "text".

    The prompt consists of an intro blurb, the instruction, the optional
    input context (skipped when the sample's 'context' is empty), the response
    and an end marker, joined by two newline characters.

    :param sample: Sample dictionary with 'instruction', 'context' and
        'response' keys; it is modified in place
    :return: The same sample, now carrying the "text" key
    """
    sections = [
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        f"### Instruction:\n{sample['instruction']}",
    ]

    context = sample["context"]
    if context:
        sections.append(f"Input:\n{context}")

    sections.append(f"### Response:\n{sample['response']}")
    sections.append("### End")

    sample["text"] = "\n\n".join(sections)
    return sample

In [None]:

def find_all_linear_names(model):
    """
    Collect the leaf names of all 4-bit linear layers in the model.

    Every module that is an instance of ``bnb.nn.Linear4bit`` contributes the
    last component of its dotted name; 'lm_head' is excluded.

    :param model: Object exposing ``named_modules()``
    :return: List of unique module names (order not defined)
    """
    quantized_linear = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split(".")[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, quantized_linear)
    }
    leaf_names.discard("lm_head")  # needed for 16-bit
    return list(leaf_names)

In [None]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: Object exposing ``named_parameters()``
    :param use_4bit: When True, the trainable count is halved before printing
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Floor division keeps the count an int: "/=" turned it into a float,
        # which the ",d" format spec below rejects with a ValueError.
        trainable_params //= 2
    # Guard against a model without parameters instead of dividing by zero
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [None]:
# Seed forwarded to preprocess_dataset(), which uses it for shuffling.
seed = 42

# Load the databricks dataset from Hugging Face
# NOTE(review): load_dataset is already imported in the first cell; this import is redundant.
from datasets import load_dataset

dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Token limit read from the config of the model loaded earlier.
max_length = get_max_length(model)

# Rebinds `dataset` to the tokenized version (only 'input_ids' and 'attention_mask'
# remain, see the output of the next cell), so this cell cannot be re-run on its own result.
dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)


Found cached dataset json (/home/lukas/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-7427aa6e57c34282/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Loading cached processed dataset at /home/lukas/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-7427aa6e57c34282/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-499895df8962eaa7.arrow
Loading cached processed dataset at /home/lukas/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-7427aa6e57c34282/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-00cdfdb03a001d54.arrow
Loading cached processed dataset at /home/lukas/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-7427aa6e57c34282/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7e9d1ea528656142.arrow
Loading cached shuffled indices for dataset at /home/lukas/.c

Found max lenth: 4096
Preprocessing dataset...


In [None]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 14999
})

In [None]:
def train(model, tokenizer, dataset, output_dir):
    """
    Wrap the model with LoRA adapters, run a short training and save the result.

    Steps: enable gradient checkpointing, prepare the model for k-bit training,
    attach LoRA to the modules found by ``find_all_linear_names``, train for 20
    steps with the Hugging Face ``Trainer``, then save the trained model.

    :param model: Model to fine-tune
    :param tokenizer: Tokenizer handed to the language-modeling data collator
    :param dataset: Tokenized training dataset
    :param output_dir: Directory that receives the final ``save_pretrained``
        output (the Trainer itself writes to "outputs")
    :return: None
    """
    # Gradient checkpointing reduces memory usage during fine-tuning
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # Wrap the model with LoRA on the discovered target modules
    lora_targets = find_all_linear_names(model)
    model = get_peft_model(model, create_peft_config(lora_targets))

    # Report the share of trainable parameters
    print_trainable_parameters(model)

    # Training parameters
    training_args = TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    )
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=collator,
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Verifying the datatypes before training: element count per parameter dtype
    numel_by_dtype = {}
    for _, param in model.named_parameters():
        numel_by_dtype[param.dtype] = numel_by_dtype.get(param.dtype, 0) + param.numel()
    total = sum(numel_by_dtype.values())
    for dtype, count in numel_by_dtype.items():
        print(dtype, count, count / total)

    # Launch training
    print("Training...")
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)
    ###

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights (drops this function's references only)
    del model
    del trainer
    torch.cuda.empty_cache()
    
    
# Directory that train() saves the fine-tuned model into; a later cell
# reloads the model from this same path.
output_dir = "results/llama2/final_checkpoint3"
train(model, tokenizer, dataset, output_dir)

all params: 3,540,389,888 || trainable params: 39,976,960 || trainable%: 1.1291682911958425
torch.float32 302387200 0.08541070604255438
torch.uint8 3238002688 0.9145892939574456
Training...


[34m[1mwandb[0m: Currently logged in as: [33ml2k2[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.5446
2,1.9185
3,1.6438
4,1.5944
5,1.5932
6,1.2774
7,1.3211
8,1.3192
9,1.2997
10,1.2163


***** train metrics *****
  epoch                    =       0.01
  total_flos               =   629604GF
  train_loss               =     1.3982
  train_runtime            = 0:00:52.13
  train_samples_per_second =      1.535
  train_steps_per_second   =      0.384
{'train_runtime': 52.1326, 'train_samples_per_second': 1.535, 'train_steps_per_second': 0.384, 'total_flos': 676032502579200.0, 'train_loss': 1.3981655985116959, 'epoch': 0.01}
Saving last checkpoint of the model...


In [None]:
# Finish the Weights & Biases run (the training output above shows a wandb login);
# the run summary is printed below.
import wandb
wandb.finish()

0,1
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁███
train/global_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇███
train/learning_rate,▅██▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/loss,█▆▄▄▄▃▃▃▃▃▂▁▂▃▃▃▃▄▄▂
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,0.01
train/global_step,20.0
train/learning_rate,0.0
train/loss,1.0383
train/total_flos,676032502579200.0
train/train_loss,1.39817
train/train_runtime,52.1326
train/train_samples_per_second,1.535
train/train_steps_per_second,0.384


In [None]:
model_name = "meta-llama/Llama-2-7b-hf"

# train() does not return the PEFT-wrapped model, so the notebook-level `model`
# has no merge_and_unload(). Reload the saved LoRA adapter together with its
# base model from output_dir, in bfloat16, before merging.
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
merged_model = model.merge_and_unload()

output_merged_dir = "results/llama2/final_merged_checkpoint2"
os.makedirs(output_merged_dir, exist_ok=True)
merged_model.save_pretrained(output_merged_dir, safe_serialization=True)

# save tokenizer for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('results/llama2/final_merged_checkpoint/tokenizer_config.json',
 'results/llama2/final_merged_checkpoint/special_tokens_map.json',
 'results/llama2/final_merged_checkpoint/tokenizer.json')

In [None]:
# NOTE(review): AutoPeftModelForCausalLM is already imported in the first cell.
from peft import AutoPeftModelForCausalLM

# Reload the model saved by train(); the path is the same as `output_dir`.
model = AutoPeftModelForCausalLM.from_pretrained(
    "results/llama2/final_checkpoint3",
    low_cpu_mem_usage=True,
)

# Merge LoRA and base model
merged_model = model.merge_and_unload()

# Save the merged model
# (no safe_serialization argument here, unlike the other save cells)
merged_model.save_pretrained("merged_model_unsafe")
tokenizer.save_pretrained("merged_model_unsafe")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('merged_model_unsafe/tokenizer_config.json',
 'merged_model_unsafe/special_tokens_map.json',
 'merged_model_unsafe/tokenizer.json')

In [None]:
print(tokenizer.decode(dataset[2]['input_ids'], skip_special_tokens=True))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Given this paragraph, how many public high schools are in Arlington, Virginia?

Input:
Arlington Public Schools operates the county's public K-12 education system of 22 elementary schools; 6 middle schools (Dorothy Hamm Middle School, Gunston Middle School, Kenmore Middle School, Swanson Middle School, Thomas Jefferson Middle School, and Williamsburg Middle School); and 3 public high schools (Wakefield High School, Washington-Liberty High School, and Yorktown High School). H-B Woodlawn and Arlington Tech are alternative public schools. Arlington County spends about half of its local revenues on education. For the FY2013 budget, 83 percent of funding was from local revenues, and 12 percent from the state. Per pupil expenditures are expected to average $18,700, well above its neighbors, Fairfax County ($13,600) and Montgomery County ($14,900).

### Response:
There a

In [None]:

# Save the merged model
# NOTE(review): the recorded run of this cell failed with
# "NotImplementedError: Cannot copy out of meta tensor; no data!". Presumably some
# weights of merged_model were still on the meta device (it was loaded with
# low_cpu_mem_usage=True) - confirm before relying on this export.
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")




NotImplementedError: Cannot copy out of meta tensor; no data!

In [None]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(
            in_features=4096, out_features=4096, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear4bit(
            in_features=4096, out_features=4096, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (de

In [None]:
# Quick generation check with a free-form prompt.
text = "Hello my name is"
# Hard-coded to the first CUDA device; the tokenized inputs are moved there.
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
# Generate at most 20 new tokens and print the decoded text without special tokens.
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Hello my name is Melissa. nobody knows me and i'm 13 years old. I have been writing


In [None]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 14999
})

In [12]:
from datasets import load_dataset

# Load the custom JSON dataset and split it 70/30 into train and test.
# The fixed seed makes the shuffled split identical on every run.
ds = load_dataset("json", data_files="dataset/training_data.json")
ds = ds["train"].train_test_split(test_size=0.3, seed=42)
ds["train"]



Dataset({
    features: ['user', 'answer'],
    num_rows: 375
})

In [38]:
dataset[1]

{'input_ids': [1,
  13866,
  338,
  385,
  15278,
  393,
  16612,
  263,
  3414,
  29889,
  14350,
  263,
  2933,
  393,
  7128,
  2486,
  1614,
  2167,
  278,
  2009,
  29889,
  13,
  13,
  2277,
  29937,
  2799,
  4080,
  29901,
  13,
  22110,
  21614,
  278,
  1298,
  297,
  22556,
  29973,
  13,
  13,
  4290,
  29901,
  13,
  29911,
  9517,
  338,
  263,
  1153,
  3522,
  7980,
  393,
  338,
  5318,
  2845,
  29689,
  2750,
  263,
  2323,
  23995,
  296,
  313,
  2976,
  793,
  29897,
  470,
  1546,
  1023,
  10907,
  310,
  1023,
  10769,
  1269,
  313,
  29881,
  283,
  7586,
  467,
  7806,
  4847,
  3913,
  263,
  22556,
  1153,
  3522,
  393,
  338,
  851,
  686,
  411,
  13793,
  304,
  21283,
  263,
  298,
  2952,
  14051,
  495,
  8287,
  10664,
  411,
  7091,
  975,
  470,
  2820,
  263,
  7787,
  322,
  964,
  278,
  23995,
  296,
  29915,
  29879,
  8973,
  29889,
  450,
  1203,
  310,
  278,
  3748,
  338,
  304,
  767,
  7297,
  4090,
  276,
  278,
  8287,
  297,
  1316

In [41]:
print(tokenizer.decode(dataset[2]['input_ids'], skip_special_tokens=True))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Given this paragraph, how many public high schools are in Arlington, Virginia?

Input:
Arlington Public Schools operates the county's public K-12 education system of 22 elementary schools; 6 middle schools (Dorothy Hamm Middle School, Gunston Middle School, Kenmore Middle School, Swanson Middle School, Thomas Jefferson Middle School, and Williamsburg Middle School); and 3 public high schools (Wakefield High School, Washington-Liberty High School, and Yorktown High School). H-B Woodlawn and Arlington Tech are alternative public schools. Arlington County spends about half of its local revenues on education. For the FY2013 budget, 83 percent of funding was from local revenues, and 12 percent from the state. Per pupil expenditures are expected to average $18,700, well above its neighbors, Fairfax County ($13,600) and Montgomery County ($14,900).

### Response:
There a

In [26]:
text2="""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Given this paragraph, how many public high schools are in Arlington, Virginia?

Input:
Arlington Public Schools operates the county's public K-12 education system of 22 elementary schools; 6 middle schools (Dorothy Hamm Middle School, Gunston Middle School, Kenmore Middle School, Swanson Middle School, Thomas Jefferson Middle School, and Williamsburg Middle School); and 3 public high schools (Wakefield High School, Washington-Liberty High School, and Yorktown High School). H-B Woodlawn and Arlington Tech are alternative public schools. Arlington County spends about half of its local revenues on education. For the FY2013 budget, 83 percent of funding was from local revenues, and 12 percent from the state. Per pupil expenditures are expected to average $18,700, well above its neighbors, Fairfax County ($13,600) and Montgomery County ($14,900).

### Response:
"""

In [1]:

# Generate up to 100 new tokens for the instruction-formatted prompt in text2.
# NOTE(review): depends on `tokenizer`, `model` and `device` from earlier cells; the
# recorded run raised NameError because `tokenizer` was not defined in that session.
inputs = tokenizer(text2, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

NameError: name 'tokenizer' is not defined