In [None]:
!pip install --user --upgrade git+https://github.com/huggingface/transformers.git
!pip install --upgrade git+https://github.com/huggingface/accelerate.git
!pip install datasets

In [None]:
!python --version

In [None]:
!pip install bitsandbytes
!pip install git+https://github.com/huggingface/peft.git

In [1]:
import gc
import torch
torch.cuda.empty_cache()
gc.collect()

0

In [2]:
import os
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, AutoConfig
from transformers import BitsAndBytesConfig
import torch


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/envs/ctp/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /opt/conda/envs/ctp/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)


[2023-07-13 18:05:11,927] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
data_num=-1
max_source_len = 512
max_target_len=512
#cache_data = 'cache_data/summarize_python'
cache_data = 'batchsize_experiments/pandu'
load = 'Salesforce/codet5p-16b'
# load = "HuggingFaceH4/starchat-alpha"
# Training
epochs=10
lr=5e-3
lr_warmup_steps=200
batch_size_per_replica=1
grad_acc_steps=16
local_rank=-1
deepspeed="ds.json"
fp16=True

# Logging and stuff
save_dir="saved_models/summarize_python"
log_freq=10
save_freq=500
os.makedirs(save_dir, exist_ok=True)

In [4]:
def run_training(model, train_data):
    print(f"Starting main loop")

    training_args = TrainingArguments(
        report_to='tensorboard',
        output_dir=save_dir,
        overwrite_output_dir=False,

        do_train=True,
        save_strategy='epoch',

        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size_per_replica,
        gradient_accumulation_steps=grad_acc_steps,
        learning_rate=lr,
        weight_decay=0.05,
        warmup_steps=lr_warmup_steps,

        logging_dir=save_dir,
        logging_first_step=True,
        logging_steps=log_freq,
        save_total_limit=1,

        dataloader_drop_last=True,
        dataloader_num_workers=2,

        local_rank=local_rank,
#         deepspeed=deepspeed,
#         fp16=fp16,
        bf16 = True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )

    trainer.train()

    if local_rank in [0, -1]:
        final_checkpoint_dir = os.path.join(save_dir, "final_checkpoint")
        model.save_pretrained(final_checkpoint_dir)
        print(f'  ==> Finish training and save to {final_checkpoint_dir}')

In [5]:
#for deepspeed
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '9994' # modify if RuntimeError: Address already in use
os.environ['RANK'] = "0"
os.environ['LOCAL_RANK'] = "0"
os.environ['WORLD_SIZE'] = "1"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(load)
config = AutoConfig.from_pretrained(load, trust_remote_code=True, revision="main")
config.decoder_start_token_id = tokenizer.bos_token_id
config.pad_token_id = tokenizer.pad_token_id
#for deepspeed
config.max_position_embeddings = 512
# print('Model hidden size: ', config.cross_attention_hidden_size)

import math
import numpy as np
def get_model_size(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    model_size = sum([np.prod(p.size()) for p in model_parameters])
    return "{}M".format(round(model_size / 1e+6))

def freeze_decoder_except_xattn_codegen(model):
    print(f'Para before freezing: {model.num_parameters()}, trainable para: {get_model_size(model)}')
    for param in model.decoder.parameters():
        param.requires_grad = False

    num_decoder_layers = model.decoder.config.n_layer
    for i in range(num_decoder_layers):
        each_decoder_layer = model.decoder.transformer.h[i]
        if hasattr(each_decoder_layer, 'crossattention'):
            for param in each_decoder_layer.crossattention.parameters():
                param.requires_grad = True
            each_decoder_layer.crossattention.to(torch.float32)

        if hasattr(each_decoder_layer, 'alpha_xattn'):
            each_decoder_layer.alpha_xattn.requires_grad = True
    print(f'Para after freezing: {model.num_parameters()}, trainable para: {get_model_size(model)}')

def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 512)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])

def preprocess_function(examples):
    source = [ex for ex in examples["question"]]
#     source = [ex for ex in examples["input"]]
    target = [ex for ex in examples["answer"]]

    model_inputs = tokenizer(source, max_length=max_source_len, padding="max_length", truncation=True)
    labels = tokenizer(target, max_length=max_target_len, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"].copy()
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["labels"]
    ]
    return model_inputs


def load_tokenize_data():
#     if os.path.exists(cache_data):
#         train_data = load_from_disk(cache_data)
#         print(f'  ==> Loaded {len(train_data)} samples')
# #         res =  convert_size(train_data.size_in_bytes)
# #         print('Dataset Size:', res)
#         return train_data, config
#     else:
        datasets = load_from_disk("/home/unnati/batchsize_experiments/pandu")
#         datasets = load_dataset("semeru/text-code-codesummarization", split="validation")
        #datasets = datasets.select(range(20))
        #res =  convert_size(datasets.size_in_bytes)
        #print('Dataset Size:', res)
        train_data = datasets.map(
            preprocess_function,
            batched=True,
            remove_columns=datasets.column_names,
            num_proc=64,
            load_from_cache_file=False,
        )
        print(f'  ==> Loaded {len(train_data)} samples')
        # train_data.save_to_disk(cache_data)
        # print(f'  ==> Saved to {cache_data}')

        return train_data, config

In [7]:
datasets = load_from_disk("/home/unnati/batchsize_experiments/pandu")
print(datasets )

Dataset({
    features: ['question', 'answer'],
    num_rows: 197
})


In [8]:
print(datasets['question'][0])  

question


In [9]:
print(datasets['answer'][0])

answers


In [10]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [11]:
train_data, config = load_tokenize_data()

#LORA
from peft import LoraConfig, get_peft_model, TaskType
# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["q_proj","v_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type=TaskType.SEQ_2_SEQ_LM
# )


#QLORA

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

qlora_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q_proj","v_proj"]
)


if data_num != -1:
    train_data = train_data.select([i for i in range(data_num)])

model = AutoModelForSeq2SeqLM.from_pretrained(load,config=config,trust_remote_code=True,
                                              revision="main", 
#                                               low_cpu_mem_usage=True, 
                                              quantization_config=bnb_config)
# freeze_decoder_except_xattn_codegen(model)
print('Model: ', convert_size(model.get_memory_footprint()))
model = get_peft_model(model, qlora_config)
print('PEFT Model: ',convert_size(model.get_memory_footprint()))
print(model.print_trainable_parameters())

print(f"  ==> Loaded model from {load}, model size {model.num_parameters()}")

run_training(model, train_data)

Map (num_proc=64):   0%|          | 0/197 [00:00<?, ? examples/s]

  ==> Loaded 197 samples


Saving the dataset (0/1 shards):   0%|          | 0/197 [00:00<?, ? examples/s]

  ==> Saved to batchsize_experiments/pandu


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Model:  8.41 GB
PEFT Model:  8.42 GB
trainable params: 3,743,744 || all params: 8,434,923,520 || trainable%: 0.04438385233871095
None
  ==> Loaded model from Salesforce/codet5p-16b, model size 8434923520
Starting main loop




Step,Training Loss
1,1.5504
10,1.7393
20,1.5057
30,1.4119
40,1.2057
50,1.0213
60,0.9024
70,0.7909
80,0.6548
90,0.7547


  ==> Finish training and save to saved_models/summarize_python/final_checkpoint


In [13]:
tokenizer.save_pretrained('saved_models/summarize_python/final_checkpoint')

('saved_models/summarize_python/final_checkpoint/tokenizer_config.json',
 'saved_models/summarize_python/final_checkpoint/special_tokens_map.json',
 'saved_models/summarize_python/final_checkpoint/vocab.json',
 'saved_models/summarize_python/final_checkpoint/merges.txt',
 'saved_models/summarize_python/final_checkpoint/added_tokens.json',
 'saved_models/summarize_python/final_checkpoint/tokenizer.json')

# Inference

In [22]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from transformers import BitsAndBytesConfig
import time, os

load = "Salesforce/instructcodet5p-16b"
device = "cuda"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

max_len = 512

import re
def truncate(completion):
    import re
    
    def find_re(string, pattern, start_pos):
        m = pattern.search(string, start_pos)
        return m.start() if m else -1

    terminals = [re.compile(r, re.MULTILINE) for r in [re.escape('<|end|>'),"^'''", '^"""', '\n\n\n']]

    prints = list(re.finditer('^print', completion, re.MULTILINE))
    if len(prints) > 1:
        completion = completion[:prints[1].start()]

    defs = list(re.finditer('^def', completion, re.MULTILINE))
    if len(defs) > 1:
        completion = completion[:defs[1].start()]

    start_pos = 0

    terminals_pos = [pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1]
    if len(terminals_pos) > 0:
        return completion[:min(terminals_pos)]
    else:
        return completion  

def preprocess_function(examples):
    source = [ex for ex in examples["question"]]
#     source = [ex for ex in examples["input"]]
    target = [ex for ex in examples["answer"]]

    model_inputs = tokenizer(source, max_length=max_source_len, padding="max_length", truncation=True)
    labels = tokenizer(target, max_length=max_target_len, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"].copy()
    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in model_inputs["labels"]
    ]
    return model_inputs



# Inference without fine tuning

In [17]:
tokenizer = AutoTokenizer.from_pretrained(load)
model = AutoModelForSeq2SeqLM.from_pretrained(load,
                                              torch_dtype=torch.float16,
                                              low_cpu_mem_usage=True,
                                              trust_remote_code=True,
                                              quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [23]:
def inf_without_ft(test_ip):
    start_time= time.time()
    fmt_test_ip = preprocess_function(test_ip)
    encoding = tokenizer(fmt_test_ip, return_tensors="pt").to(device)
    encoding['decoder_input_ids'] = encoding['input_ids'].clone()
    outputs = model.generate(**encoding, max_length=512)
    resp = tokenizer.decode(outputs[0], skip_special_tokens=True)
    stop_time=time.time()
    duration =stop_time - start_time
    return resp, duration

In [24]:
test_ip = """i want to drop a specific row from the dataframe. Provide me all the possible ways to to achive the task. Dataframe is stored in a variable df"""

In [25]:
resp, duration =inf_without_ft(test_ip)
print(str(duration)+' s')
print(resp[len(fmt_test_ip):])

TypeError: string indices must be integers