In [7]:
import os
import json

import transformers
from peft import PeftModel
from transformers import LlamaForCausalLM as LLaMAForCausalLM
from transformers import LlamaTokenizer as LLaMATokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model
from datasets import load_dataset
from EvaluateTestSet import EvaluateTestSet
from transformers.integrations import TensorBoardCallback
from transformers import GenerationConfig

In [4]:
# data_files = "/root/data/t2c_train.json"
EXPERIMENTS_PATH = "/root/experiments/"
experiment_config = {
    "experiment_name": "t2c_concode_220428_v14",
    "fn_train_dataset":  "/root/data/t2c_train.json",
    
    "default_model": "decapoda-research/llama-7b-hf",

    # Setting for A100 - For 3090 
    "MICRO_BATCH_SIZE": 2,#4 # 8  # change to 4 for 3090
    "BATCH_SIZE": 10,#32#128

    "EPOCHS": 10,#20  # paper uses 3
    "LEARNING_RATE":  2e-4,  # from the original paper
    "CUTOFF_LEN": 256,#384, # 256 accounts for about 96% of the data
    "LORA_R": 4,
    "LORA_ALPHA": 16,#*2
    "LORA_DROPOUT": 0.05,

    # Trainer config
    "warmup_steps": 200,
    "fp16": True,
    "logging_steps": 10,
    "save_total_limit": 1,
    "save_strategy": 'steps',
    "save_steps": 100,
    "seed": 42,
    "logging_strategy": 'steps',
    "report_to": 'tensorboard',
    "mlm": False,

    "truncation": True,
    "padding": "max_length",

    "config_use_cache": False,
#     "resume_from_checkpoint": True,
    "resume_from_checkpoint": False,
    
    "bleu_batch_size": 5
}

experiment_config["GRADIENT_ACCUMULATION_STEPS"] = experiment_config["BATCH_SIZE"] // experiment_config["MICRO_BATCH_SIZE"]

In [5]:
current_experiment_path = os.path.join(EXPERIMENTS_PATH, 
                                       experiment_config["experiment_name"]
                                      )

In [19]:
current_experiment_path

'/root/experiments/t2c_concode_220428_v14'

In [18]:
if experiment_config['resume_from_checkpoint'] or True:
    if not os.path.exists(current_experiment_path):
        raise ValueError("this experment does not exist")
    else:
        fn_config = current_experiment_path + "/experiment_config.json"
        if json.load(open(fn_config, "r")) != experiment_config:
            raise ValueError("At previous time there was different config")
else:
    if os.path.exists(current_experiment_path):
        input("this experiment already was done")
#         json.dump(experiment_config, open(current_experiment_path+"/experiment_config.json", "w+"))
# json.dump(current_experiment_path, open("experiment_config.json", "w+"))

In [10]:
json.dump(experiment_config, open(current_experiment_path+"/experiment_config.json", "w+"))

In [8]:
if os.path.exists(current_experiment_path) and 

True

In [6]:
def init_lora_model_and_tokenizer(default_model,
                             LORA_R,
                             LORA_ALPHA,
                             LORA_DROPOUT
                            ):


    """
        
    """
    model = LLaMAForCausalLM.from_pretrained(
    default_model,
    load_in_8bit=True,
    device_map="auto",
    )
    tokenizer = LLaMATokenizer.from_pretrained(
        default_model, add_eos_token=True
    )

    model = prepare_model_for_int8_training(model)

    config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, config)

    tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

    return model, tokenizer



class MyCustomCallback(TensorBoardCallback):
    log_bleu_steps_factor = 5
    bleu_generation_max_new_tokens = 30
    bleu_fn_test_data = "temp/t2c_answers.json"
    bleu_fn_etalon = "temp/answers.json"
    log_step = 0
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        super().on_log(args, state, control, logs=logs, **kwargs)
        print("kwargs", len(kwargs), kwargs.keys())
        if self.tb_writer is not None:
            print(state)
            print(state.global_step)
            print(self.log_step)
            if (self.log_step % self.log_bleu_steps_factor ==0):
                model = kwargs['model']
                tokenizer = kwargs['tokenizer']

                generation_config = GenerationConfig(max_new_tokens = self.bleu_generation_max_new_tokens,
                                                     min_new_tokens = 5
                                                    )
                evaluator = EvaluateTestSet(generation_config = generation_config,
                                        fn_test_data = self.bleu_fn_test_data,
                                        fn_etalon = self.bleu_fn_etalon
                                       )

                metric_res = evaluator.evaluate(model=model, 
                                                tokenizer=tokenizer,
                                               )
                print(metric_res)
                for key, val in metric_res.items():
                    self.tb_writer.add_scalar(key, val, state.global_step)
                self.tb_writer.flush()
            self.log_step += 1

In [7]:
model, tokenizer = init_lora_model_and_tokenizer(default_model = experiment_config["default_model"],
                                                 LORA_R = experiment_config["LORA_R"],
                                                 LORA_ALPHA = experiment_config["LORA_ALPHA"],
                                                 LORA_DROPOUT = experiment_config["LORA_DROPOUT"]
                                                )





Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [8]:
data = load_dataset("json", 
                    data_files = experiment_config["fn_train_dataset"]
                   )

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-6f0c4e89fb84a2e8/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# experiment_config["save_total_limit"]

In [10]:
def generate_prompt(data_point):
    # sorry about the formatting disaster gotta move fast
    if data_point["input"]:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{data_point["instruction"]}
### Response:
{data_point["output"]}"""


data = data.shuffle().map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        truncation=experiment_config["truncation"],
        max_length=experiment_config["CUTOFF_LEN"],
        padding=experiment_config["padding"]
    )
)

trainer = transformers.Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=experiment_config["MICRO_BATCH_SIZE"],
        gradient_accumulation_steps=experiment_config["GRADIENT_ACCUMULATION_STEPS"],
        warmup_steps=experiment_config["warmup_steps"],
        num_train_epochs=experiment_config["EPOCHS"],
        learning_rate=experiment_config["LEARNING_RATE"],
        fp16=experiment_config["fp16"],
        logging_steps=experiment_config["logging_steps"],
        output_dir=current_experiment_path,#"lora-alpaca",
        save_total_limit=experiment_config["save_total_limit"],
        save_strategy = experiment_config["save_strategy"],
        save_steps = experiment_config["save_steps"],
        seed=experiment_config["seed"],
        logging_dir=current_experiment_path,
        logging_strategy=experiment_config["logging_strategy"],
        report_to=experiment_config["report_to"]
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, 
                                                               mlm=experiment_config["mlm"]
                                                              ),
    callbacks = [MyCustomCallback]
)
model.config.use_cache = experiment_config["config_use_cache"]
# print(len(trainer.optimizer.state['found_inf_per_device']))


trainer.train(resume_from_checkpoint=experiment_config["resume_from_checkpoint"])

model.save_pretrained(current_experiment_path)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Step,Training Loss
1,2.4916
2,2.4456
3,2.4694
4,2.5534
5,2.7124
6,2.49
7,2.6936
8,2.5669
9,2.7878
10,2.5453


kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.01, global_step=1, max_steps=1000, num_train_epochs=10, total_flos=101553222451200.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}], best_metric=None, best_model_checkpoint=None, is_local_process_zero=True, is_world_process_zero=True, is_hyper_param_search=False, trial_name=None, trial_params=None)
1
0


  0%|                                                     | 0/3 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|█████████████████████████████████████████████| 3/3 [00:15<00:00,  5.08s/it]
100%|████████████████████████████████████████| 30/30 [00:00<00:00, 20126.22it/s]


{'EM': 0.0, 'BLEU': 1.5754303790798663e-06, 'brevity_penalty': 5.829466373086881e-05, 'ratio': 0.09302325581395349, 'translation_length': 76, 'reference_length': 817, 'precisions_0': 0.03896103896103896, 'precisions_1': 0.02127659574468085, 'precisions_2': 0.023809523809523808, 'precisions_3': 0.02702702702702703}
kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.02, global_step=2, max_steps=1000, num_train_epochs=10, total_flos=203106444902400.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}, {'loss': 2.4456, 'learning_rate': 0.0, 'epoch': 0.02, 'step': 2}], best_metric=None, best_model_checkpoint=None, is_local_process_zero=True, is_world_process_zero=True, is_hyper_param_search=False, trial_name=None, trial_params=None)
2
1
kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.03, global_step=3

100%|█████████████████████████████████████████████| 3/3 [00:15<00:00,  5.08s/it]
100%|████████████████████████████████████████| 30/30 [00:00<00:00, 71049.76it/s]


{'EM': 0.0, 'BLEU': 2.1392821230327552e-06, 'brevity_penalty': 7.679592536518248e-05, 'ratio': 0.09547123623011015, 'translation_length': 78, 'reference_length': 817, 'precisions_0': 0.05063291139240506, 'precisions_1': 0.02040816326530612, 'precisions_2': 0.022727272727272728, 'precisions_3': 0.02564102564102564}
kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.07, global_step=7, max_steps=1000, num_train_epochs=10, total_flos=710872557158400.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}, {'loss': 2.4456, 'learning_rate': 0.0, 'epoch': 0.02, 'step': 2}, {'loss': 2.4694, 'learning_rate': 0.0, 'epoch': 0.03, 'step': 3}, {'loss': 2.5534, 'learning_rate': 1e-06, 'epoch': 0.04, 'step': 4}, {'loss': 2.7124, 'learning_rate': 2e-06, 'epoch': 0.05, 'step': 5}, {'loss': 2.49, 'learning_rate': 3e-06, 'epoch': 0.06, 'step': 6}, {'loss': 2.6936, 'learning_rate': 4e-06, 'epoch': 0.0

100%|█████████████████████████████████████████████| 3/3 [00:15<00:00,  5.11s/it]
100%|████████████████████████████████████████| 30/30 [00:00<00:00, 67869.00it/s]


{'EM': 0.0, 'BLEU': 2.1392821230327552e-06, 'brevity_penalty': 7.679592536518248e-05, 'ratio': 0.09547123623011015, 'translation_length': 78, 'reference_length': 817, 'precisions_0': 0.05063291139240506, 'precisions_1': 0.02040816326530612, 'precisions_2': 0.022727272727272728, 'precisions_3': 0.02564102564102564}
kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.12, global_step=12, max_steps=1000, num_train_epochs=10, total_flos=1218638669414400.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}, {'loss': 2.4456, 'learning_rate': 0.0, 'epoch': 0.02, 'step': 2}, {'loss': 2.4694, 'learning_rate': 0.0, 'epoch': 0.03, 'step': 3}, {'loss': 2.5534, 'learning_rate': 1e-06, 'epoch': 0.04, 'step': 4}, {'loss': 2.7124, 'learning_rate': 2e-06, 'epoch': 0.05, 'step': 5}, {'loss': 2.49, 'learning_rate': 3e-06, 'epoch': 0.06, 'step': 6}, {'loss': 2.6936, 'learning_rate': 4e-06, 'epoch': 0

100%|█████████████████████████████████████████████| 3/3 [00:15<00:00,  5.09s/it]
100%|████████████████████████████████████████| 30/30 [00:00<00:00, 18289.12it/s]


{'EM': 0.0, 'BLEU': 8.272684649263504e-07, 'brevity_penalty': 2.734321190450097e-05, 'ratio': 0.08690330477356181, 'translation_length': 71, 'reference_length': 817, 'precisions_0': 0.041666666666666664, 'precisions_1': 0.023809523809523808, 'precisions_2': 0.02702702702702703, 'precisions_3': 0.03125}
kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.17, global_step=17, max_steps=1000, num_train_epochs=10, total_flos=1726404781670400.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}, {'loss': 2.4456, 'learning_rate': 0.0, 'epoch': 0.02, 'step': 2}, {'loss': 2.4694, 'learning_rate': 0.0, 'epoch': 0.03, 'step': 3}, {'loss': 2.5534, 'learning_rate': 1e-06, 'epoch': 0.04, 'step': 4}, {'loss': 2.7124, 'learning_rate': 2e-06, 'epoch': 0.05, 'step': 5}, {'loss': 2.49, 'learning_rate': 3e-06, 'epoch': 0.06, 'step': 6}, {'loss': 2.6936, 'learning_rate': 4e-06, 'epoch': 0.07, 'step':

100%|█████████████████████████████████████████████| 3/3 [00:15<00:00,  5.07s/it]
100%|████████████████████████████████████████| 30/30 [00:00<00:00, 69060.99it/s]


{'EM': 0.0, 'BLEU': 1.3947481499636897e-06, 'brevity_penalty': 5.051029741138152e-05, 'ratio': 0.09179926560587515, 'translation_length': 75, 'reference_length': 817, 'precisions_0': 0.039473684210526314, 'precisions_1': 0.021739130434782608, 'precisions_2': 0.024390243902439025, 'precisions_3': 0.027777777777777776}
kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.22, global_step=22, max_steps=1000, num_train_epochs=10, total_flos=2234170893926400.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}, {'loss': 2.4456, 'learning_rate': 0.0, 'epoch': 0.02, 'step': 2}, {'loss': 2.4694, 'learning_rate': 0.0, 'epoch': 0.03, 'step': 3}, {'loss': 2.5534, 'learning_rate': 1e-06, 'epoch': 0.04, 'step': 4}, {'loss': 2.7124, 'learning_rate': 2e-06, 'epoch': 0.05, 'step': 5}, {'loss': 2.49, 'learning_rate': 3e-06, 'epoch': 0.06, 'step': 6}, {'loss': 2.6936, 'learning_rate': 4e-06, 'epoch'

kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.26, global_step=26, max_steps=1000, num_train_epochs=10, total_flos=2640383783731200.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}, {'loss': 2.4456, 'learning_rate': 0.0, 'epoch': 0.02, 'step': 2}, {'loss': 2.4694, 'learning_rate': 0.0, 'epoch': 0.03, 'step': 3}, {'loss': 2.5534, 'learning_rate': 1e-06, 'epoch': 0.04, 'step': 4}, {'loss': 2.7124, 'learning_rate': 2e-06, 'epoch': 0.05, 'step': 5}, {'loss': 2.49, 'learning_rate': 3e-06, 'epoch': 0.06, 'step': 6}, {'loss': 2.6936, 'learning_rate': 4e-06, 'epoch': 0.07, 'step': 7}, {'loss': 2.5669, 'learning_rate': 5e-06, 'epoch': 0.08, 'step': 8}, {'loss': 2.7878, 'learning_rate': 6e-06, 'epoch': 0.09, 'step': 9}, {'loss': 2.5453, 'learning_rate': 7e-06, 'epoch': 0.1, 'step': 10}, {'loss': 2.472, 'learning_rate': 8e-06, 'epoch': 0.11, 'step': 11}, {'loss': 2.5162, 'learning_

100%|█████████████████████████████████████████████| 3/3 [00:15<00:00,  5.06s/it]
100%|████████████████████████████████████████| 30/30 [00:00<00:00, 40316.92it/s]


{'EM': 0.0, 'BLEU': 1.3947481499636897e-06, 'brevity_penalty': 5.051029741138152e-05, 'ratio': 0.09179926560587515, 'translation_length': 75, 'reference_length': 817, 'precisions_0': 0.039473684210526314, 'precisions_1': 0.021739130434782608, 'precisions_2': 0.024390243902439025, 'precisions_3': 0.027777777777777776}
kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.27, global_step=27, max_steps=1000, num_train_epochs=10, total_flos=2741937006182400.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}, {'loss': 2.4456, 'learning_rate': 0.0, 'epoch': 0.02, 'step': 2}, {'loss': 2.4694, 'learning_rate': 0.0, 'epoch': 0.03, 'step': 3}, {'loss': 2.5534, 'learning_rate': 1e-06, 'epoch': 0.04, 'step': 4}, {'loss': 2.7124, 'learning_rate': 2e-06, 'epoch': 0.05, 'step': 5}, {'loss': 2.49, 'learning_rate': 3e-06, 'epoch': 0.06, 'step': 6}, {'loss': 2.6936, 'learning_rate': 4e-06, 'epoch'

kwargs 6 dict_keys(['model', 'tokenizer', 'optimizer', 'lr_scheduler', 'train_dataloader', 'eval_dataloader'])
TrainerState(epoch=0.31, global_step=31, max_steps=1000, num_train_epochs=10, total_flos=3148149895987200.0, log_history=[{'loss': 2.4916, 'learning_rate': 0.0, 'epoch': 0.01, 'step': 1}, {'loss': 2.4456, 'learning_rate': 0.0, 'epoch': 0.02, 'step': 2}, {'loss': 2.4694, 'learning_rate': 0.0, 'epoch': 0.03, 'step': 3}, {'loss': 2.5534, 'learning_rate': 1e-06, 'epoch': 0.04, 'step': 4}, {'loss': 2.7124, 'learning_rate': 2e-06, 'epoch': 0.05, 'step': 5}, {'loss': 2.49, 'learning_rate': 3e-06, 'epoch': 0.06, 'step': 6}, {'loss': 2.6936, 'learning_rate': 4e-06, 'epoch': 0.07, 'step': 7}, {'loss': 2.5669, 'learning_rate': 5e-06, 'epoch': 0.08, 'step': 8}, {'loss': 2.7878, 'learning_rate': 6e-06, 'epoch': 0.09, 'step': 9}, {'loss': 2.5453, 'learning_rate': 7e-06, 'epoch': 0.1, 'step': 10}, {'loss': 2.472, 'learning_rate': 8e-06, 'epoch': 0.11, 'step': 11}, {'loss': 2.5162, 'learning_

100%|█████████████████████████████████████████████| 3/3 [00:15<00:00,  5.11s/it]
100%|████████████████████████████████████████| 30/30 [00:00<00:00, 67396.42it/s]


{'EM': 0.0, 'BLEU': 1.3947481499636897e-06, 'brevity_penalty': 5.051029741138152e-05, 'ratio': 0.09179926560587515, 'translation_length': 75, 'reference_length': 817, 'precisions_0': 0.039473684210526314, 'precisions_1': 0.021739130434782608, 'precisions_2': 0.024390243902439025, 'precisions_3': 0.027777777777777776}


KeyboardInterrupt: 

In [11]:
model.save_pretrained(current_experiment_path)

In [12]:
!ls {current_experiment_path}

1683312104.0230427   checkpoint-30
1683312104.035074    checkpoint-31
1683312337.4801078   events.out.tfevents.1683312104.8d048d63ed1a.6054.0
1683312337.4871686   events.out.tfevents.1683312104.8d048d63ed1a.6054.2
1683312517.1667607   events.out.tfevents.1683312337.8d048d63ed1a.7269.0
1683312517.1768987   events.out.tfevents.1683312337.8d048d63ed1a.7269.2
1683312553.3835888   events.out.tfevents.1683312517.8d048d63ed1a.7269.4
1683312553.3904636   events.out.tfevents.1683312517.8d048d63ed1a.7269.6
1683312918.7901428   events.out.tfevents.1683312553.8d048d63ed1a.7392.0
1683312918.7971816   events.out.tfevents.1683312553.8d048d63ed1a.7392.2
adapter_config.json  events.out.tfevents.1683312918.8d048d63ed1a.7560.0
adapter_model.bin    events.out.tfevents.1683312918.8d048d63ed1a.7560.2
