# Installation
(Start here if this is your first time running the notebook.)

In [None]:
# One-time environment setup: uncomment the lines below to install the notebook's dependencies.
#!pip install -q -U bitsandbytes
#!pip install -q -U git+https://github.com/huggingface/transformers.git
#!pip install -q -U git+https://github.com/huggingface/peft.git
#!pip install -q -U git+https://github.com/huggingface/accelerate.git
#!pip install -q -U datasets
#!pip install -q -U sentencepiece
#!pip install -q -U PyPDF2
#!pip install -q -U wandb

# Imports 
(Start here if it's not your first time and the dependencies are already installed.)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from datasets import Dataset
from peft import PeftModel
import datetime
from datetime import datetime
import wandb

# Configs

In [None]:
# Central configuration cell: every tunable used by the cells below is defined here.

# --- Run identity ---
project = "legislinho"
save_base_path = "trainings"
# Run suffix: the current POSIX timestamp with the decimal point removed.
ts = str(datetime.now().timestamp()).replace(".", "")

# --- Model / tokenizer selection ---
# Exactly one (model_id_tokenizer, model_id) pair is active. The alternatives are
# kept as comments so that no assignment is silently overwritten a line later.
#model_id_tokenizer = "teknium/OpenHermes-2.5-Mistral-7B"
#model_id = "./legislinhOpenHermesFinalPreTraining"
model_id_tokenizer = "lrds-code/samba-1.1B"
model_id = "./legislinhoSambaFinalPreTraining"
#max_length=0
#model_id = "lrds-code/samba-1.1B"
#max_length=2048

# --- BnB Config (values passed to BitsAndBytesConfig) ---
load_in_4bit = True
bnb_4bit_use_double_quant = True
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = torch.bfloat16

# --- LoraConfig ---
r = 64
lora_alpha = 128
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_dropout = 0.05
bias = "none"
task_type = "CAUSAL_LM"

# --- Dataset ---
# NOTE: `type` shadows the Python builtin for the rest of the notebook; the name
# is kept because later cells read it.
type = "json"
data_files = "./datasets_jsons_pretraining_2000/*"

# --- Encoder function (tokenizer call options) ---
truncation = True
padding = 'max_length'

# --- Trainer Args ---
per_device_train_batch_size = 8
gradient_accumulation_steps = 32
warmup_steps = 2
num_train_epochs = 9
learning_rate = 2e-4
# NOTE(review): fp16 is requested here while bnb_4bit_compute_dtype is bfloat16;
# confirm the mismatch is intentional.
fp16 = True
logging_steps = 1
optim = "paged_adamw_8bit"
save_strategy = "epoch"
# NOTE(review): logged to W&B, but the matching TrainingArguments line is
# currently commented out, so this value has no effect on training.
load_best_model_at_end = True

# --- Save after training (keyword arguments given to model.save_pretrained) ---
save_adapter = True
save_config = True

# Code execution

In [None]:
# Snapshot of the config-cell values that is attached to the Weights & Biases run.
run_config = {
    # run identity / model
    "save_base_path": save_base_path,
    "ts": ts,
    "model_id": model_id,
    # quantization
    "load_in_4bit": load_in_4bit,
    "bnb_4bit_use_double_quant": bnb_4bit_use_double_quant,
    "bnb_4bit_quant_type": bnb_4bit_quant_type,
    "bnb_4bit_compute_dtype": bnb_4bit_compute_dtype,
    # LoRA
    "lora_rank": r,
    "lora_alpha": lora_alpha,
    "target_modules": target_modules,
    "lora_dropout": lora_dropout,
    "bias": bias,
    "task_type": task_type,
    # dataset / tokenization
    "type": type,
    "data_files": data_files,
    "truncation": truncation,
    "padding": padding,
    # trainer
    "per_device_train_batch_size": per_device_train_batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "warmup_steps": warmup_steps,
    "num_train_epochs": num_train_epochs,
    "learning_rate": learning_rate,
    "fp16": fp16,
    "logging_steps": logging_steps,
    "optim": optim,
    "save_strategy": save_strategy,
    "load_best_model_at_end": load_best_model_at_end,
    # saving
    "save_adapter": save_adapter,
    "save_config": save_config,
}

# Start the W&B run for this project with the snapshot above.
wandb.init(project=project, config=run_config)

In [None]:
# Quantization settings for model loading, taken verbatim from the config cell.
quantization_options = {
    "load_in_4bit": load_in_4bit,
    "bnb_4bit_use_double_quant": bnb_4bit_use_double_quant,
    "bnb_4bit_quant_type": bnb_4bit_quant_type,
    "bnb_4bit_compute_dtype": bnb_4bit_compute_dtype,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [None]:
# Load the tokenizer from the same location as the model checkpoint.
# NOTE(review): `model_id_tokenizer` is defined in the config cell but never
# used; confirm whether the tokenizer was meant to be loaded from it instead.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The dataset is tokenized with padding='max_length', which needs a pad token.
# Assign it here, before any tokenization runs, so the padded dataset and the
# data collator agree on the padding token on a fresh Restart & Run All.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the causal LM from `model_id` using the bitsandbytes quantization config.
# The device map places the root module ("") on device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [None]:
# Turn on gradient checkpointing on the loaded model, then hand it to PEFT's
# k-bit preparation helper; `model` is rebound to the helper's return value.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters, the total number of parameters,
    and the trainable share as a percentage.

    Based on the vanilla Hugging Face implementation.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where ``param`` provides ``numel()``
            and ``requires_grad``.

    Returns:
        None. The summary is written to stdout.

    A model without any parameters reports ``0.0`` percent instead of raising
    ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model has nothing to divide by.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# LoRA adapter settings, all sourced from the config cell.
lora_settings = {
    "task_type": task_type,
    "r": r,
    "lora_alpha": lora_alpha,
    "lora_dropout": lora_dropout,
    "bias": bias,
    "target_modules": target_modules,  # modules that receive the adapters
}
config = LoraConfig(**lora_settings)

# Wrap the prepared model with the adapters and report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
def encode(examples):
    """Tokenize the "data" field of `examples` with the notebook-level tokenizer.

    Truncation and padding behaviour come from the `truncation` and `padding`
    values defined in the config cell. Returns whatever the tokenizer returns.
    """
    texts = examples["data"]
    tokenized = tokenizer(texts, truncation=truncation, padding=padding)
    return tokenized

In [None]:
# Load the training files; the builder name ("json") and the file glob come from the config cell.
data = load_dataset(
    type,
    data_files=data_files,
)

In [None]:
# Tokenize the dataset in batches; `encode` is passed directly as the mapping function.
data = data.map(encode, batched=True)

In [None]:
# Checkpoint directory for this run: <save_base_path>/<project>-<ts>
output_dir = f"{save_base_path}/{project}-{ts}"

In [None]:
# Select the non-reentrant activation-checkpointing implementation.
# Assigning `torch.utils.checkpoint.use_reentrant = False` only creates a module
# attribute that nothing reads: `use_reentrant` is a per-call argument of
# `torch.utils.checkpoint.checkpoint`. Passing it through
# `gradient_checkpointing_kwargs` is what actually reaches the checkpoint calls.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

In [None]:
# Training arguments; every value except the reporting target comes from the config cell.
trainer_args = transformers.TrainingArguments(
    # outputs, checkpoints and logging
    output_dir=output_dir,
    save_strategy=save_strategy,
    logging_steps=logging_steps,
    report_to="wandb",
    # batch geometry and schedule
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    warmup_steps=warmup_steps,
    # optimisation
    learning_rate=learning_rate,
    optim=optim,
    fp16=fp16,
    # disabled for now:
    # load_best_model_at_end=load_best_model_at_end,
)

In [None]:
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Collator built with mlm=False.
# TODO: find a way to generalize the collator choice.
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=trainer_args,
    train_dataset=data["train"],
    data_collator=lm_collator,
)

# Silence the warnings during training. Please re-enable for inference!
model.config.use_cache = False

# Run Training

In [None]:
# Run the training loop using the Trainer built in the previous cell.
trainer.train()

In [None]:
# Save the trained model next to the checkpoints, in "<output_dir>-final".
# NOTE(review): save_adapter / save_config are forwarded as extra keyword
# arguments; confirm that the installed PEFT version actually consumes them.
final_dir = f"{output_dir}-final"
model.save_pretrained(
    final_dir,
    save_adapter=save_adapter,
    save_config=save_config,
)