In [None]:
# Install pinned versions of the fine-tuning libraries into the kernel's
# environment (-q keeps pip's output quiet).
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [None]:

# ------------------------------------------------------------------------------
# Model / dataset identifiers
# ------------------------------------------------------------------------------
# Model to train, as named on the Hugging Face hub
model_name = "lonestar108/base"
# Instruction dataset to use
dataset_name = "lonestar108/enlightenedllm"
# Name for the fine-tuned model
new_model = "enlightenedllm"

# ------------------------------------------------------------------------------
# QLoRA parameters
# ------------------------------------------------------------------------------
# LoRA attention dimension
lora_r = 64
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for the LoRA layers
lora_dropout = 0.1

# ------------------------------------------------------------------------------
# bitsandbytes parameters
# ------------------------------------------------------------------------------
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type ("fp4" or "nf4")
bnb_4bit_quant_type = "nf4"
# Activate nested (double) quantization for 4-bit base models
use_nested_quant = False

# ------------------------------------------------------------------------------
# TrainingArguments parameters
# ------------------------------------------------------------------------------
# Directory where the model predictions and checkpoints will be stored
output_dir = "./results"
# Number of training epochs
num_train_epochs = 5
# fp16 / bf16 training switches (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 4
# Batch size per GPU for evaluation
per_device_eval_batch_size = 4
# Number of update steps over which gradients are accumulated
gradient_accumulation_steps = 1
# Enable gradient checkpointing
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Weight decay applied to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule
lr_scheduler_type = "cosine"
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Ratio of steps used for a linear warmup (from 0 to the learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# (saves memory and speeds up training considerably)
group_by_length = True
# Save a checkpoint every X update steps
save_steps = 0
# Log every X update steps
logging_steps = 25

# ------------------------------------------------------------------------------
# SFT parameters
# ------------------------------------------------------------------------------
# Maximum sequence length to use
max_seq_length = None
# Pack multiple short examples into the same input sequence to increase efficiency
packing = False
# Load the entire model on GPU 0
device_map = {"": 0}

In [None]:

# NOTE(review): this cell binds the same names to the same values as the
# configuration cell directly before it; consider keeping only one of the two.

# ===== Model / dataset identifiers =====
model_name   = "lonestar108/base"            # hub id of the model to train
dataset_name = "lonestar108/enlightenedllm"  # instruction dataset
new_model    = "enlightenedllm"              # fine-tuned model name

# ===== QLoRA =====
lora_r       = 64   # LoRA attention dimension
lora_alpha   = 16   # alpha for LoRA scaling
lora_dropout = 0.1  # dropout probability in LoRA layers

# ===== bitsandbytes =====
use_4bit               = True       # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"  # compute dtype for 4-bit base models
bnb_4bit_quant_type    = "nf4"      # quantization type: fp4 or nf4
use_nested_quant       = False      # nested (double) quantization for 4-bit base models

# ===== TrainingArguments =====
output_dir       = "./results"  # where model predictions and checkpoints are stored
num_train_epochs = 5            # number of training epochs
fp16             = False        # fp16 training switch
bf16             = True         # bf16 training switch (set to True with an A100)

per_device_train_batch_size = 4                    # training batch size per GPU
per_device_eval_batch_size  = 4                    # evaluation batch size per GPU
gradient_accumulation_steps = 1                    # update steps to accumulate gradients for
gradient_checkpointing      = True                 # enable gradient checkpointing
max_grad_norm               = 0.3                  # maximum gradient norm (gradient clipping)
learning_rate               = 2e-4                 # initial learning rate (AdamW optimizer)
weight_decay                = 0.001                # applied to all layers except bias/LayerNorm weights
optim                       = "paged_adamw_32bit"  # optimizer to use
lr_scheduler_type           = "cosine"             # learning rate schedule
max_steps                   = -1                   # number of training steps (overrides num_train_epochs)
warmup_ratio                = 0.03                 # ratio of steps for linear warmup (0 -> learning rate)

# Grouping by length saves memory and speeds up training considerably.
group_by_length = True  # batch together sequences with the same length
save_steps      = 0     # save a checkpoint every X update steps
logging_steps   = 25    # log every X update steps

# ===== SFT =====
max_seq_length = None     # maximum sequence length to use
packing        = False    # pack multiple short examples into one input sequence
device_map     = {"": 0}  # load the entire model on GPU 0

In [None]:

# Train several models that differ only in random seed and save path, then
# evaluate them as an ensemble on next-token prediction: for every test text,
# the models see all tokens but the last, their last-position logits are
# averaged, and the arg-max token is compared with the true final token.
from transformers import AutoTokenizer

# NOTE(review): this rebinds model_name, replacing the value assigned in the
# configuration cell; everything below uses this id.
model_name = 'meta-llama/Llama-2-7b'
model_save_paths = ['model_1.pt', 'model_2.pt', 'model_3.pt']
random_seeds = [42, 123, 456]
system_prompt = "You are a spiritual teacher."

# One training run per (save path, seed) pair; the lists must line up.
assert len(model_save_paths) == len(random_seeds), "need exactly one seed per save path"
for save_path, seed in zip(model_save_paths, random_seeds):
    load_and_train_model_with_dataset(model_name, dataset_name, system_prompt, save_path, num_train_epochs, seed)

# Reload every trained model for inference.
models = []
for path in model_save_paths:
    model = AutoModelForCausalLM.from_pretrained(path)
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    models.append(model)

# The models consume token ids, so the raw test texts have to be tokenized.
# NOTE(review): assumes the fine-tuned models keep the tokenizer published
# under model_name - TODO confirm against load_and_train_model_with_dataset.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# dataset_name is the same dataset the models were trained on above.
dataset_test = load_dataset(dataset_name, split='test')  # specify 'test' or 'validation' for test set

correct_predictions = 0
evaluated_examples = 0

# Inference only: no gradients are needed.
with torch.no_grad():
    for example in dataset_test:
        token_ids = tokenizer(example['text'], return_tensors='pt')['input_ids']
        # At least one context token plus one target token is required.
        if token_ids.shape[1] < 2:
            continue

        context_ids = token_ids[:, :-1]
        # The actual token is a token id, so it is comparable with the arg-max below.
        actual_token = token_ids[0, -1].item()

        # Last-position logits of each model, i.e. its scores for the next token.
        last_position_logits = []
        for model in models:
            logits = model(input_ids=context_ids.to(model.device)).logits
            last_position_logits.append(logits[0, -1].float().cpu())

        # Averaging predictions over however many models were loaded.
        ensemble_output = torch.stack(last_position_logits).mean(dim=0)
        predicted_token = torch.argmax(ensemble_output).item()

        if predicted_token == actual_token:
            correct_predictions += 1
        evaluated_examples += 1

# Print accuracy (guarding against an empty or unusable test split).
if evaluated_examples:
    print(correct_predictions / evaluated_examples)
else:
    print("No test example has at least two tokens; accuracy is undefined.")
