# LLaMA Factory Colab Tutorial

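Please use a **free** Tesla T4 Colab GPU to run this!

> Note: this copy of the tutorial installs from `/kaggle/working`, reads its Hugging Face token from Kaggle secrets, and launches two training processes, so it expects a dual-GPU Kaggle session rather than a single Colab GPU.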

Project homepage: https://github.com/hiyouga/LLaMA-Factory

In [1]:
%%capture
# Environment setup: clone LLaMA-Factory and install it, its bitsandbytes extra, and Unsloth.
# `%%capture` suppresses this cell's output, so a failed install will not be visible here.
# NOTE(review): the install paths below assume the clone lands in /kaggle/working — confirm the notebook's cwd.
!git clone https://github.com/hiyouga/LLaMA-Factory.git
!pip install /kaggle/working/LLaMA-Factory
# Second install of the same checkout, this time requesting the `bitsandbytes` extra.
# NOTE(review): the bracketed extra is unquoted, so the shell may treat it as a glob pattern — consider quoting.
!pip install /kaggle/working/LLaMA-Factory[bitsandbytes]
# Unsloth is installed from its git repository with the `cu121-torch211` extra;
# this lines up with the "2.1.1+cu121" PyTorch build printed further down in the notebook.
!pip install "unsloth[cu121-torch211] @ git+https://github.com/unslothai/unsloth.git"

# Temporary fix for https://github.com/huggingface/datasets/issues/6753
# (currently disabled; uncomment to pin datasets/fsspec/gcsfs)
# !pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0

In [2]:
import torch
# Warn (without raising) when PyTorch cannot see a CUDA device.
cuda_ready = torch.cuda.is_available()
if cuda_ready is not True:
  print("Please set up a GPU before using LLaMA Factory: https://medium.com/mlearning-ai/training-yolov4-on-google-colab-316f8fff99c6")

In [3]:
# Authorize huggingface
# Runs a one-off Python subprocess that reads the secret named 'hf-token' through
# kaggle_secrets.UserSecretsClient and hands it to huggingface_hub's HfFolder.save_token.
# The token is never written into the notebook itself.
!python -c "from kaggle_secrets import UserSecretsClient; from huggingface_hub import HfFolder; hftoken = UserSecretsClient().get_secret('hf-token'); HfFolder().save_token(hftoken)"

## Fine-tune model via Command Line

In [4]:
# Ask PyTorch to release its cached CUDA memory before launching training.
# Relies on `torch` having been imported by an earlier cell.
# gc.collect()
torch.cuda.empty_cache()

In [5]:
# !rm -r /kaggle/working/llama2
import torch
# Show the installed PyTorch build and the CUDA version reported by torch.version.cuda.
for label, value in (
    ("PyTorch Version:", torch.__version__),
    ("CUDA Version:", torch.version.cuda),
):
    print(label, value)

PyTorch Version: 2.1.1+cu121
CUDA Version: 12.1


In [6]:
def get_latest_checkpoint(output_dir):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Args:
        output_dir: Directory that holds the training run. If it does not
            exist it is created (so later steps can write into it) and
            ``None`` is returned.

    Returns:
        ``os.path.join(output_dir, "checkpoint-<step>")`` for the largest
        ``<step>`` found, or ``None`` when there is no such directory.

    Only sub-directories whose name is exactly ``checkpoint-<digits>`` are
    considered. Other entries that merely start with "checkpoint" (for example
    ``checkpoints``, ``checkpoint-tmp`` or a stray file) are ignored instead of
    raising ``IndexError`` / ``ValueError`` while parsing the step number.
    """
    # Imported locally so the function works regardless of which notebook cell
    # has (or has not yet) executed `import os`.
    import os
    import re

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        return None

    step_pattern = re.compile(r"checkpoint-(\d+)")
    candidates = []
    for entry in os.listdir(output_dir):
        match = step_pattern.fullmatch(entry)
        # Checkpoints are directories; skip files that happen to share the name.
        if match and os.path.isdir(os.path.join(output_dir, entry)):
            candidates.append((int(match.group(1)), entry))

    if not candidates:
        return None

    # Compare numerically so that checkpoint-1000 beats checkpoint-200.
    _, latest = max(candidates)
    return os.path.join(output_dir, latest)

In [7]:
import os
from huggingface_hub import snapshot_download
# Set environment variables
# Two GPU ids are exposed here; the accelerate command further down launches two processes.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
os.environ["WANDB_DISABLED"] = "true"
os.environ['TOKENIZERS_PARALLELISM'] = "false"
# model and data
# Exactly one `model_name` should be active; the commented lines are alternative base models.
# model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
model_name = "unsloth/llama-2-7b-bnb-4bit"
# model_name = "unsloth/gemma-7b-bnb-4bit"
dataset_dir = "/kaggle/input/json-arxiv"
# Train set
dataset_name = "train_data_25K_json"
# Test set
# dataset_name = "test_data_1K_json"
# Passed to the training command as --template.
template = "alpaca"

# File
# `model` is a short tag used to derive every output path and the Hub repo id below.
model = "llama2"
output_dir = f"/kaggle/working/{model}"
logging_dir = f"/kaggle/working/{model}/logs"
tokenized_path = f"/kaggle/working/{model}/tokenized_dataset"
hub_model_id = f"BrijeshGiri/llm-ds-{model}"
# Download HF repo
# Pull whatever a previous session pushed to the Hub into output_dir so training can resume from it.
try:
    snapshot_download(repo_id=hub_model_id, local_dir=output_dir, local_dir_use_symlinks=False)
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt / SystemExit
    # still stop the cell. The cause is printed because a failure here may be an
    # authentication or network problem rather than a missing repository.
    print("HF repo does not exist. Starting Fresh training.")
    print(f"snapshot_download failed with {type(exc).__name__}: {exc}")

# None when output_dir holds no checkpoint; otherwise the path of the newest one.
resume_from_checkpoint = get_latest_checkpoint(output_dir)

# hyperparams
learning_rate = 1e-5
num_epochs = 5
# Warm up for one epoch's share of the run: 1/5 = 0.2 with the current num_epochs.
warmup_ratio = 1/num_epochs
# NOTE(review): passed as --lr_scheduler_type together with a warmup ratio — confirm this scheduler honours warmup.
lr_sched = "reduce_lr_on_plateau"
optim = "adamw_bnb_8bit"

# bs
# With the two launched processes: 2 per device x 2 accumulation steps x 2 processes
# = 8 sequences per optimizer step.
per_device_train_batch_size = 2 # 1 for gemma
per_device_eval_batch_size = 2 # 1 for gemma
gradient_accumulation_steps = 2
cutoff = 2048 # input token cutoff length
dataloader_num_workers = 4
preprocessing_num_workers = 4
# rope_scaling = "dynamic" # {linear,dynamic,None} - for making normal model work with longer context in a simpler way

# steps
# eval_steps and save_steps are kept equal; the command also passes --load_best_model_at_end.
eval_steps = 100
save_steps = 100
logging_steps = 50
save_total_limit = 20

#lora
# Comma-separated module names handed to --lora_target.
lora_target = "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj"
lora_rank = 8
lora_alpha = 8

# Build the `accelerate launch` command line for LLaMA-Factory's train_bash.py.
# Inside this (non-raw) triple-quoted f-string every trailing backslash joins the next
# line, so `command` ends up as a single shell line (with one leading newline).
# Construct the command using f-string
command = f"""
accelerate launch \
    --num_processes=2 \
    /kaggle/working/LLaMA-Factory/src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path {model_name} \
    --dataset_dir {dataset_dir} \
    --dataset {dataset_name} \
    --template {template} \
    --finetuning_type lora \
    --lora_target {lora_target} \
    --lora_rank {lora_rank} \
    --lora_alpha {lora_alpha} \
    --output_dir {output_dir} \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len {cutoff} \
    --preprocessing_num_workers {preprocessing_num_workers} \
    --dataloader_num_workers {dataloader_num_workers} \
    --per_device_train_batch_size {per_device_train_batch_size} \
    --per_device_eval_batch_size {per_device_eval_batch_size} \
    --gradient_accumulation_steps {gradient_accumulation_steps} \
    --lr_scheduler_type {lr_sched} \
    --warmup_ratio {warmup_ratio} \
    --logging_steps {logging_steps} \
    --logging_dir {logging_dir} \
    --save_steps {save_steps} \
    --eval_steps {eval_steps} \
    --save_total_limit {save_total_limit} \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate {learning_rate} \
    --optim {optim} \
    --num_train_epochs {num_epochs} \
    --val_size 0.005 \
    --ddp_timeout 180000000 \
    --plot_loss \
    --fp16 \
    --use_unsloth \
    --quantization_bit 4 \
    --packing True \
    --report_to tensorboard \
    --tokenized_path {tokenized_path} \
    --push_to_hub \
    --hub_model_id {hub_model_id} \
    --hub_strategy all_checkpoints \
"""
# For a quick debugging run, append: --max_samples 32
# When a checkpoint was found in output_dir, add the resume flag to the same command line.
if resume_from_checkpoint is not None:
    command += f"""--resume_from_checkpoint {resume_from_checkpoint} \
    """
# The identical command is executed twice.
# NOTE(review): presumably the first invocation only tokenizes the dataset into
# tokenized_path and exits, and the second one performs the actual training — confirm
# against the installed LLaMA-Factory version.
# Execute tokenization command
!{command} 
!{command} # Backup incase of tokenization

Fetching 340 files:   0%|          | 0/340 [00:00<?, ?it/s]

checkpoint-100/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-100/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-100/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-100/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-100/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-100/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-100/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-100/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-100/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-100/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-100/trainer_state.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

checkpoint-1000/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-1000/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-100/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

(…)heckpoint-1000/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1000/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1000/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1000/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1000/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1000/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1000/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1000/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1000/trainer_state.json:   0%|          | 0.00/5.40k [00:00<?, ?B/s]

checkpoint-1100/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1000/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1100/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-1100/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1100/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1100/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

(…)heckpoint-1100/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1100/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1100/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1100/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1100/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1100/trainer_state.json:   0%|          | 0.00/5.89k [00:00<?, ?B/s]

checkpoint-1100/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1200/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1200/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-1200/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1200/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1200/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1200/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1200/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1200/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1200/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1200/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1200/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1200/trainer_state.json:   0%|          | 0.00/6.38k [00:00<?, ?B/s]

checkpoint-1300/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-1300/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1300/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

(…)heckpoint-1300/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1300/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1300/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1300/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1300/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1300/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1300/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1300/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1400/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1300/trainer_state.json:   0%|          | 0.00/6.87k [00:00<?, ?B/s]

checkpoint-1400/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-1400/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1400/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1400/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1400/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1400/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1400/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1400/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1400/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1400/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1500/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1500/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-1500/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1500/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1500/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1400/trainer_state.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

checkpoint-1500/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1500/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1500/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1500/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1500/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1500/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1600/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-1600/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1600/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1600/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1500/trainer_state.json:   0%|          | 0.00/7.85k [00:00<?, ?B/s]

checkpoint-1600/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1600/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1600/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1600/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1600/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1600/trainer_state.json:   0%|          | 0.00/8.35k [00:00<?, ?B/s]

checkpoint-1600/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1600/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1700/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1700/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-1700/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1700/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1700/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1700/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1700/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1700/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1700/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1700/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1700/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1800/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1800/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-1700/trainer_state.json:   0%|          | 0.00/8.84k [00:00<?, ?B/s]

(…)heckpoint-1800/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1800/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1800/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1800/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1800/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1800/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1800/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1800/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1800/trainer_state.json:   0%|          | 0.00/9.33k [00:00<?, ?B/s]

checkpoint-1800/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-1900/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-1900/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-1900/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-1900/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-1900/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1900/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-1900/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-1900/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-1900/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-1900/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-1900/trainer_state.json:   0%|          | 0.00/9.82k [00:00<?, ?B/s]

checkpoint-1900/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-200/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-200/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-200/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-200/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-200/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-200/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-200/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-200/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-200/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-200/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-200/trainer_state.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

checkpoint-200/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-2000/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-2000/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-2000/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2000/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

(…)heckpoint-2000/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-2000/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2000/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-2000/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-2000/trainer_state.json:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

checkpoint-2000/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-2000/training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

checkpoint-2000/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-2100/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-2100/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-2100/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-2100/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2100/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-2100/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2100/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-2100/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-2100/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-2100/trainer_state.json:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

(…)heckpoint-2100/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-2100/training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

checkpoint-2200/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-2200/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-2200/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

(…)heckpoint-2200/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-2200/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2200/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2200/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-2200/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-2200/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-2200/training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

checkpoint-2200/trainer_state.json:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

checkpoint-2200/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-2300/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-2300/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-2300/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-2300/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2300/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-2300/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2300/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-2300/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-2300/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-2300/training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

checkpoint-2300/trainer_state.json:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

checkpoint-2300/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-2400/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

(…)heckpoint-2400/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-2400/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-2400/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-2400/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2400/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2400/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-2400/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-2400/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-2400/training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

checkpoint-2400/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-2400/trainer_state.json:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

checkpoint-2500/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-2500/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-2500/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-2500/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

(…)heckpoint-2500/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-2500/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-2500/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-2500/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-2500/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-2500/training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

checkpoint-2500/trainer_state.json:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

checkpoint-2600/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-2600/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-2600/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-2500/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2600/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2600/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-2600/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2600/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-2600/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-2600/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-2600/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-2600/training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

checkpoint-2600/trainer_state.json:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

checkpoint-2700/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-2700/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

(…)heckpoint-2700/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-2700/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2700/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-2700/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-2700/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-2700/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-2700/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-2700/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-2700/training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

checkpoint-2700/trainer_state.json:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

checkpoint-300/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-300/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-300/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-300/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-300/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-300/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-300/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-300/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-300/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-300/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-300/trainer_state.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

checkpoint-300/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-400/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-400/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-400/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-400/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-400/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-400/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-400/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-400/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-400/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-400/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-400/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-400/trainer_state.json:   0%|          | 0.00/2.46k [00:00<?, ?B/s]

checkpoint-500/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-500/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-500/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-500/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-500/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-500/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-500/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-500/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-500/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-500/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-500/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-500/trainer_state.json:   0%|          | 0.00/2.95k [00:00<?, ?B/s]

checkpoint-600/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-600/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-600/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-600/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-600/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-600/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-600/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-600/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-600/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-600/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-600/trainer_state.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

checkpoint-600/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-700/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-700/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-700/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-700/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-700/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-700/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-700/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-700/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-700/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-700/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-700/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-700/trainer_state.json:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

checkpoint-800/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-800/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-800/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-800/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-800/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-800/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-800/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-800/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-800/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

checkpoint-800/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-800/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

checkpoint-800/trainer_state.json:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

checkpoint-900/README.md:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

checkpoint-900/adapter_config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

checkpoint-900/adapter_model.safetensors:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

checkpoint-900/rng_state_0.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-900/optimizer.pt:   0%|          | 0.00/40.6M [00:00<?, ?B/s]

checkpoint-900/scheduler.pt:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

checkpoint-900/rng_state_1.pth:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

checkpoint-900/special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-900/tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

checkpoint-900/trainer_state.json:   0%|          | 0.00/4.90k [00:00<?, ?B/s]

checkpoint-900/tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

(…)t.tfevents.1713252166.6a26da8935f1.358.0:   0%|          | 0.00/18.4k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

checkpoint-900/training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

(…)zed_dataset/cache-3f327ca032623ade.arrow:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

(…)t.tfevents.1713308120.a9d7c77b26d3.197.0:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

(…)enized_dataset/data-00000-of-00002.arrow:   0%|          | 0.00/291M [00:00<?, ?B/s]

(…)enized_dataset/data-00001-of-00002.arrow:   0%|          | 0.00/291M [00:00<?, ?B/s]

tokenized_dataset/dataset_info.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

(…)zed_dataset/cache-84f176bdc8b1d235.arrow:   0%|          | 0.00/177k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenized_dataset/state.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

trainer_log.jsonl:   0%|          | 0.00/6.18k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]0it [00:00, ?it/s]


  warn(


2024-04-19 16:06:10.363018: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-19 16:06:10.363026: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-19 16:06:10.363095: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-19 16:06:10.363167: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered


2024-04-19 16:06:10.533744: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-19 16:06:10.533730: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


04/19/2024 16:06:26 - INFO - llmtuner.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.float16


04/19/2024 16:06:26 - INFO - llmtuner.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.float16


tokenizer_config.json:   0%|                          | 0.00/894 [00:00<?, ?B/s]tokenizer_config.json: 100%|███████████████████| 894/894 [00:00<00:00, 5.66MB/s]


tokenizer.model:   0%|                               | 0.00/500k [00:00<?, ?B/s]

tokenizer.model: 100%|███████████████████████| 500k/500k [00:00<00:00, 9.67MB/s]


special_tokens_map.json:   0%|                        | 0.00/438 [00:00<?, ?B/s]special_tokens_map.json: 100%|█████████████████| 438/438 [00:00<00:00, 3.62MB/s]


tokenizer.json:   0%|                               | 0.00/1.84M [00:00<?, ?B/s]

tokenizer.json: 100%|██████████████████████| 1.84M/1.84M [00:00<00:00, 15.2MB/s]tokenizer.json: 100%|██████████████████████| 1.84M/1.84M [00:00<00:00, 15.0MB/s]
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:27,290 >> loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/tokenizer.model
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:27,290 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:27,290 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/special_tokens_map.json
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:27,290 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e

04/19/2024 16:06:27 - INFO - llmtuner.data.loader - Loaded tokenized dataset from /kaggle/working/llama2/tokenized_dataset.
04/19/2024 16:06:27 - INFO - llmtuner.data.loader - Loaded tokenized dataset from /kaggle/working/llama2/tokenized_dataset.


config.json:   0%|                                  | 0.00/1.10k [00:00<?, ?B/s]config.json: 100%|█████████████████████████| 1.10k/1.10k [00:00<00:00, 6.37MB/s]


04/19/2024 16:06:27 - INFO - llmtuner.model.patcher - Loading ?-bit BITSANDBYTES-quantized model.
Multiple CUDA devices detected but we require a single device.
We will override CUDA_VISIBLE_DEVICES to first device: 0.
[INFO|configuration_utils.py:726] 2024-04-19 16:06:27,621 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 16:06:27,623 >> Model config LlamaConfig {
  "_name_or_path": "unsloth/llama-2-7b-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_



/sbin/ldconfig.real: Renaming of /etc/ld.so.cache~ to /etc/ld.so.cache failed: No such file or directory


==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.23. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
[INFO|configuration_utils.py:726] 2024-04-19 16:06:28,900 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 16:06:28,902 >> Model config LlamaConfig {
  "_name_or_path": "unsloth/llama-2-7b-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_e

[INFO|configuration_utils.py:726] 2024-04-19 16:06:28,956 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 16:06:28,958 >> Model config LlamaConfig {
  "_name_or_path": "unsloth/llama-2-7b-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int

[INFO|configuration_utils.py:726] 2024-04-19 16:06:29,073 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 16:06:29,074 >> Model config LlamaConfig {
  "_name_or_path": "unsloth/llama-2-7b-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int

model.safetensors:   0%|                            | 0.00/3.87G [00:00<?, ?B/s]

model.safetensors:   0%|                   | 10.5M/3.87G [00:00<01:10, 54.9MB/s]

model.safetensors:   1%|▏                  | 31.5M/3.87G [00:00<00:39, 98.1MB/s]

model.safetensors:   2%|▎                   | 62.9M/3.87G [00:00<00:25, 147MB/s]

model.safetensors:   2%|▍                   | 83.9M/3.87G [00:00<00:23, 164MB/s]

model.safetensors:   3%|▋                    | 115M/3.87G [00:00<00:20, 184MB/s]

model.safetensors:   4%|▊                    | 147M/3.87G [00:00<00:19, 193MB/s]

model.safetensors:   5%|▉                    | 178M/3.87G [00:01<00:18, 198MB/s]

model.safetensors:   5%|█▏                   | 210M/3.87G [00:01<00:17, 204MB/s]

model.safetensors:   6%|█▎                   | 231M/3.87G [00:01<00:17, 203MB/s]

model.safetensors:   7%|█▎                   | 252M/3.87G [00:01<00:17, 202MB/s]

model.safetensors:   7%|█▌                   | 283M/3.87G [00:01<00:17, 204MB/s]

model.safetensors:   8%|█▋                   | 304M/3.87G [00:01<00:17, 202MB/s]

model.safetensors:   8%|█▊                   | 325M/3.87G [00:01<00:17, 202MB/s]

model.safetensors:   9%|█▉                   | 357M/3.87G [00:01<00:17, 205MB/s]

model.safetensors:  10%|██                   | 388M/3.87G [00:02<00:16, 210MB/s]

model.safetensors:  11%|██▎                  | 419M/3.87G [00:02<00:16, 210MB/s]

model.safetensors:  12%|██▍                  | 451M/3.87G [00:02<00:16, 211MB/s]

model.safetensors:  12%|██▌                  | 482M/3.87G [00:02<00:16, 206MB/s]

model.safetensors:  13%|██▋                  | 503M/3.87G [00:02<00:16, 205MB/s]

model.safetensors:  14%|██▉                  | 535M/3.87G [00:02<00:15, 208MB/s]

model.safetensors:  15%|███                  | 566M/3.87G [00:02<00:15, 211MB/s]

model.safetensors:  15%|███▏                 | 598M/3.87G [00:03<00:15, 211MB/s]

model.safetensors:  16%|███▍                 | 629M/3.87G [00:03<00:15, 210MB/s]

model.safetensors:  17%|███▌                 | 661M/3.87G [00:03<00:15, 211MB/s]

model.safetensors:  18%|███▊                 | 692M/3.87G [00:03<00:14, 212MB/s]

model.safetensors:  19%|███▉                 | 724M/3.87G [00:03<00:15, 209MB/s]

model.safetensors:  19%|████                 | 744M/3.87G [00:03<00:15, 207MB/s]

model.safetensors:  20%|████▏                | 765M/3.87G [00:03<00:14, 208MB/s]

model.safetensors:  20%|████▎                | 786M/3.87G [00:03<00:14, 207MB/s]

model.safetensors:  21%|████▍                | 818M/3.87G [00:04<00:14, 209MB/s]

model.safetensors:  22%|████▌                | 849M/3.87G [00:04<00:14, 211MB/s]

model.safetensors:  23%|████▊                | 881M/3.87G [00:04<00:14, 209MB/s]

model.safetensors:  23%|████▉                | 902M/3.87G [00:04<00:14, 209MB/s]

model.safetensors:  24%|█████                | 933M/3.87G [00:04<00:13, 211MB/s]

model.safetensors:  25%|█████▏               | 965M/3.87G [00:04<00:13, 213MB/s]

model.safetensors:  26%|█████▍               | 996M/3.87G [00:04<00:13, 212MB/s]

model.safetensors:  27%|█████▎              | 1.03G/3.87G [00:05<00:13, 213MB/s]

model.safetensors:  27%|█████▍              | 1.06G/3.87G [00:05<00:13, 214MB/s]

model.safetensors:  28%|█████▋              | 1.09G/3.87G [00:05<00:13, 213MB/s]

model.safetensors:  29%|█████▊              | 1.12G/3.87G [00:05<00:12, 213MB/s]

model.safetensors:  30%|█████▉              | 1.15G/3.87G [00:05<00:12, 214MB/s]

model.safetensors:  31%|██████▏             | 1.18G/3.87G [00:05<00:12, 212MB/s]

model.safetensors:  31%|██████▎             | 1.22G/3.87G [00:05<00:12, 210MB/s]

model.safetensors:  32%|██████▍             | 1.25G/3.87G [00:06<00:12, 212MB/s]

model.safetensors:  33%|██████▌             | 1.28G/3.87G [00:06<00:12, 211MB/s]

model.safetensors:  34%|██████▊             | 1.31G/3.87G [00:06<00:12, 209MB/s]

model.safetensors:  35%|██████▉             | 1.34G/3.87G [00:06<00:11, 211MB/s]

model.safetensors:  36%|███████             | 1.37G/3.87G [00:06<00:11, 210MB/s]

model.safetensors:  36%|███████▎            | 1.41G/3.87G [00:06<00:11, 212MB/s]

model.safetensors:  37%|███████▍            | 1.44G/3.87G [00:07<00:11, 211MB/s]

model.safetensors:  38%|███████▌            | 1.47G/3.87G [00:07<00:11, 210MB/s]

model.safetensors:  39%|███████▊            | 1.50G/3.87G [00:07<00:11, 212MB/s]

model.safetensors:  40%|███████▉            | 1.53G/3.87G [00:07<00:10, 214MB/s]

model.safetensors:  40%|████████            | 1.56G/3.87G [00:07<00:10, 214MB/s]

model.safetensors:  41%|████████▏           | 1.59G/3.87G [00:07<00:10, 212MB/s]

model.safetensors:  42%|████████▍           | 1.63G/3.87G [00:07<00:10, 211MB/s]

model.safetensors:  43%|████████▌           | 1.66G/3.87G [00:08<00:10, 207MB/s]

model.safetensors:  43%|████████▋           | 1.68G/3.87G [00:08<00:10, 207MB/s]

model.safetensors:  44%|████████▊           | 1.71G/3.87G [00:08<00:10, 206MB/s]

model.safetensors:  45%|████████▉           | 1.73G/3.87G [00:08<00:10, 207MB/s]

model.safetensors:  45%|█████████           | 1.75G/3.87G [00:08<00:10, 206MB/s]

model.safetensors:  46%|█████████▏          | 1.77G/3.87G [00:08<00:10, 204MB/s]

model.safetensors:  46%|█████████▎          | 1.79G/3.87G [00:08<00:10, 205MB/s]

model.safetensors:  47%|█████████▍          | 1.81G/3.87G [00:08<00:09, 206MB/s]

model.safetensors:  47%|█████████▍          | 1.84G/3.87G [00:08<00:09, 205MB/s]

model.safetensors:  48%|█████████▋          | 1.87G/3.87G [00:09<00:09, 207MB/s]

model.safetensors:  49%|█████████▊          | 1.90G/3.87G [00:09<00:09, 207MB/s]

model.safetensors:  50%|█████████▉          | 1.93G/3.87G [00:09<00:09, 210MB/s]

model.safetensors:  50%|██████████          | 1.95G/3.87G [00:09<00:09, 209MB/s]

model.safetensors:  51%|██████████▏         | 1.97G/3.87G [00:09<00:09, 205MB/s]

model.safetensors:  52%|██████████▎         | 2.00G/3.87G [00:09<00:09, 206MB/s]

model.safetensors:  52%|██████████▍         | 2.02G/3.87G [00:09<00:08, 206MB/s]

model.safetensors:  53%|██████████▋         | 2.06G/3.87G [00:10<00:08, 208MB/s]

model.safetensors:  54%|██████████▊         | 2.09G/3.87G [00:10<00:09, 186MB/s]

model.safetensors:  55%|██████████▉         | 2.11G/3.87G [00:10<00:12, 137MB/s]

model.safetensors:  55%|███████████         | 2.13G/3.87G [00:10<00:12, 138MB/s]

model.safetensors:  56%|███████████         | 2.15G/3.87G [00:10<00:11, 151MB/s]

model.safetensors:  56%|███████████▎        | 2.18G/3.87G [00:10<00:11, 150MB/s]

model.safetensors:  57%|███████████▍        | 2.20G/3.87G [00:11<00:12, 132MB/s]

model.safetensors:  58%|██████████▉        | 2.22G/3.87G [00:12<00:36, 45.6MB/s]

model.safetensors:  58%|███████████        | 2.24G/3.87G [00:12<00:29, 54.6MB/s]

model.safetensors:  59%|███████████▏       | 2.26G/3.87G [00:12<00:26, 60.5MB/s]

model.safetensors:  59%|███████████▏       | 2.28G/3.87G [00:13<00:24, 63.9MB/s]

model.safetensors:  59%|███████████▏       | 2.29G/3.87G [00:13<00:24, 64.2MB/s]

model.safetensors:  59%|███████████▎       | 2.30G/3.87G [00:13<00:23, 66.8MB/s]

model.safetensors:  60%|███████████▎       | 2.31G/3.87G [00:13<00:27, 56.1MB/s]

model.safetensors:  60%|███████████▍       | 2.33G/3.87G [00:13<00:21, 71.7MB/s]

model.safetensors:  60%|███████████▍       | 2.34G/3.87G [00:13<00:21, 70.0MB/s]

model.safetensors:  61%|███████████▌       | 2.36G/3.87G [00:14<00:19, 78.3MB/s]

model.safetensors:  62%|███████████▋       | 2.38G/3.87G [00:14<00:17, 86.1MB/s]

model.safetensors:  62%|███████████▋       | 2.39G/3.87G [00:14<00:17, 82.5MB/s]

model.safetensors:  62%|███████████▊       | 2.40G/3.87G [00:14<00:18, 78.6MB/s]

model.safetensors:  62%|███████████▊       | 2.41G/3.87G [00:14<00:18, 80.6MB/s]

model.safetensors:  63%|███████████▉       | 2.42G/3.87G [00:14<00:19, 76.0MB/s]

model.safetensors:  63%|███████████▉       | 2.43G/3.87G [00:15<00:18, 78.4MB/s]

model.safetensors:  63%|████████████       | 2.44G/3.87G [00:15<00:19, 73.1MB/s]

model.safetensors:  64%|████████████       | 2.46G/3.87G [00:15<00:15, 91.0MB/s]

model.safetensors:  64%|████████████▏      | 2.47G/3.87G [00:15<00:16, 84.7MB/s]

model.safetensors:  64%|████████████▏      | 2.49G/3.87G [00:15<00:15, 88.9MB/s]

model.safetensors:  65%|████████████▎      | 2.50G/3.87G [00:15<00:16, 84.9MB/s]

model.safetensors:  65%|█████████████       | 2.52G/3.87G [00:15<00:11, 113MB/s]

model.safetensors:  66%|█████████████▏      | 2.55G/3.87G [00:16<00:09, 144MB/s]

model.safetensors:  66%|█████████████▎      | 2.57G/3.87G [00:16<00:10, 118MB/s]

model.safetensors:  67%|█████████████▍      | 2.59G/3.87G [00:16<00:11, 113MB/s]

model.safetensors:  68%|█████████████▌      | 2.62G/3.87G [00:16<00:08, 138MB/s]

model.safetensors:  68%|█████████████▋      | 2.64G/3.87G [00:16<00:08, 152MB/s]

model.safetensors:  69%|█████████████▊      | 2.66G/3.87G [00:16<00:07, 165MB/s]

model.safetensors:  69%|█████████████▉      | 2.68G/3.87G [00:16<00:06, 174MB/s]

model.safetensors:  70%|█████████████▉      | 2.71G/3.87G [00:17<00:06, 182MB/s]

model.safetensors:  71%|██████████████      | 2.73G/3.87G [00:17<00:06, 189MB/s]

model.safetensors:  71%|██████████████▏     | 2.75G/3.87G [00:17<00:05, 192MB/s]

model.safetensors:  72%|██████████████▎     | 2.77G/3.87G [00:17<00:05, 194MB/s]

model.safetensors:  72%|██████████████▍     | 2.79G/3.87G [00:17<00:05, 197MB/s]

model.safetensors:  73%|██████████████▌     | 2.81G/3.87G [00:17<00:05, 200MB/s]

model.safetensors:  73%|██████████████▋     | 2.83G/3.87G [00:17<00:05, 202MB/s]

model.safetensors:  74%|██████████████▊     | 2.85G/3.87G [00:17<00:04, 203MB/s]

model.safetensors:  74%|██████████████▊     | 2.87G/3.87G [00:17<00:04, 205MB/s]

model.safetensors:  75%|██████████████▉     | 2.89G/3.87G [00:17<00:04, 206MB/s]

model.safetensors:  75%|███████████████     | 2.92G/3.87G [00:18<00:04, 203MB/s]

model.safetensors:  76%|███████████████▏    | 2.94G/3.87G [00:18<00:04, 204MB/s]

model.safetensors:  76%|███████████████▎    | 2.96G/3.87G [00:18<00:04, 204MB/s]

model.safetensors:  77%|███████████████▍    | 2.98G/3.87G [00:18<00:04, 205MB/s]

model.safetensors:  78%|███████████████▌    | 3.01G/3.87G [00:18<00:04, 208MB/s]

model.safetensors:  78%|███████████████▋    | 3.03G/3.87G [00:18<00:04, 207MB/s]

model.safetensors:  79%|███████████████▊    | 3.05G/3.87G [00:18<00:03, 206MB/s]

model.safetensors:  80%|███████████████▉    | 3.08G/3.87G [00:18<00:03, 208MB/s]

model.safetensors:  81%|████████████████    | 3.11G/3.87G [00:19<00:03, 212MB/s]

model.safetensors:  81%|████████████████▎   | 3.15G/3.87G [00:19<00:03, 214MB/s]

model.safetensors:  82%|████████████████▍   | 3.18G/3.87G [00:19<00:03, 213MB/s]

model.safetensors:  83%|████████████████▌   | 3.21G/3.87G [00:19<00:03, 213MB/s]

model.safetensors:  84%|████████████████▊   | 3.24G/3.87G [00:19<00:02, 211MB/s]

model.safetensors:  85%|████████████████▉   | 3.27G/3.87G [00:19<00:02, 211MB/s]

model.safetensors:  85%|█████████████████   | 3.30G/3.87G [00:19<00:02, 213MB/s]

model.safetensors:  86%|█████████████████▎  | 3.33G/3.87G [00:20<00:02, 213MB/s]

model.safetensors:  87%|█████████████████▍  | 3.37G/3.87G [00:20<00:02, 212MB/s]

model.safetensors:  88%|█████████████████▌  | 3.40G/3.87G [00:20<00:02, 212MB/s]

model.safetensors:  89%|█████████████████▋  | 3.43G/3.87G [00:20<00:02, 209MB/s]

model.safetensors:  90%|█████████████████▉  | 3.46G/3.87G [00:20<00:01, 210MB/s]

model.safetensors:  90%|██████████████████  | 3.49G/3.87G [00:20<00:01, 210MB/s]

model.safetensors:  91%|██████████████████▏ | 3.52G/3.87G [00:20<00:01, 211MB/s]

model.safetensors:  92%|██████████████████▍ | 3.55G/3.87G [00:21<00:01, 210MB/s]

model.safetensors:  93%|██████████████████▌ | 3.59G/3.87G [00:21<00:01, 210MB/s]

model.safetensors:  94%|██████████████████▋ | 3.62G/3.87G [00:21<00:01, 211MB/s]

model.safetensors:  94%|██████████████████▉ | 3.65G/3.87G [00:21<00:01, 212MB/s]

model.safetensors:  95%|███████████████████ | 3.68G/3.87G [00:21<00:00, 212MB/s]

model.safetensors:  96%|███████████████████▏| 3.71G/3.87G [00:21<00:00, 212MB/s]

model.safetensors:  97%|███████████████████▎| 3.74G/3.87G [00:21<00:00, 210MB/s]

model.safetensors:  98%|███████████████████▌| 3.77G/3.87G [00:22<00:00, 209MB/s]

model.safetensors:  98%|███████████████████▋| 3.81G/3.87G [00:22<00:00, 211MB/s]

model.safetensors:  99%|███████████████████▊| 3.84G/3.87G [00:22<00:00, 211MB/s]

model.safetensors: 100%|████████████████████| 3.87G/3.87G [00:22<00:00, 211MB/s]model.safetensors: 100%|████████████████████| 3.87G/3.87G [00:22<00:00, 171MB/s]
[INFO|modeling_utils.py:3283] 2024-04-19 16:06:51,846 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/model.safetensors


[INFO|modeling_utils.py:1417] 2024-04-19 16:06:51,906 >> Instantiating LlamaForCausalLM model under default dtype torch.float16.
[INFO|configuration_utils.py:928] 2024-04-19 16:06:51,911 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 2
}



[INFO|modeling_utils.py:4024] 2024-04-19 16:06:54,668 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|modeling_utils.py:4032] 2024-04-19 16:06:54,668 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at unsloth/llama-2-7b-bnb-4bit.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.


generation_config.json:   0%|                         | 0.00/188 [00:00<?, ?B/s]generation_config.json: 100%|███████████████████| 188/188 [00:00<00:00, 848kB/s]
[INFO|configuration_utils.py:883] 2024-04-19 16:06:54,797 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/generation_config.json
[INFO|configuration_utils.py:928] 2024-04-19 16:06:54,798 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}



tokenizer_config.json:   0%|                          | 0.00/894 [00:00<?, ?B/s]tokenizer_config.json: 100%|███████████████████| 894/894 [00:00<00:00, 4.55MB/s]


tokenizer.model:   0%|                               | 0.00/500k [00:00<?, ?B/s]tokenizer.model: 100%|████████████████████████| 500k/500k [00:00<00:00, 338MB/s]


special_tokens_map.json:   0%|                        | 0.00/438 [00:00<?, ?B/s]special_tokens_map.json: 100%|██████████████████| 438/438 [00:00<00:00, 990kB/s]


tokenizer.json:   0%|                               | 0.00/1.84M [00:00<?, ?B/s]

tokenizer.json: 100%|██████████████████████| 1.84M/1.84M [00:00<00:00, 30.9MB/s]
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:55,746 >> loading file tokenizer.model from cache at huggingface_tokenizers_cache/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/tokenizer.model
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:55,746 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:55,746 >> loading file special_tokens_map.json from cache at huggingface_tokenizers_cache/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/special_tokens_map.json
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:55,746 >> loading file tokenizer_config.json from cache at huggingface_tokenizers_cache/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/tokenizer_config.json
[INFO|tokenization_utils_base.py:2084] 2024

[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:56,022 >> loading file tokenizer.model from cache at huggingface_tokenizers_cache/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/tokenizer.model
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:56,022 >> loading file tokenizer.json from cache at huggingface_tokenizers_cache/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/tokenizer.json
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:56,022 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:56,022 >> loading file special_tokens_map.json from cache at huggingface_tokenizers_cache/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/special_tokens_map.json
[INFO|tokenization_utils_base.py:2084] 2024-04-19 16:06:56,023 >> loading file tokenizer_config.json from cache at huggingface_tokenizers_ca

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers




[INFO|tokenization_utils_base.py:2502] 2024-04-19 16:06:57,719 >> tokenizer config file saved in _unsloth_sentencepiece_temp/unsloth_llama-2-7b-bnb-4bit/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 16:06:57,719 >> Special tokens file saved in _unsloth_sentencepiece_temp/unsloth_llama-2-7b-bnb-4bit/special_tokens_map.json
[INFO|tokenization_utils_base.py:2502] 2024-04-19 16:06:57,721 >> tokenizer config file saved in _unsloth_sentencepiece_temp/unsloth_llama-2-7b-bnb-4bit/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 16:06:57,721 >> Special tokens file saved in _unsloth_sentencepiece_temp/unsloth_llama-2-7b-bnb-4bit/special_tokens_map.json


[INFO|tokenization_utils_base.py:2082] 2024-04-19 16:06:57,756 >> loading file tokenizer.model
[INFO|tokenization_utils_base.py:2082] 2024-04-19 16:06:57,756 >> loading file tokenizer.json
[INFO|tokenization_utils_base.py:2082] 2024-04-19 16:06:57,756 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:2082] 2024-04-19 16:06:57,756 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:2082] 2024-04-19 16:06:57,756 >> loading file tokenizer_config.json


04/19/2024 16:06:58 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
04/19/2024 16:06:58 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA


04/19/2024 16:06:58 - INFO - llmtuner.model.patcher - Gradient checkpointing enabled.
04/19/2024 16:06:58 - INFO - llmtuner.model.adapter - Fine-tuning method: LoRA


Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


04/19/2024 16:06:59 - INFO - llmtuner.model.loader - trainable params: 19988480 || all params: 6758404096 || trainable%: 0.2958


04/19/2024 16:06:59 - INFO - llmtuner.model.loader - trainable params: 19988480 || all params: 6758404096 || trainable%: 0.2958


[INFO|trainer.py:607] 2024-04-19 16:06:59,609 >> Using auto half precision backend
[INFO|trainer.py:2348] 2024-04-19 16:06:59,609 >> Loading model from /kaggle/working/llama2/checkpoint-2700.


   \\   /|    Num examples = 21,709 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 13,570
 "-____-"     Number of trainable parameters = 19,988,480
[INFO|<string>:235] 2024-04-19 16:07:03,214 >>   Continuing training from checkpoint, will skip to saved global_step
[INFO|<string>:236] 2024-04-19 16:07:03,214 >>   Continuing training from epoch 0
[INFO|<string>:237] 2024-04-19 16:07:03,214 >>   Continuing training from global step 2700
[INFO|<string>:239] 2024-04-19 16:07:03,214 >>   Will skip the first 0 epochs then the first 5400 batches in the first epoch.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 2
   \\   /|    Num examples = 21,709 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 13,570
 "-____-"     Number of trainable parameters = 19,988,480
  0%|              

 20%|███████▎                             | 2701/13570 [00:21<01:26, 125.92it/s]

 20%|███████▎                             | 2701/13570 [00:33<01:26, 125.92it/s]

 20%|███████▌                              | 2702/13570 [00:38<03:06, 58.32it/s]

 20%|███████▌                              | 2703/13570 [00:56<05:33, 32.60it/s]

 20%|███████▌                              | 2704/13570 [01:14<09:00, 20.09it/s]

 20%|███████▌                              | 2705/13570 [01:32<14:04, 12.87it/s]

 20%|███████▌                              | 2706/13570 [01:51<21:20,  8.48it/s]

 20%|███████▌                              | 2707/13570 [02:09<31:48,  5.69it/s]

 20%|███████▌                              | 2708/13570 [02:28<46:48,  3.87it/s]

 20%|███████▏                            | 2709/13570 [02:47<1:08:06,  2.66it/s]

 20%|███████▏                            | 2710/13570 [03:06<1:38:15,  1.84it/s]

 20%|███████▏                            | 2711/13570 [03:25<2:20:14,  1.29it/s]

 20%|███████▏                            | 2712/13570 [03:44<3:18:37,  1.10s/it]

 20%|███████▏                            | 2713/13570 [04:03<4:38:13,  1.54s/it]

 20%|███████▏                            | 2714/13570 [04:23<6:26:25,  2.14s/it]

 20%|███████▏                            | 2715/13570 [04:42<8:52:07,  2.94s/it]

 20%|███████                            | 2716/13570 [05:01<11:56:39,  3.96s/it]

 20%|███████                            | 2717/13570 [05:21<15:47:35,  5.24s/it]

 20%|███████                            | 2718/13570 [05:40<20:12:22,  6.70s/it]

 20%|███████                            | 2719/13570 [06:00<25:10:56,  8.35s/it]

 20%|███████                            | 2720/13570 [06:19<30:23:30, 10.08s/it]

 20%|███████                            | 2721/13570 [06:39<35:47:49, 11.88s/it]

 20%|███████                            | 2722/13570 [06:59<40:33:29, 13.46s/it]

 20%|███████                            | 2723/13570 [07:19<44:51:55, 14.89s/it]

 20%|███████                            | 2724/13570 [07:38<48:23:21, 16.06s/it]

 20%|███████                            | 2725/13570 [07:58<51:21:59, 17.05s/it]

 20%|███████                            | 2726/13570 [08:18<53:48:09, 17.86s/it]

 20%|███████                            | 2727/13570 [08:38<55:16:30, 18.35s/it]

 20%|███████                            | 2728/13570 [08:57<56:13:08, 18.67s/it]

 20%|███████                            | 2729/13570 [09:17<57:03:21, 18.95s/it]

 20%|███████                            | 2730/13570 [09:36<57:39:42, 19.15s/it]

 20%|███████                            | 2731/13570 [09:56<58:18:17, 19.36s/it]

 20%|███████                            | 2732/13570 [10:16<58:39:00, 19.48s/it]

 20%|███████                            | 2733/13570 [10:36<58:46:07, 19.52s/it]

 20%|███████                            | 2734/13570 [10:55<58:58:05, 19.59s/it]

 20%|███████                            | 2735/13570 [11:15<59:04:17, 19.63s/it]

 20%|███████                            | 2736/13570 [11:35<59:00:22, 19.61s/it]

 20%|███████                            | 2737/13570 [11:54<59:02:11, 19.62s/it]

 20%|███████                            | 2738/13570 [12:15<59:32:31, 19.79s/it]

 20%|███████                            | 2739/13570 [12:34<59:14:18, 19.69s/it]

 20%|███████                            | 2740/13570 [12:54<59:30:24, 19.78s/it]

 20%|███████                            | 2741/13570 [13:13<59:07:45, 19.66s/it]

 20%|███████                            | 2742/13570 [13:33<59:14:10, 19.69s/it]

 20%|███████                            | 2743/13570 [13:53<59:02:27, 19.63s/it]

 20%|███████                            | 2744/13570 [14:12<58:55:18, 19.59s/it]

 20%|███████                            | 2745/13570 [14:32<58:52:02, 19.58s/it]

 20%|███████                            | 2746/13570 [14:52<59:03:06, 19.64s/it]

 20%|███████                            | 2747/13570 [15:11<58:55:04, 19.60s/it]

 20%|███████                            | 2748/13570 [15:31<58:57:09, 19.61s/it]

 20%|███████                            | 2749/13570 [15:51<59:22:56, 19.76s/it]

 20%|███████                            | 2750/13570 [16:11<59:35:32, 19.83s/it]                                                                                {'loss': 1.6743, 'grad_norm': 0.28103217482566833, 'learning_rate': 0.0003, 'epoch': 1.01}
 20%|███████                            | 2750/13570 [16:11<59:35:32, 19.83s/it]

 20%|███████                            | 2751/13570 [16:30<59:14:52, 19.71s/it]

 20%|███████                            | 2752/13570 [16:50<59:02:41, 19.65s/it]

 20%|███████                            | 2753/13570 [17:09<58:56:14, 19.61s/it]

 20%|███████                            | 2754/13570 [17:29<59:09:31, 19.69s/it]

 20%|███████                            | 2755/13570 [17:49<59:03:39, 19.66s/it]

 20%|███████                            | 2756/13570 [18:08<59:04:15, 19.66s/it]

 20%|███████                            | 2757/13570 [18:28<59:04:10, 19.67s/it]

 20%|███████                            | 2758/13570 [18:47<58:48:47, 19.58s/it]

 20%|███████                            | 2759/13570 [19:07<58:57:16, 19.63s/it]

 20%|███████                            | 2760/13570 [19:27<58:50:13, 19.59s/it]

 20%|███████                            | 2761/13570 [19:46<58:57:48, 19.64s/it]

 20%|███████                            | 2762/13570 [20:06<59:18:31, 19.75s/it]

 20%|███████▏                           | 2763/13570 [20:26<59:11:19, 19.72s/it]

 20%|███████▏                           | 2764/13570 [20:46<59:13:04, 19.73s/it]

 20%|███████▏                           | 2765/13570 [21:05<59:00:26, 19.66s/it]

 20%|███████▏                           | 2766/13570 [21:25<59:04:56, 19.69s/it]

 20%|███████▏                           | 2767/13570 [21:45<58:58:53, 19.66s/it]

 20%|███████▏                           | 2768/13570 [22:05<59:12:56, 19.73s/it]

 20%|███████▏                           | 2769/13570 [22:24<59:12:29, 19.73s/it]

 20%|███████▏                           | 2770/13570 [22:44<59:07:16, 19.71s/it]

 20%|███████▏                           | 2771/13570 [23:04<59:28:50, 19.83s/it]

 20%|███████▏                           | 2772/13570 [23:24<59:26:20, 19.82s/it]

 20%|███████▏                           | 2773/13570 [23:44<59:39:33, 19.89s/it]

 20%|███████▏                           | 2774/13570 [24:03<59:12:44, 19.74s/it]

 20%|███████▏                           | 2775/13570 [24:23<59:02:48, 19.69s/it]

 20%|███████▏                           | 2776/13570 [24:43<59:24:55, 19.82s/it]

 20%|███████▏                           | 2777/13570 [25:03<59:38:16, 19.89s/it]

 20%|███████▏                           | 2778/13570 [25:23<59:32:10, 19.86s/it]

 20%|███████▏                           | 2779/13570 [25:42<59:18:14, 19.78s/it]

 20%|███████▏                           | 2780/13570 [26:02<59:00:31, 19.69s/it]

 20%|███████▏                           | 2781/13570 [26:21<58:52:50, 19.65s/it]

 21%|███████▏                           | 2782/13570 [26:41<58:36:46, 19.56s/it]

 21%|███████▏                           | 2783/13570 [27:00<58:32:54, 19.54s/it]

 21%|███████▏                           | 2784/13570 [27:20<58:30:49, 19.53s/it]

 21%|███████▏                           | 2785/13570 [27:40<58:56:51, 19.68s/it]

 21%|███████▏                           | 2786/13570 [28:00<59:03:19, 19.71s/it]

 21%|███████▏                           | 2787/13570 [28:19<58:55:30, 19.67s/it]

 21%|███████▏                           | 2788/13570 [28:39<58:51:52, 19.65s/it]

 21%|███████▏                           | 2789/13570 [28:58<58:51:15, 19.65s/it]

 21%|███████▏                           | 2790/13570 [29:18<58:38:03, 19.58s/it]

 21%|███████▏                           | 2791/13570 [29:38<58:45:41, 19.63s/it]

 21%|███████▏                           | 2792/13570 [29:57<58:54:59, 19.68s/it]

 21%|███████▏                           | 2793/13570 [30:17<58:42:23, 19.61s/it]

 21%|███████▏                           | 2794/13570 [30:37<58:53:27, 19.67s/it]

 21%|███████▏                           | 2795/13570 [30:56<58:49:26, 19.65s/it]

 21%|███████▏                           | 2796/13570 [31:16<59:01:50, 19.72s/it]

 21%|███████▏                           | 2797/13570 [31:36<58:54:16, 19.68s/it]

 21%|███████▏                           | 2798/13570 [31:56<59:10:38, 19.78s/it]

 21%|███████▏                           | 2799/13570 [32:15<59:06:09, 19.75s/it]

 21%|███████▏                           | 2800/13570 [32:35<59:11:42, 19.79s/it]                                                                                {'loss': 1.6736, 'grad_norm': 0.32094869017601013, 'learning_rate': 0.0003, 'epoch': 1.03}
 21%|███████▏                           | 2800/13570 [32:35<59:11:42, 19.79s/it][INFO|trainer.py:3512] 2024-04-19 16:39:39,078 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 16:39:39,078 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 16:39:39,078 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.55s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.19s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.53s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.73s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.85s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.93s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.98s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.04s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.06s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.07s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.08s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.08s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.09s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.09s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:33,  3.09s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:30,  3.09s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.09s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.09s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.09s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:04<00:18,  3.10s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.10s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.10s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.10s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.10s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.10s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A                                                                                
                                                                                [A{'eval_loss': 1.7982343435287476, 'eval_runtime': 87.0466, 'eval_samples_per_second': 1.264, 'eval_steps_per_second': 0.322, 'epoch': 1.03}
 21%|███████▏                           | 2800/13570 [34:02<59:11:42, 19.79s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 16:41:06,127 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-2800


[INFO|configuration_utils.py:726] 2024-04-19 16:41:06,384 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 16:41:06,386 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 16:41:06,591 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-2800/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 16:41:06,592 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-2800/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 16:41:06,966 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json


[INFO|tokenization_utils_base.py:2511] 2024-04-19 16:41:06,966 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 16:41:06,979 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-100] due to args.save_total_limit
[INFO|trainer.py:3295] 2024-04-19 16:41:07,007 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-200] due to args.save_total_limit


[INFO|trainer.py:3295] 2024-04-19 16:41:07,035 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-300] due to args.save_total_limit
[INFO|trainer.py:3295] 2024-04-19 16:41:07,061 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-400] due to args.save_total_limit


[INFO|trainer.py:3295] 2024-04-19 16:41:07,088 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-500] due to args.save_total_limit
[INFO|trainer.py:3295] 2024-04-19 16:41:07,114 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-600] due to args.save_total_limit


[INFO|trainer.py:3295] 2024-04-19 16:41:07,139 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-700] due to args.save_total_limit
[INFO|trainer.py:3295] 2024-04-19 16:41:07,164 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-800] due to args.save_total_limit


 21%|███████                           | 2801/13570 [34:23<138:18:43, 46.24s/it]

 21%|███████                           | 2802/13570 [34:43<114:31:21, 38.29s/it]

 21%|███████▏                           | 2803/13570 [35:03<97:55:58, 32.74s/it]

 21%|███████▏                           | 2804/13570 [35:22<86:06:30, 28.79s/it]

 21%|███████▏                           | 2805/13570 [35:42<77:45:01, 26.00s/it]

 21%|███████▏                           | 2806/13570 [36:01<71:56:49, 24.06s/it]

 21%|███████▏                           | 2807/13570 [36:21<68:10:47, 22.80s/it]

 21%|███████▏                           | 2808/13570 [36:41<65:28:13, 21.90s/it]

 21%|███████▏                           | 2809/13570 [37:01<63:50:54, 21.36s/it]

 21%|███████▏                           | 2810/13570 [37:21<62:30:52, 20.92s/it]

 21%|███████▎                           | 2811/13570 [37:41<61:46:41, 20.67s/it]

 21%|███████▎                           | 2812/13570 [38:01<60:51:47, 20.37s/it]

 21%|███████▎                           | 2813/13570 [38:21<60:21:29, 20.20s/it]

 21%|███████▎                           | 2814/13570 [38:41<60:06:28, 20.12s/it]

 21%|███████▎                           | 2815/13570 [39:00<59:37:25, 19.96s/it]

 21%|███████▎                           | 2816/13570 [39:20<59:08:40, 19.80s/it]

 21%|███████▎                           | 2817/13570 [39:39<59:08:43, 19.80s/it]

 21%|███████▎                           | 2818/13570 [39:59<59:07:48, 19.80s/it]

 21%|███████▎                           | 2819/13570 [40:19<58:53:09, 19.72s/it]

 21%|███████▎                           | 2820/13570 [40:38<58:43:59, 19.67s/it]

 21%|███████▎                           | 2821/13570 [40:58<58:39:36, 19.65s/it]

 21%|███████▎                           | 2822/13570 [41:18<59:02:26, 19.78s/it]

 21%|███████▎                           | 2823/13570 [41:38<59:00:50, 19.77s/it]

 21%|███████▎                           | 2824/13570 [41:58<59:19:49, 19.88s/it]

 21%|███████▎                           | 2825/13570 [42:17<59:09:00, 19.82s/it]

 21%|███████▎                           | 2826/13570 [42:38<59:29:04, 19.93s/it]

 21%|███████▎                           | 2827/13570 [42:58<59:35:12, 19.97s/it]

 21%|███████▎                           | 2828/13570 [43:17<59:23:18, 19.90s/it]

 21%|███████▎                           | 2829/13570 [43:37<59:21:35, 19.90s/it]

 21%|███████▎                           | 2830/13570 [43:57<59:32:19, 19.96s/it]

 21%|███████▎                           | 2831/13570 [44:17<59:18:30, 19.88s/it]

 21%|███████▎                           | 2832/13570 [44:37<59:18:50, 19.89s/it]

 21%|███████▎                           | 2833/13570 [44:56<58:52:16, 19.74s/it]

 21%|███████▎                           | 2834/13570 [45:16<59:00:43, 19.79s/it]

 21%|███████▎                           | 2835/13570 [45:36<58:56:35, 19.77s/it]

 21%|███████▎                           | 2836/13570 [45:56<59:04:17, 19.81s/it]

 21%|███████▎                           | 2837/13570 [46:16<58:59:36, 19.79s/it]

 21%|███████▎                           | 2838/13570 [46:36<59:05:05, 19.82s/it]

 21%|███████▎                           | 2839/13570 [46:56<59:11:41, 19.86s/it]

 21%|███████▎                           | 2840/13570 [47:16<59:14:52, 19.88s/it]

 21%|███████▎                           | 2841/13570 [47:35<58:55:20, 19.77s/it]

 21%|███████▎                           | 2842/13570 [47:55<59:08:36, 19.85s/it]

 21%|███████▎                           | 2843/13570 [48:15<59:06:03, 19.83s/it]

 21%|███████▎                           | 2844/13570 [48:35<58:57:29, 19.79s/it]

 21%|███████▎                           | 2845/13570 [48:54<59:00:40, 19.81s/it]

 21%|███████▎                           | 2846/13570 [49:15<59:20:02, 19.92s/it]

 21%|███████▎                           | 2847/13570 [49:34<59:15:05, 19.89s/it]

 21%|███████▎                           | 2848/13570 [49:54<58:53:53, 19.78s/it]

 21%|███████▎                           | 2849/13570 [50:14<58:45:51, 19.73s/it]

 21%|███████▎                           | 2850/13570 [50:33<58:38:59, 19.70s/it]

                                                                                {'loss': 1.6667, 'grad_norm': 0.34613725543022156, 'learning_rate': 0.0003, 'epoch': 1.05}
 21%|███████▎                           | 2850/13570 [50:33<58:38:59, 19.70s/it]

 21%|███████▎                           | 2851/13570 [50:53<58:31:25, 19.66s/it]

 21%|███████▎                           | 2852/13570 [51:13<58:45:29, 19.74s/it]

 21%|███████▎                           | 2853/13570 [51:32<58:50:07, 19.76s/it]

 21%|███████▎                           | 2854/13570 [51:52<59:01:39, 19.83s/it]

 21%|███████▎                           | 2855/13570 [52:13<59:15:55, 19.91s/it]

 21%|███████▎                           | 2856/13570 [52:33<59:30:01, 19.99s/it]

 21%|███████▎                           | 2857/13570 [52:53<59:26:39, 19.98s/it]

 21%|███████▎                           | 2858/13570 [53:13<59:31:45, 20.01s/it]

 21%|███████▎                           | 2859/13570 [53:33<59:31:11, 20.00s/it]

 21%|███████▍                           | 2860/13570 [53:52<59:07:10, 19.87s/it]

 21%|███████▍                           | 2861/13570 [54:12<59:04:17, 19.86s/it]

 21%|███████▍                           | 2862/13570 [54:32<58:58:42, 19.83s/it]

 21%|███████▍                           | 2863/13570 [54:52<58:51:13, 19.79s/it]

 21%|███████▍                           | 2864/13570 [55:11<58:55:15, 19.81s/it]

 21%|███████▍                           | 2865/13570 [55:31<58:39:31, 19.73s/it]

 21%|███████▍                           | 2866/13570 [55:51<58:40:21, 19.73s/it]

 21%|███████▍                           | 2867/13570 [56:11<59:11:11, 19.91s/it]

 21%|███████▍                           | 2868/13570 [56:31<59:02:00, 19.86s/it]

 21%|███████▍                           | 2869/13570 [56:51<58:56:49, 19.83s/it]

 21%|███████▍                           | 2870/13570 [57:10<58:46:28, 19.77s/it]

 21%|███████▍                           | 2871/13570 [57:30<58:44:57, 19.77s/it]

 21%|███████▍                           | 2872/13570 [57:50<58:42:51, 19.76s/it]

 21%|███████▍                           | 2873/13570 [58:10<58:45:59, 19.78s/it]

 21%|███████▍                           | 2874/13570 [58:29<58:54:23, 19.83s/it]

 21%|███████▍                           | 2875/13570 [58:49<58:52:37, 19.82s/it]

 21%|███████▍                           | 2876/13570 [59:09<58:45:17, 19.78s/it]

 21%|███████▍                           | 2877/13570 [59:29<58:44:37, 19.78s/it]

 21%|███████▍                           | 2878/13570 [59:49<58:47:45, 19.80s/it]

 21%|███████                          | 2879/13570 [1:00:09<59:14:09, 19.95s/it]

 21%|███████                          | 2880/13570 [1:00:28<58:49:50, 19.81s/it]

 21%|███████                          | 2881/13570 [1:00:48<58:53:15, 19.83s/it]

 21%|███████                          | 2882/13570 [1:01:08<58:56:48, 19.85s/it]

 21%|███████                          | 2883/13570 [1:01:28<59:16:07, 19.97s/it]

 21%|███████                          | 2884/13570 [1:01:48<59:23:59, 20.01s/it]

 21%|███████                          | 2885/13570 [1:02:08<59:14:33, 19.96s/it]

 21%|███████                          | 2886/13570 [1:02:28<59:18:03, 19.98s/it]

 21%|███████                          | 2887/13570 [1:02:48<59:26:03, 20.03s/it]

 21%|███████                          | 2888/13570 [1:03:08<59:17:25, 19.98s/it]

 21%|███████                          | 2889/13570 [1:03:28<58:59:36, 19.88s/it]

 21%|███████                          | 2890/13570 [1:03:48<59:10:35, 19.95s/it]

 21%|███████                          | 2891/13570 [1:04:08<59:07:27, 19.93s/it]

 21%|███████                          | 2892/13570 [1:04:28<58:44:04, 19.80s/it]

 21%|███████                          | 2893/13570 [1:04:47<58:42:41, 19.80s/it]

 21%|███████                          | 2894/13570 [1:05:07<58:58:55, 19.89s/it]

 21%|███████                          | 2895/13570 [1:05:27<58:55:38, 19.87s/it]

 21%|███████                          | 2896/13570 [1:05:47<58:54:32, 19.87s/it]

 21%|███████                          | 2897/13570 [1:06:07<58:55:54, 19.88s/it]

 21%|███████                          | 2898/13570 [1:06:26<58:34:20, 19.76s/it]

 21%|███████                          | 2899/13570 [1:06:46<58:40:52, 19.80s/it]

 21%|███████                          | 2900/13570 [1:07:06<58:50:05, 19.85s/it]                                                                                {'loss': 1.6551, 'grad_norm': 0.29416289925575256, 'learning_rate': 0.0003, 'epoch': 1.07}
 21%|███████                          | 2900/13570 [1:07:06<58:50:05, 19.85s/it][INFO|trainer.py:3512] 2024-04-19 17:14:10,063 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 17:14:10,063 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 17:14:10,063 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.55s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.20s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.53s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.73s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.85s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.93s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.99s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.02s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.05s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.06s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.08s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.09s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.09s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.10s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.10s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.10s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.10s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.10s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.10s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.10s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.10s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.10s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.10s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.10s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.09s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.09s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.11s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8037728071212769, 'eval_runtime': 87.1037, 'eval_samples_per_second': 1.263, 'eval_steps_per_second': 0.321, 'epoch': 1.07}
 21%|███████                          | 2900/13570 [1:08:33<58:50:05, 19.85s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.11s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 17:15:37,169 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-2900


[INFO|configuration_utils.py:726] 2024-04-19 17:15:37,412 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 17:15:37,413 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 17:15:37,612 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-2900/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 17:15:37,612 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-2900/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 17:15:37,937 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 17:15:37,938 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json


[INFO|trainer.py:3295] 2024-04-19 17:15:37,944 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-900] due to args.save_total_limit


 21%|██████▊                         | 2901/13570 [1:08:54<136:43:00, 46.13s/it]

 21%|██████▊                         | 2902/13570 [1:09:13<113:08:49, 38.18s/it]

 21%|███████                          | 2903/13570 [1:09:33<96:49:11, 32.68s/it]

 21%|███████                          | 2904/13570 [1:09:53<85:12:17, 28.76s/it]

 21%|███████                          | 2905/13570 [1:10:13<77:33:20, 26.18s/it]

 21%|███████                          | 2906/13570 [1:10:33<71:47:47, 24.24s/it]

 21%|███████                          | 2907/13570 [1:10:53<67:59:04, 22.95s/it]

 21%|███████                          | 2908/13570 [1:11:12<64:52:17, 21.90s/it]

 21%|███████                          | 2909/13570 [1:11:32<62:58:29, 21.27s/it]

 21%|███████                          | 2910/13570 [1:11:51<61:19:37, 20.71s/it]

 21%|███████                          | 2911/13570 [1:12:11<60:33:51, 20.46s/it]

 21%|███████                          | 2912/13570 [1:12:31<59:59:55, 20.27s/it]

 21%|███████                          | 2913/13570 [1:12:51<59:36:06, 20.13s/it]

 21%|███████                          | 2914/13570 [1:13:11<59:21:17, 20.05s/it]

 21%|███████                          | 2915/13570 [1:13:31<59:13:40, 20.01s/it]

 21%|███████                          | 2916/13570 [1:13:51<59:17:38, 20.04s/it]

 21%|███████                          | 2917/13570 [1:14:11<59:08:19, 19.98s/it]

 22%|███████                          | 2918/13570 [1:14:31<59:17:15, 20.04s/it]

 22%|███████                          | 2919/13570 [1:14:50<59:02:16, 19.95s/it]

 22%|███████                          | 2920/13570 [1:15:10<59:03:10, 19.96s/it]

 22%|███████                          | 2921/13570 [1:15:30<58:58:52, 19.94s/it]

 22%|███████                          | 2922/13570 [1:15:50<58:58:02, 19.94s/it]

 22%|███████                          | 2923/13570 [1:16:10<58:32:31, 19.79s/it]

 22%|███████                          | 2924/13570 [1:16:29<58:27:33, 19.77s/it]

 22%|███████                          | 2925/13570 [1:16:49<58:24:54, 19.76s/it]

 22%|███████                          | 2926/13570 [1:17:09<58:36:07, 19.82s/it]

 22%|███████                          | 2927/13570 [1:17:29<58:51:34, 19.91s/it]

 22%|███████                          | 2928/13570 [1:17:49<58:59:30, 19.96s/it]

 22%|███████                          | 2929/13570 [1:18:09<58:54:01, 19.93s/it]

 22%|███████▏                         | 2930/13570 [1:18:29<58:50:01, 19.91s/it]

 22%|███████▏                         | 2931/13570 [1:18:49<58:45:31, 19.88s/it]

 22%|███████▏                         | 2932/13570 [1:19:09<58:33:53, 19.82s/it]

 22%|███████▏                         | 2933/13570 [1:19:28<58:19:11, 19.74s/it]

 22%|███████▏                         | 2934/13570 [1:19:49<58:54:13, 19.94s/it]

 22%|███████▏                         | 2935/13570 [1:20:08<58:54:27, 19.94s/it]

 22%|███████▏                         | 2936/13570 [1:20:28<58:52:41, 19.93s/it]

 22%|███████▏                         | 2937/13570 [1:20:48<58:54:05, 19.94s/it]

 22%|███████▏                         | 2938/13570 [1:21:08<58:30:55, 19.81s/it]

 22%|███████▏                         | 2939/13570 [1:21:28<58:34:58, 19.84s/it]

 22%|███████▏                         | 2940/13570 [1:21:47<58:26:08, 19.79s/it]

 22%|███████▏                         | 2941/13570 [1:22:07<58:21:31, 19.77s/it]

 22%|███████▏                         | 2942/13570 [1:22:27<58:28:32, 19.81s/it]

 22%|███████▏                         | 2943/13570 [1:22:47<58:42:04, 19.89s/it]

 22%|███████▏                         | 2944/13570 [1:23:07<58:39:46, 19.87s/it]

 22%|███████▏                         | 2945/13570 [1:23:27<58:36:36, 19.86s/it]

 22%|███████▏                         | 2946/13570 [1:23:47<58:35:59, 19.86s/it]

 22%|███████▏                         | 2947/13570 [1:24:06<58:32:11, 19.84s/it]

 22%|███████▏                         | 2948/13570 [1:24:26<58:12:15, 19.73s/it]

 22%|███████▏                         | 2949/13570 [1:24:46<58:08:32, 19.71s/it]

 22%|███████▏                         | 2950/13570 [1:25:06<58:23:56, 19.80s/it]                                                                                {'loss': 1.6822, 'grad_norm': 0.3634116053581238, 'learning_rate': 0.0003, 'epoch': 1.09}
 22%|███████▏                         | 2950/13570 [1:25:06<58:23:56, 19.80s/it]

 22%|███████▏                         | 2951/13570 [1:25:25<58:17:34, 19.76s/it]

 22%|███████▏                         | 2952/13570 [1:25:45<58:28:45, 19.83s/it]

 22%|███████▏                         | 2953/13570 [1:26:05<58:12:09, 19.74s/it]

 22%|███████▏                         | 2954/13570 [1:26:25<58:13:45, 19.75s/it]

 22%|███████▏                         | 2955/13570 [1:26:44<58:11:55, 19.74s/it]

 22%|███████▏                         | 2956/13570 [1:27:04<58:08:46, 19.72s/it]

 22%|███████▏                         | 2957/13570 [1:27:23<58:01:30, 19.68s/it]

 22%|███████▏                         | 2958/13570 [1:27:43<58:13:06, 19.75s/it]

 22%|███████▏                         | 2959/13570 [1:28:03<58:08:31, 19.73s/it]

 22%|███████▏                         | 2960/13570 [1:28:23<58:10:42, 19.74s/it]

 22%|███████▏                         | 2961/13570 [1:28:42<57:59:56, 19.68s/it]

 22%|███████▏                         | 2962/13570 [1:29:02<58:04:37, 19.71s/it]

 22%|███████▏                         | 2963/13570 [1:29:22<57:57:34, 19.67s/it]

 22%|███████▏                         | 2964/13570 [1:29:42<58:04:00, 19.71s/it]

 22%|███████▏                         | 2965/13570 [1:30:01<57:56:06, 19.67s/it]

 22%|███████▏                         | 2966/13570 [1:30:21<58:06:01, 19.72s/it]

 22%|███████▏                         | 2967/13570 [1:30:41<58:17:22, 19.79s/it]

 22%|███████▏                         | 2968/13570 [1:31:01<58:20:04, 19.81s/it]

 22%|███████▏                         | 2969/13570 [1:31:21<58:16:50, 19.79s/it]

 22%|███████▏                         | 2970/13570 [1:31:40<58:11:32, 19.76s/it]

 22%|███████▏                         | 2971/13570 [1:32:00<58:12:24, 19.77s/it]

 22%|███████▏                         | 2972/13570 [1:32:20<58:01:19, 19.71s/it]

 22%|███████▏                         | 2973/13570 [1:32:40<58:14:39, 19.79s/it]

 22%|███████▏                         | 2974/13570 [1:32:59<58:13:05, 19.78s/it]

 22%|███████▏                         | 2975/13570 [1:33:19<58:19:55, 19.82s/it]

 22%|███████▏                         | 2976/13570 [1:33:39<57:58:42, 19.70s/it]

 22%|███████▏                         | 2977/13570 [1:33:59<58:26:48, 19.86s/it]

 22%|███████▏                         | 2978/13570 [1:34:19<58:42:42, 19.95s/it]

 22%|███████▏                         | 2979/13570 [1:34:39<58:39:20, 19.94s/it]

 22%|███████▏                         | 2980/13570 [1:34:59<58:50:04, 20.00s/it]

 22%|███████▏                         | 2981/13570 [1:35:19<58:59:51, 20.06s/it]

 22%|███████▎                         | 2982/13570 [1:35:39<58:50:26, 20.01s/it]

 22%|███████▎                         | 2983/13570 [1:35:59<59:02:02, 20.07s/it]

 22%|███████▎                         | 2984/13570 [1:36:19<58:58:06, 20.05s/it]

 22%|███████▎                         | 2985/13570 [1:36:39<58:50:23, 20.01s/it]

 22%|███████▎                         | 2986/13570 [1:36:59<58:40:06, 19.96s/it]

 22%|███████▎                         | 2987/13570 [1:37:19<58:11:58, 19.80s/it]

 22%|███████▎                         | 2988/13570 [1:37:38<58:05:56, 19.77s/it]

 22%|███████▎                         | 2989/13570 [1:37:58<58:21:19, 19.85s/it]

 22%|███████▎                         | 2990/13570 [1:38:18<58:31:38, 19.91s/it]

 22%|███████▎                         | 2991/13570 [1:38:38<58:19:41, 19.85s/it]

 22%|███████▎                         | 2992/13570 [1:38:58<58:12:23, 19.81s/it]

 22%|███████▎                         | 2993/13570 [1:39:18<58:34:10, 19.93s/it]

 22%|███████▎                         | 2994/13570 [1:39:38<58:16:38, 19.84s/it]

 22%|███████▎                         | 2995/13570 [1:39:58<58:29:04, 19.91s/it]

 22%|███████▎                         | 2996/13570 [1:40:18<58:24:17, 19.88s/it]

 22%|███████▎                         | 2997/13570 [1:40:38<58:47:55, 20.02s/it]

 22%|███████▎                         | 2998/13570 [1:40:58<58:52:39, 20.05s/it]

 22%|███████▎                         | 2999/13570 [1:41:18<59:00:06, 20.09s/it]

 22%|███████▎                         | 3000/13570 [1:41:38<58:46:30, 20.02s/it]                                                                                {'loss': 1.6839, 'grad_norm': 0.32180094718933105, 'learning_rate': 0.0003, 'epoch': 1.11}
 22%|███████▎                         | 3000/13570 [1:41:38<58:46:30, 20.02s/it][INFO|trainer.py:3512] 2024-04-19 17:48:41,771 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 17:48:41,771 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 17:48:41,771 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.55s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.20s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.53s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.73s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.85s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.93s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.99s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.02s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.05s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.07s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.08s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.09s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.09s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.10s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.10s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.10s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.10s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.10s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.11s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.11s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.11s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.11s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.11s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.10s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.11s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.10s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8087329864501953, 'eval_runtime': 87.2094, 'eval_samples_per_second': 1.261, 'eval_steps_per_second': 0.321, 'epoch': 1.11}
 22%|███████▎                         | 3000/13570 [1:43:05<58:46:30, 20.02s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 17:50:08,984 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3000


[INFO|configuration_utils.py:726] 2024-04-19 17:50:09,276 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 17:50:09,278 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 17:50:09,510 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 17:50:09,511 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3000/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 17:50:09,849 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 17:50:09,850 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 17:50:09,855 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1000] due to args.save_total_limit


 22%|███████                         | 3001/13570 [1:43:26<136:14:05, 46.40s/it]

 22%|███████                         | 3002/13570 [1:43:46<112:47:02, 38.42s/it]

 22%|███████▎                         | 3003/13570 [1:44:06<96:47:55, 32.98s/it]

 22%|███████▎                         | 3004/13570 [1:44:26<85:12:56, 29.03s/it]

 22%|███████▎                         | 3005/13570 [1:44:46<76:59:49, 26.24s/it]

 22%|███████▎                         | 3006/13570 [1:45:05<71:22:51, 24.33s/it]

 22%|███████▎                         | 3007/13570 [1:45:25<67:23:00, 22.97s/it]

 22%|███████▎                         | 3008/13570 [1:45:45<64:37:47, 22.03s/it]

 22%|███████▎                         | 3009/13570 [1:46:05<62:34:31, 21.33s/it]

 22%|███████▎                         | 3010/13570 [1:46:25<61:36:09, 21.00s/it]

 22%|███████▎                         | 3011/13570 [1:46:45<60:40:06, 20.68s/it]

 22%|███████▎                         | 3012/13570 [1:47:05<60:11:26, 20.52s/it]

 22%|███████▎                         | 3013/13570 [1:47:25<59:35:28, 20.32s/it]

 22%|███████▎                         | 3014/13570 [1:47:45<59:05:00, 20.15s/it]

 22%|███████▎                         | 3015/13570 [1:48:05<59:00:31, 20.13s/it]

 22%|███████▎                         | 3016/13570 [1:48:25<59:11:54, 20.19s/it]

 22%|███████▎                         | 3017/13570 [1:48:45<58:51:55, 20.08s/it]

 22%|███████▎                         | 3018/13570 [1:49:05<58:29:19, 19.95s/it]

 22%|███████▎                         | 3019/13570 [1:49:24<58:22:07, 19.92s/it]

 22%|███████▎                         | 3020/13570 [1:49:44<58:19:55, 19.90s/it]

 22%|███████▎                         | 3021/13570 [1:50:04<58:13:54, 19.87s/it]

 22%|███████▎                         | 3022/13570 [1:50:24<58:08:28, 19.84s/it]

 22%|███████▎                         | 3023/13570 [1:50:44<58:02:51, 19.81s/it]

 22%|███████▎                         | 3024/13570 [1:51:04<58:16:34, 19.89s/it]

 22%|███████▎                         | 3025/13570 [1:51:24<58:14:46, 19.88s/it]

 22%|███████▎                         | 3026/13570 [1:51:43<58:13:17, 19.88s/it]

 22%|███████▎                         | 3027/13570 [1:52:03<58:08:14, 19.85s/it]

 22%|███████▎                         | 3028/13570 [1:52:23<58:24:57, 19.95s/it]

 22%|███████▎                         | 3029/13570 [1:52:43<58:25:39, 19.95s/it]

 22%|███████▎                         | 3030/13570 [1:53:03<58:11:06, 19.87s/it]

 22%|███████▎                         | 3031/13570 [1:53:23<58:08:13, 19.86s/it]

 22%|███████▎                         | 3032/13570 [1:53:43<58:07:03, 19.85s/it]

 22%|███████▍                         | 3033/13570 [1:54:03<58:04:54, 19.84s/it]

 22%|███████▍                         | 3034/13570 [1:54:22<57:58:24, 19.81s/it]

 22%|███████▍                         | 3035/13570 [1:54:42<58:14:46, 19.90s/it]

 22%|███████▍                         | 3036/13570 [1:55:02<58:13:21, 19.90s/it]

 22%|███████▍                         | 3037/13570 [1:55:22<58:09:38, 19.88s/it]

 22%|███████▍                         | 3038/13570 [1:55:42<57:56:35, 19.81s/it]

 22%|███████▍                         | 3039/13570 [1:56:02<57:57:34, 19.81s/it]

 22%|███████▍                         | 3040/13570 [1:56:21<57:51:15, 19.78s/it]

 22%|███████▍                         | 3041/13570 [1:56:41<57:54:52, 19.80s/it]

 22%|███████▍                         | 3042/13570 [1:57:01<58:03:35, 19.85s/it]

 22%|███████▍                         | 3043/13570 [1:57:21<58:13:49, 19.91s/it]

 22%|███████▍                         | 3044/13570 [1:57:41<57:52:48, 19.80s/it]

 22%|███████▍                         | 3045/13570 [1:58:01<57:56:51, 19.82s/it]

 22%|███████▍                         | 3046/13570 [1:58:21<58:09:31, 19.89s/it]

 22%|███████▍                         | 3047/13570 [1:58:40<57:43:52, 19.75s/it]

 22%|███████▍                         | 3048/13570 [1:59:00<57:33:21, 19.69s/it]

 22%|███████▍                         | 3049/13570 [1:59:20<57:45:50, 19.77s/it]

 22%|███████▍                         | 3050/13570 [1:59:40<57:57:55, 19.84s/it]                                                                                {'loss': 1.6899, 'grad_norm': 0.3951851725578308, 'learning_rate': 0.0003, 'epoch': 1.12}
 22%|███████▍                         | 3050/13570 [1:59:40<57:57:55, 19.84s/it]

 22%|███████▍                         | 3051/13570 [1:59:59<58:01:35, 19.86s/it]

 22%|███████▍                         | 3052/13570 [2:00:20<58:10:07, 19.91s/it]

 22%|███████▍                         | 3053/13570 [2:00:39<58:09:26, 19.91s/it]

 23%|███████▍                         | 3054/13570 [2:00:59<58:12:22, 19.93s/it]

 23%|███████▍                         | 3055/13570 [2:01:19<58:19:46, 19.97s/it]

 23%|███████▍                         | 3056/13570 [2:01:39<58:15:38, 19.95s/it]

 23%|███████▍                         | 3057/13570 [2:01:59<58:09:53, 19.92s/it]

 23%|███████▍                         | 3058/13570 [2:02:19<58:14:02, 19.94s/it]

 23%|███████▍                         | 3059/13570 [2:02:39<58:02:28, 19.88s/it]

 23%|███████▍                         | 3060/13570 [2:02:59<58:02:41, 19.88s/it]

 23%|███████▍                         | 3061/13570 [2:03:19<57:55:53, 19.85s/it]

 23%|███████▍                         | 3062/13570 [2:03:38<57:40:49, 19.76s/it]

 23%|███████▍                         | 3063/13570 [2:03:58<57:38:24, 19.75s/it]

 23%|███████▍                         | 3064/13570 [2:04:18<57:46:08, 19.80s/it]

 23%|███████▍                         | 3065/13570 [2:04:38<57:51:36, 19.83s/it]

 23%|███████▍                         | 3066/13570 [2:04:57<57:45:35, 19.80s/it]

 23%|███████▍                         | 3067/13570 [2:05:17<57:45:26, 19.80s/it]

 23%|███████▍                         | 3068/13570 [2:05:37<57:49:03, 19.82s/it]

 23%|███████▍                         | 3069/13570 [2:05:57<57:41:38, 19.78s/it]

 23%|███████▍                         | 3070/13570 [2:06:17<57:42:23, 19.79s/it]

 23%|███████▍                         | 3071/13570 [2:06:36<57:44:51, 19.80s/it]

 23%|███████▍                         | 3072/13570 [2:06:56<57:53:57, 19.85s/it]

 23%|███████▍                         | 3073/13570 [2:07:16<57:35:15, 19.75s/it]

 23%|███████▍                         | 3074/13570 [2:07:36<57:51:03, 19.84s/it]

 23%|███████▍                         | 3075/13570 [2:07:56<57:40:26, 19.78s/it]

 23%|███████▍                         | 3076/13570 [2:08:15<57:34:42, 19.75s/it]

 23%|███████▍                         | 3077/13570 [2:08:35<57:58:50, 19.89s/it]

 23%|███████▍                         | 3078/13570 [2:08:55<57:30:58, 19.73s/it]

 23%|███████▍                         | 3079/13570 [2:09:15<57:26:59, 19.71s/it]

 23%|███████▍                         | 3080/13570 [2:09:34<57:18:00, 19.66s/it]

 23%|███████▍                         | 3081/13570 [2:09:54<57:35:51, 19.77s/it]

 23%|███████▍                         | 3082/13570 [2:10:14<57:41:36, 19.80s/it]

 23%|███████▍                         | 3083/13570 [2:10:34<57:30:22, 19.74s/it]

 23%|███████▍                         | 3084/13570 [2:10:54<57:49:03, 19.85s/it]

 23%|███████▌                         | 3085/13570 [2:11:14<57:51:26, 19.87s/it]

 23%|███████▌                         | 3086/13570 [2:11:33<57:55:01, 19.89s/it]

 23%|███████▌                         | 3087/13570 [2:11:53<57:51:25, 19.87s/it]

 23%|███████▌                         | 3088/13570 [2:12:13<57:59:22, 19.92s/it]

 23%|███████▌                         | 3089/13570 [2:12:33<57:45:53, 19.84s/it]

 23%|███████▌                         | 3090/13570 [2:12:53<58:02:08, 19.94s/it]

 23%|███████▌                         | 3091/13570 [2:13:13<58:07:58, 19.97s/it]

 23%|███████▌                         | 3092/13570 [2:13:33<58:12:04, 20.00s/it]

 23%|███████▌                         | 3093/13570 [2:13:53<57:53:33, 19.89s/it]

 23%|███████▌                         | 3094/13570 [2:14:12<57:28:31, 19.75s/it]

 23%|███████▌                         | 3095/13570 [2:14:32<57:47:19, 19.86s/it]

 23%|███████▌                         | 3096/13570 [2:14:52<57:53:02, 19.90s/it]

 23%|███████▌                         | 3097/13570 [2:15:12<57:51:13, 19.89s/it]

 23%|███████▌                         | 3098/13570 [2:15:32<57:36:22, 19.80s/it]

 23%|███████▌                         | 3099/13570 [2:15:52<57:35:51, 19.80s/it]

 23%|███████▌                         | 3100/13570 [2:16:11<57:30:54, 19.78s/it]                                                                                {'loss': 1.6902, 'grad_norm': 0.39457303285598755, 'learning_rate': 0.0003, 'epoch': 1.14}
 23%|███████▌                         | 3100/13570 [2:16:11<57:30:54, 19.78s/it][INFO|trainer.py:3512] 2024-04-19 18:23:15,164 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 18:23:15,164 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 18:23:15,164 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.55s/it][A


 11%|████▋                                       | 3/28 [00:06<00:55,  2.20s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.54s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.74s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.86s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.94s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.99s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.03s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.05s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.07s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.08s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.09s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.09s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.10s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.10s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.10s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.10s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.10s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.10s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.10s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.10s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.10s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.10s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.10s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.10s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.10s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.13s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8099159002304077, 'eval_runtime': 87.3072, 'eval_samples_per_second': 1.26, 'eval_steps_per_second': 0.321, 'epoch': 1.14}
 23%|███████▌                         | 3100/13570 [2:17:39<57:30:54, 19.78s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.13s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 18:24:42,474 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3100


[INFO|configuration_utils.py:726] 2024-04-19 18:24:42,731 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 18:24:42,732 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 18:24:42,932 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3100/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 18:24:42,932 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3100/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 18:24:43,270 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 18:24:43,271 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 18:24:43,275 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1100] due to args.save_total_limit


 23%|███████▎                        | 3101/13570 [2:17:59<134:25:41, 46.23s/it]

 23%|███████▎                        | 3102/13570 [2:18:19<111:09:09, 38.23s/it]

 23%|███████▌                         | 3103/13570 [2:18:39<95:04:41, 32.70s/it]

 23%|███████▌                         | 3104/13570 [2:18:58<83:44:34, 28.81s/it]

 23%|███████▌                         | 3105/13570 [2:19:18<75:41:52, 26.04s/it]

 23%|███████▌                         | 3106/13570 [2:19:38<70:35:24, 24.29s/it]

 23%|███████▌                         | 3107/13570 [2:19:58<66:43:54, 22.96s/it]

 23%|███████▌                         | 3108/13570 [2:20:18<63:55:32, 22.00s/it]

 23%|███████▌                         | 3109/13570 [2:20:38<62:12:20, 21.41s/it]

 23%|███████▌                         | 3110/13570 [2:20:58<60:47:54, 20.92s/it]

 23%|███████▌                         | 3111/13570 [2:21:17<59:39:27, 20.53s/it]

 23%|███████▌                         | 3112/13570 [2:21:37<58:55:09, 20.28s/it]

 23%|███████▌                         | 3113/13570 [2:21:56<58:07:07, 20.01s/it]

 23%|███████▌                         | 3114/13570 [2:22:16<58:12:44, 20.04s/it]

 23%|███████▌                         | 3115/13570 [2:22:36<58:05:31, 20.00s/it]

 23%|███████▌                         | 3116/13570 [2:22:56<58:06:37, 20.01s/it]

 23%|███████▌                         | 3117/13570 [2:23:16<57:48:07, 19.91s/it]

 23%|███████▌                         | 3118/13570 [2:23:36<57:41:50, 19.87s/it]

 23%|███████▌                         | 3119/13570 [2:23:55<57:19:01, 19.74s/it]

 23%|███████▌                         | 3120/13570 [2:24:15<57:32:16, 19.82s/it]

 23%|███████▌                         | 3121/13570 [2:24:35<57:22:36, 19.77s/it]

 23%|███████▌                         | 3122/13570 [2:24:55<57:22:38, 19.77s/it]

 23%|███████▌                         | 3123/13570 [2:25:15<57:24:16, 19.78s/it]

 23%|███████▌                         | 3124/13570 [2:25:34<57:15:21, 19.73s/it]

 23%|███████▌                         | 3125/13570 [2:25:54<57:28:58, 19.81s/it]

 23%|███████▌                         | 3126/13570 [2:26:14<57:23:46, 19.78s/it]

 23%|███████▌                         | 3127/13570 [2:26:34<57:19:24, 19.76s/it]

 23%|███████▌                         | 3128/13570 [2:26:53<57:07:51, 19.70s/it]

 23%|███████▌                         | 3129/13570 [2:27:13<57:08:54, 19.70s/it]

 23%|███████▌                         | 3130/13570 [2:27:33<57:29:31, 19.82s/it]

 23%|███████▌                         | 3131/13570 [2:27:53<57:22:21, 19.79s/it]

 23%|███████▌                         | 3132/13570 [2:28:12<57:21:28, 19.78s/it]

 23%|███████▌                         | 3133/13570 [2:28:32<57:11:27, 19.73s/it]

 23%|███████▌                         | 3134/13570 [2:28:52<57:22:00, 19.79s/it]

 23%|███████▌                         | 3135/13570 [2:29:12<57:16:16, 19.76s/it]

 23%|███████▋                         | 3136/13570 [2:29:31<57:11:27, 19.73s/it]

 23%|███████▋                         | 3137/13570 [2:29:51<56:52:43, 19.63s/it]

 23%|███████▋                         | 3138/13570 [2:30:10<56:55:10, 19.64s/it]

 23%|███████▋                         | 3139/13570 [2:30:30<56:57:49, 19.66s/it]

 23%|███████▋                         | 3140/13570 [2:30:50<56:44:39, 19.59s/it]

 23%|███████▋                         | 3141/13570 [2:31:09<57:00:56, 19.68s/it]

 23%|███████▋                         | 3142/13570 [2:31:29<57:16:51, 19.77s/it]

 23%|███████▋                         | 3143/13570 [2:31:49<57:24:00, 19.82s/it]

 23%|███████▋                         | 3144/13570 [2:32:09<57:03:36, 19.70s/it]

 23%|███████▋                         | 3145/13570 [2:32:28<56:56:25, 19.66s/it]

 23%|███████▋                         | 3146/13570 [2:32:48<56:34:13, 19.54s/it]

 23%|███████▋                         | 3147/13570 [2:33:07<56:51:53, 19.64s/it]

 23%|███████▋                         | 3148/13570 [2:33:27<56:58:01, 19.68s/it]

 23%|███████▋                         | 3149/13570 [2:33:47<56:49:56, 19.63s/it]

 23%|███████▋                         | 3150/13570 [2:34:06<56:51:18, 19.64s/it]                                                                                {'loss': 1.6693, 'grad_norm': 0.3839050829410553, 'learning_rate': 0.0003, 'epoch': 1.16}
 23%|███████▋                         | 3150/13570 [2:34:06<56:51:18, 19.64s/it]

 23%|███████▋                         | 3151/13570 [2:34:26<56:57:05, 19.68s/it]

 23%|███████▋                         | 3152/13570 [2:34:46<57:15:13, 19.78s/it]

 23%|███████▋                         | 3153/13570 [2:35:06<57:19:39, 19.81s/it]

 23%|███████▋                         | 3154/13570 [2:35:26<57:15:15, 19.79s/it]

 23%|███████▋                         | 3155/13570 [2:35:46<57:13:05, 19.78s/it]

 23%|███████▋                         | 3156/13570 [2:36:06<57:21:45, 19.83s/it]

 23%|███████▋                         | 3157/13570 [2:36:25<57:09:18, 19.76s/it]

 23%|███████▋                         | 3158/13570 [2:36:45<57:27:04, 19.86s/it]

 23%|███████▋                         | 3159/13570 [2:37:05<57:10:21, 19.77s/it]

 23%|███████▋                         | 3160/13570 [2:37:25<57:26:56, 19.87s/it]

 23%|███████▋                         | 3161/13570 [2:37:45<57:19:27, 19.83s/it]

 23%|███████▋                         | 3162/13570 [2:38:04<57:10:30, 19.78s/it]

 23%|███████▋                         | 3163/13570 [2:38:24<56:58:06, 19.71s/it]

 23%|███████▋                         | 3164/13570 [2:38:44<57:02:21, 19.73s/it]

 23%|███████▋                         | 3165/13570 [2:39:04<57:24:06, 19.86s/it]

 23%|███████▋                         | 3166/13570 [2:39:24<57:33:13, 19.91s/it]

 23%|███████▋                         | 3167/13570 [2:39:44<57:45:56, 19.99s/it]

 23%|███████▋                         | 3168/13570 [2:40:04<57:38:36, 19.95s/it]

 23%|███████▋                         | 3169/13570 [2:40:24<57:49:01, 20.01s/it]

 23%|███████▋                         | 3170/13570 [2:40:44<57:30:55, 19.91s/it]

 23%|███████▋                         | 3171/13570 [2:41:03<57:22:56, 19.86s/it]

 23%|███████▋                         | 3172/13570 [2:41:23<57:31:20, 19.92s/it]

 23%|███████▋                         | 3173/13570 [2:41:43<57:18:35, 19.84s/it]

 23%|███████▋                         | 3174/13570 [2:42:03<57:22:31, 19.87s/it]

 23%|███████▋                         | 3175/13570 [2:42:23<57:06:48, 19.78s/it]

 23%|███████▋                         | 3176/13570 [2:42:42<56:50:33, 19.69s/it]

 23%|███████▋                         | 3177/13570 [2:43:02<57:20:31, 19.86s/it]

 23%|███████▋                         | 3178/13570 [2:43:22<57:08:29, 19.79s/it]

 23%|███████▋                         | 3179/13570 [2:43:42<57:18:37, 19.86s/it]

 23%|███████▋                         | 3180/13570 [2:44:01<56:58:05, 19.74s/it]

 23%|███████▋                         | 3181/13570 [2:44:21<57:10:50, 19.81s/it]

 23%|███████▋                         | 3182/13570 [2:44:42<57:23:21, 19.89s/it]

 23%|███████▋                         | 3183/13570 [2:45:01<57:18:19, 19.86s/it]

 23%|███████▋                         | 3184/13570 [2:45:21<57:15:07, 19.84s/it]

 23%|███████▋                         | 3185/13570 [2:45:41<57:19:48, 19.87s/it]

 23%|███████▋                         | 3186/13570 [2:46:01<57:31:42, 19.94s/it]

 23%|███████▊                         | 3187/13570 [2:46:21<57:29:09, 19.93s/it]

 23%|███████▊                         | 3188/13570 [2:46:41<57:29:06, 19.93s/it]

 24%|███████▊                         | 3189/13570 [2:47:01<57:24:39, 19.91s/it]

 24%|███████▊                         | 3190/13570 [2:47:21<57:18:00, 19.87s/it]

 24%|███████▊                         | 3191/13570 [2:47:40<57:09:09, 19.82s/it]

 24%|███████▊                         | 3192/13570 [2:48:00<57:12:08, 19.84s/it]

 24%|███████▊                         | 3193/13570 [2:48:20<57:02:53, 19.79s/it]

 24%|███████▊                         | 3194/13570 [2:48:40<57:10:58, 19.84s/it]

 24%|███████▊                         | 3195/13570 [2:49:00<57:20:48, 19.90s/it]

 24%|███████▊                         | 3196/13570 [2:49:20<57:22:48, 19.91s/it]

 24%|███████▊                         | 3197/13570 [2:49:40<57:37:56, 20.00s/it]

 24%|███████▊                         | 3198/13570 [2:50:00<57:34:27, 19.98s/it]

 24%|███████▊                         | 3199/13570 [2:50:20<57:44:11, 20.04s/it]

 24%|███████▊                         | 3200/13570 [2:50:40<57:56:16, 20.11s/it]                                                                                {'loss': 1.7187, 'grad_norm': 0.3278166949748993, 'learning_rate': 0.0003, 'epoch': 1.18}
 24%|███████▊                         | 3200/13570 [2:50:40<57:56:16, 20.11s/it][INFO|trainer.py:3512] 2024-04-19 18:57:44,187 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 18:57:44,188 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 18:57:44,188 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.56s/it][A


 11%|████▋                                       | 3/28 [00:06<00:55,  2.21s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:01,  2.54s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:03,  2.74s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:03,  2.87s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.94s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  3.00s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.03s/it][A


 36%|███████████████▎                           | 10/28 [00:28<00:55,  3.06s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.08s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.09s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.10s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.10s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.11s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.11s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.11s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.11s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:56<00:27,  3.11s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.11s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.11s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.12s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.12s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.12s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.12s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.12s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.11s/it][A


100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.14s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8070600032806396, 'eval_runtime': 87.4786, 'eval_samples_per_second': 1.257, 'eval_steps_per_second': 0.32, 'epoch': 1.18}
 24%|███████▊                         | 3200/13570 [2:52:08<57:56:16, 20.11s/it]
100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.14s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 18:59:11,669 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3200


[INFO|configuration_utils.py:726] 2024-04-19 18:59:11,934 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 18:59:11,936 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 18:59:12,145 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3200/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 18:59:12,146 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3200/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 18:59:12,481 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 18:59:12,482 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 18:59:12,487 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1200] due to args.save_total_limit


 24%|███████▌                        | 3201/13570 [2:52:29<134:13:47, 46.60s/it]

 24%|███████▌                        | 3202/13570 [2:52:49<111:12:17, 38.61s/it]

 24%|███████▊                         | 3203/13570 [2:53:09<94:57:15, 32.97s/it]

 24%|███████▊                         | 3204/13570 [2:53:28<83:20:08, 28.94s/it]

 24%|███████▊                         | 3205/13570 [2:53:48<75:09:48, 26.11s/it]

 24%|███████▊                         | 3206/13570 [2:54:08<69:55:53, 24.29s/it]

 24%|███████▊                         | 3207/13570 [2:54:27<65:57:14, 22.91s/it]

 24%|███████▊                         | 3208/13570 [2:54:48<63:32:44, 22.08s/it]

 24%|███████▊                         | 3209/13570 [2:55:08<61:43:18, 21.45s/it]

 24%|███████▊                         | 3210/13570 [2:55:27<60:22:54, 20.98s/it]

 24%|███████▊                         | 3211/13570 [2:55:47<59:28:41, 20.67s/it]

 24%|███████▊                         | 3212/13570 [2:56:07<58:38:27, 20.38s/it]

 24%|███████▊                         | 3213/13570 [2:56:27<58:23:28, 20.30s/it]

 24%|███████▊                         | 3214/13570 [2:56:47<57:56:30, 20.14s/it]

 24%|███████▊                         | 3215/13570 [2:57:07<57:59:52, 20.16s/it]

 24%|███████▊                         | 3216/13570 [2:57:27<57:57:56, 20.15s/it]

 24%|███████▊                         | 3217/13570 [2:57:47<57:34:25, 20.02s/it]

 24%|███████▊                         | 3218/13570 [2:58:07<57:25:36, 19.97s/it]

 24%|███████▊                         | 3219/13570 [2:58:27<57:13:07, 19.90s/it]

 24%|███████▊                         | 3220/13570 [2:58:47<57:15:36, 19.92s/it]

 24%|███████▊                         | 3221/13570 [2:59:07<57:22:02, 19.96s/it]

 24%|███████▊                         | 3222/13570 [2:59:27<57:25:12, 19.98s/it]

 24%|███████▊                         | 3223/13570 [2:59:46<57:17:41, 19.93s/it]

 24%|███████▊                         | 3224/13570 [3:00:06<56:54:03, 19.80s/it]

 24%|███████▊                         | 3225/13570 [3:00:26<57:17:29, 19.94s/it]

 24%|███████▊                         | 3226/13570 [3:00:46<57:05:53, 19.87s/it]

 24%|███████▊                         | 3227/13570 [3:01:06<57:10:29, 19.90s/it]

 24%|███████▊                         | 3228/13570 [3:01:26<57:06:28, 19.88s/it]

 24%|███████▊                         | 3229/13570 [3:01:46<57:07:44, 19.89s/it]

 24%|███████▊                         | 3230/13570 [3:02:06<57:11:56, 19.91s/it]

 24%|███████▊                         | 3231/13570 [3:02:25<56:55:56, 19.82s/it]

 24%|███████▊                         | 3232/13570 [3:02:45<57:14:06, 19.93s/it]

 24%|███████▊                         | 3233/13570 [3:03:05<56:56:32, 19.83s/it]

 24%|███████▊                         | 3234/13570 [3:03:25<57:06:18, 19.89s/it]

 24%|███████▊                         | 3235/13570 [3:03:45<56:54:53, 19.83s/it]

 24%|███████▊                         | 3236/13570 [3:04:04<56:43:26, 19.76s/it]

 24%|███████▊                         | 3237/13570 [3:04:25<57:09:26, 19.91s/it]

 24%|███████▊                         | 3238/13570 [3:04:45<57:18:14, 19.97s/it]

 24%|███████▉                         | 3239/13570 [3:05:05<57:13:41, 19.94s/it]

 24%|███████▉                         | 3240/13570 [3:05:24<57:03:54, 19.89s/it]

 24%|███████▉                         | 3241/13570 [3:05:44<57:13:03, 19.94s/it]

 24%|███████▉                         | 3242/13570 [3:06:04<57:11:07, 19.93s/it]

 24%|███████▉                         | 3243/13570 [3:06:24<56:58:40, 19.86s/it]

 24%|███████▉                         | 3244/13570 [3:06:44<57:04:23, 19.90s/it]

 24%|███████▉                         | 3245/13570 [3:07:04<56:58:10, 19.86s/it]

 24%|███████▉                         | 3246/13570 [3:07:24<57:03:38, 19.90s/it]

 24%|███████▉                         | 3247/13570 [3:07:43<56:39:48, 19.76s/it]

 24%|███████▉                         | 3248/13570 [3:08:03<56:45:20, 19.79s/it]

 24%|███████▉                         | 3249/13570 [3:08:23<57:00:04, 19.88s/it]

 24%|███████▉                         | 3250/13570 [3:08:43<56:50:49, 19.83s/it]                                                                                {'loss': 1.7017, 'grad_norm': 0.32497861981391907, 'learning_rate': 0.0003, 'epoch': 1.2}
 24%|███████▉                         | 3250/13570 [3:08:43<56:50:49, 19.83s/it]

 24%|███████▉                         | 3251/13570 [3:09:02<56:39:39, 19.77s/it]

 24%|███████▉                         | 3252/13570 [3:09:22<56:36:23, 19.75s/it]

 24%|███████▉                         | 3253/13570 [3:09:42<56:18:29, 19.65s/it]

 24%|███████▉                         | 3254/13570 [3:10:01<56:29:07, 19.71s/it]

 24%|███████▉                         | 3255/13570 [3:10:21<56:36:58, 19.76s/it]

 24%|███████▉                         | 3256/13570 [3:10:41<56:52:07, 19.85s/it]

 24%|███████▉                         | 3257/13570 [3:11:01<56:52:06, 19.85s/it]

 24%|███████▉                         | 3258/13570 [3:11:21<57:07:14, 19.94s/it]

 24%|███████▉                         | 3259/13570 [3:11:41<56:57:38, 19.89s/it]

 24%|███████▉                         | 3260/13570 [3:12:01<56:43:55, 19.81s/it]

 24%|███████▉                         | 3261/13570 [3:12:21<56:50:42, 19.85s/it]

 24%|███████▉                         | 3262/13570 [3:12:40<56:34:19, 19.76s/it]

 24%|███████▉                         | 3263/13570 [3:13:00<56:24:56, 19.70s/it]

 24%|███████▉                         | 3264/13570 [3:13:20<56:50:13, 19.85s/it]

 24%|███████▉                         | 3265/13570 [3:13:40<56:52:57, 19.87s/it]

 24%|███████▉                         | 3266/13570 [3:14:00<56:39:36, 19.80s/it]

 24%|███████▉                         | 3267/13570 [3:14:19<56:23:39, 19.70s/it]

 24%|███████▉                         | 3268/13570 [3:14:39<56:30:34, 19.75s/it]

 24%|███████▉                         | 3269/13570 [3:14:59<56:41:05, 19.81s/it]

 24%|███████▉                         | 3270/13570 [3:15:18<56:30:17, 19.75s/it]

 24%|███████▉                         | 3271/13570 [3:15:38<56:27:34, 19.74s/it]

 24%|███████▉                         | 3272/13570 [3:15:58<56:37:52, 19.80s/it]

 24%|███████▉                         | 3273/13570 [3:16:18<56:28:02, 19.74s/it]

 24%|███████▉                         | 3274/13570 [3:16:37<56:28:12, 19.74s/it]

 24%|███████▉                         | 3275/13570 [3:16:57<56:40:33, 19.82s/it]

 24%|███████▉                         | 3276/13570 [3:17:17<56:39:25, 19.81s/it]

 24%|███████▉                         | 3277/13570 [3:17:37<56:51:46, 19.89s/it]

 24%|███████▉                         | 3278/13570 [3:17:58<57:07:33, 19.98s/it]

 24%|███████▉                         | 3279/13570 [3:18:18<57:06:41, 19.98s/it]

 24%|███████▉                         | 3280/13570 [3:18:37<56:50:35, 19.89s/it]

 24%|███████▉                         | 3281/13570 [3:18:57<56:52:29, 19.90s/it]

 24%|███████▉                         | 3282/13570 [3:19:17<56:50:02, 19.89s/it]

 24%|███████▉                         | 3283/13570 [3:19:37<56:52:20, 19.90s/it]

 24%|███████▉                         | 3284/13570 [3:19:56<56:30:04, 19.77s/it]

 24%|███████▉                         | 3285/13570 [3:20:16<56:24:28, 19.74s/it]

 24%|███████▉                         | 3286/13570 [3:20:36<56:34:43, 19.81s/it]

 24%|███████▉                         | 3287/13570 [3:20:56<56:28:54, 19.77s/it]

 24%|███████▉                         | 3288/13570 [3:21:15<56:27:38, 19.77s/it]

 24%|███████▉                         | 3289/13570 [3:21:36<56:49:40, 19.90s/it]

 24%|████████                         | 3290/13570 [3:21:56<56:49:29, 19.90s/it]

 24%|████████                         | 3291/13570 [3:22:16<56:52:14, 19.92s/it]

 24%|████████                         | 3292/13570 [3:22:35<56:46:16, 19.88s/it]

 24%|████████                         | 3293/13570 [3:22:55<56:33:47, 19.81s/it]

 24%|████████                         | 3294/13570 [3:23:15<56:23:45, 19.76s/it]

 24%|████████                         | 3295/13570 [3:23:35<56:44:27, 19.88s/it]

 24%|████████                         | 3296/13570 [3:23:55<56:49:21, 19.91s/it]

 24%|████████                         | 3297/13570 [3:24:15<56:51:07, 19.92s/it]

 24%|████████                         | 3298/13570 [3:24:35<56:52:37, 19.93s/it]

 24%|████████                         | 3299/13570 [3:24:54<56:33:43, 19.83s/it]

 24%|████████                         | 3300/13570 [3:25:14<56:38:05, 19.85s/it]                                                                                {'loss': 1.6697, 'grad_norm': 0.4891209006309509, 'learning_rate': 0.0003, 'epoch': 1.22}
 24%|████████                         | 3300/13570 [3:25:14<56:38:05, 19.85s/it][INFO|trainer.py:3512] 2024-04-19 19:32:17,893 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 19:32:17,893 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 19:32:17,893 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.55s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.20s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.54s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.74s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.86s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.94s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.99s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.03s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.05s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.07s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.08s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.09s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.10s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.10s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.10s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.11s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.11s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.11s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.11s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.11s/it][A




 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.11s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.11s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.10s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.10s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.10s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.10s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8099867105484009, 'eval_runtime': 87.3057, 'eval_samples_per_second': 1.26, 'eval_steps_per_second': 0.321, 'epoch': 1.22}
 24%|████████                         | 3300/13570 [3:26:41<56:38:05, 19.85s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 19:33:45,201 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3300


[INFO|configuration_utils.py:726] 2024-04-19 19:33:45,630 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 19:33:45,632 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 19:33:45,833 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3300/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 19:33:45,833 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3300/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 19:33:46,174 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 19:33:46,174 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 19:33:46,178 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1300] due to args.save_total_limit


 24%|███████▊                        | 3301/13570 [3:27:02<131:54:09, 46.24s/it]

 24%|███████▊                        | 3302/13570 [3:27:22<109:15:27, 38.31s/it]

 24%|████████                         | 3303/13570 [3:27:41<93:17:47, 32.71s/it]

 24%|████████                         | 3304/13570 [3:28:02<82:28:50, 28.92s/it]

 24%|████████                         | 3305/13570 [3:28:21<74:35:30, 26.16s/it]

 24%|████████                         | 3306/13570 [3:28:41<69:31:50, 24.39s/it]

 24%|████████                         | 3307/13570 [3:29:01<65:32:22, 22.99s/it]

 24%|████████                         | 3308/13570 [3:29:21<62:51:23, 22.05s/it]

 24%|████████                         | 3309/13570 [3:29:41<60:57:56, 21.39s/it]

 24%|████████                         | 3310/13570 [3:30:00<59:19:47, 20.82s/it]

 24%|████████                         | 3311/13570 [3:30:20<58:16:42, 20.45s/it]

 24%|████████                         | 3312/13570 [3:30:40<57:52:29, 20.31s/it]

 24%|████████                         | 3313/13570 [3:31:00<57:39:01, 20.23s/it]

 24%|████████                         | 3314/13570 [3:31:20<57:36:56, 20.22s/it]

 24%|████████                         | 3315/13570 [3:31:40<57:10:35, 20.07s/it]

 24%|████████                         | 3316/13570 [3:32:00<57:04:57, 20.04s/it]

 24%|████████                         | 3317/13570 [3:32:20<57:01:44, 20.02s/it]

 24%|████████                         | 3318/13570 [3:32:40<56:41:15, 19.91s/it]

 24%|████████                         | 3319/13570 [3:33:00<56:49:30, 19.96s/it]

 24%|████████                         | 3320/13570 [3:33:19<56:35:43, 19.88s/it]

 24%|████████                         | 3321/13570 [3:33:39<56:35:17, 19.88s/it]

 24%|████████                         | 3322/13570 [3:33:59<56:36:06, 19.88s/it]

 24%|████████                         | 3323/13570 [3:34:19<56:30:42, 19.85s/it]

 24%|████████                         | 3324/13570 [3:34:39<56:28:55, 19.85s/it]

 25%|████████                         | 3325/13570 [3:34:59<56:35:09, 19.88s/it]

 25%|████████                         | 3326/13570 [3:35:19<56:34:36, 19.88s/it]

 25%|████████                         | 3327/13570 [3:35:39<56:40:09, 19.92s/it]

 25%|████████                         | 3328/13570 [3:35:58<56:18:29, 19.79s/it]

 25%|████████                         | 3329/13570 [3:36:18<56:22:21, 19.82s/it]

 25%|████████                         | 3330/13570 [3:36:37<56:09:31, 19.74s/it]

 25%|████████                         | 3331/13570 [3:36:57<55:59:51, 19.69s/it]

 25%|████████                         | 3332/13570 [3:37:17<56:23:07, 19.83s/it]

 25%|████████                         | 3333/13570 [3:37:37<56:06:07, 19.73s/it]

 25%|████████                         | 3334/13570 [3:37:57<56:13:35, 19.77s/it]

 25%|████████                         | 3335/13570 [3:38:16<55:59:15, 19.69s/it]

 25%|████████                         | 3336/13570 [3:38:36<55:50:35, 19.64s/it]

 25%|████████                         | 3337/13570 [3:38:55<55:45:52, 19.62s/it]

 25%|████████                         | 3338/13570 [3:39:15<56:08:10, 19.75s/it]

 25%|████████                         | 3339/13570 [3:39:35<56:01:56, 19.72s/it]

 25%|████████                         | 3340/13570 [3:39:55<56:16:44, 19.80s/it]

 25%|████████                         | 3341/13570 [3:40:15<56:19:37, 19.82s/it]

 25%|████████▏                        | 3342/13570 [3:40:35<56:26:00, 19.86s/it]

 25%|████████▏                        | 3343/13570 [3:40:54<56:11:57, 19.78s/it]

 25%|████████▏                        | 3344/13570 [3:41:14<56:06:47, 19.75s/it]

 25%|████████▏                        | 3345/13570 [3:41:34<56:09:21, 19.77s/it]

 25%|████████▏                        | 3346/13570 [3:41:53<55:59:05, 19.71s/it]

 25%|████████▏                        | 3347/13570 [3:42:13<55:49:17, 19.66s/it]

 25%|████████▏                        | 3348/13570 [3:42:32<55:43:27, 19.63s/it]

 25%|████████▏                        | 3349/13570 [3:42:52<55:38:15, 19.60s/it]

 25%|████████▏                        | 3350/13570 [3:43:12<56:07:39, 19.77s/it]                                                                                {'loss': 1.6602, 'grad_norm': 0.3882026672363281, 'learning_rate': 0.0003, 'epoch': 1.23}
 25%|████████▏                        | 3350/13570 [3:43:12<56:07:39, 19.77s/it]

 25%|████████▏                        | 3351/13570 [3:43:32<55:56:54, 19.71s/it]

 25%|████████▏                        | 3352/13570 [3:43:51<55:59:50, 19.73s/it]

 25%|████████▏                        | 3353/13570 [3:44:11<56:06:00, 19.77s/it]

 25%|████████▏                        | 3354/13570 [3:44:31<56:03:27, 19.75s/it]

 25%|████████▏                        | 3355/13570 [3:44:51<55:49:35, 19.67s/it]

 25%|████████▏                        | 3356/13570 [3:45:10<55:54:24, 19.70s/it]

 25%|████████▏                        | 3357/13570 [3:45:30<55:47:31, 19.67s/it]

 25%|████████▏                        | 3358/13570 [3:45:50<55:48:11, 19.67s/it]

 25%|████████▏                        | 3359/13570 [3:46:10<56:00:56, 19.75s/it]

 25%|████████▏                        | 3360/13570 [3:46:29<56:08:30, 19.80s/it]

 25%|████████▏                        | 3361/13570 [3:46:49<56:07:58, 19.79s/it]

 25%|████████▏                        | 3362/13570 [3:47:09<56:11:28, 19.82s/it]

 25%|████████▏                        | 3363/13570 [3:47:29<55:55:36, 19.73s/it]

 25%|████████▏                        | 3364/13570 [3:47:48<55:42:11, 19.65s/it]

 25%|████████▏                        | 3365/13570 [3:48:08<56:10:19, 19.82s/it]

 25%|████████▏                        | 3366/13570 [3:48:29<56:32:32, 19.95s/it]

 25%|████████▏                        | 3367/13570 [3:48:48<56:31:25, 19.94s/it]

 25%|████████▏                        | 3368/13570 [3:49:08<56:21:51, 19.89s/it]

 25%|████████▏                        | 3369/13570 [3:49:28<56:03:51, 19.79s/it]

 25%|████████▏                        | 3370/13570 [3:49:48<56:07:06, 19.81s/it]

 25%|████████▏                        | 3371/13570 [3:50:07<56:03:23, 19.79s/it]

 25%|████████▏                        | 3372/13570 [3:50:27<56:15:28, 19.86s/it]

 25%|████████▏                        | 3373/13570 [3:50:47<56:04:49, 19.80s/it]

 25%|████████▏                        | 3374/13570 [3:51:07<56:16:01, 19.87s/it]

 25%|████████▏                        | 3375/13570 [3:51:27<56:35:47, 19.99s/it]

 25%|████████▏                        | 3376/13570 [3:51:47<56:32:43, 19.97s/it]

 25%|████████▏                        | 3377/13570 [3:52:07<56:30:31, 19.96s/it]

 25%|████████▏                        | 3378/13570 [3:52:27<56:32:01, 19.97s/it]

 25%|████████▏                        | 3379/13570 [3:52:47<56:03:46, 19.80s/it]

 25%|████████▏                        | 3380/13570 [3:53:07<56:18:10, 19.89s/it]

 25%|████████▏                        | 3381/13570 [3:53:27<56:17:17, 19.89s/it]

 25%|████████▏                        | 3382/13570 [3:53:47<56:29:05, 19.96s/it]

 25%|████████▏                        | 3383/13570 [3:54:07<56:23:05, 19.93s/it]

 25%|████████▏                        | 3384/13570 [3:54:27<56:36:39, 20.01s/it]

 25%|████████▏                        | 3385/13570 [3:54:47<56:29:23, 19.97s/it]

 25%|████████▏                        | 3386/13570 [3:55:07<56:26:30, 19.95s/it]

 25%|████████▏                        | 3387/13570 [3:55:26<56:04:57, 19.83s/it]

 25%|████████▏                        | 3388/13570 [3:55:46<55:57:13, 19.78s/it]

 25%|████████▏                        | 3389/13570 [3:56:05<55:48:04, 19.73s/it]

 25%|████████▏                        | 3390/13570 [3:56:25<55:56:39, 19.78s/it]

 25%|████████▏                        | 3391/13570 [3:56:45<55:38:27, 19.68s/it]

 25%|████████▏                        | 3392/13570 [3:57:05<55:44:15, 19.71s/it]

 25%|████████▎                        | 3393/13570 [3:57:24<55:46:02, 19.73s/it]

 25%|████████▎                        | 3394/13570 [3:57:44<55:49:03, 19.75s/it]

 25%|████████▎                        | 3395/13570 [3:58:04<55:49:02, 19.75s/it]

 25%|████████▎                        | 3396/13570 [3:58:24<55:56:54, 19.80s/it]

 25%|████████▎                        | 3397/13570 [3:58:44<56:11:29, 19.88s/it]

 25%|████████▎                        | 3398/13570 [3:59:04<56:24:09, 19.96s/it]

 25%|████████▎                        | 3399/13570 [3:59:24<56:25:00, 19.97s/it]

 25%|████████▎                        | 3400/13570 [3:59:44<56:04:46, 19.85s/it]

                                                                                {'loss': 1.7002, 'grad_norm': 0.33574825525283813, 'learning_rate': 0.0003, 'epoch': 1.25}
 25%|████████▎                        | 3400/13570 [3:59:44<56:04:46, 19.85s/it][INFO|trainer.py:3512] 2024-04-19 20:06:47,252 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 20:06:47,252 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 20:06:47,252 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.56s/it][A


 11%|████▋                                       | 3/28 [00:06<00:55,  2.21s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:01,  2.55s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:03,  2.75s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:03,  2.88s/it][A


 25%|███████████                                 | 7/28 [00:18<01:02,  2.96s/it][A


 29%|████████████▌                               | 8/28 [00:21<01:00,  3.01s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.05s/it][A


 36%|███████████████▎                           | 10/28 [00:28<00:55,  3.07s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.08s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.09s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.10s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.11s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.11s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.11s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.11s/it][A


 64%|███████████████████████████▋               | 18/28 [00:53<00:31,  3.11s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:56<00:27,  3.11s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.11s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.11s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.11s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.11s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.11s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.11s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.11s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:21<00:03,  3.11s/it][A


100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.13s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8081092834472656, 'eval_runtime': 87.5877, 'eval_samples_per_second': 1.256, 'eval_steps_per_second': 0.32, 'epoch': 1.25}
 25%|████████▎                        | 3400/13570 [4:01:11<56:04:46, 19.85s/it]
100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.13s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 20:08:14,843 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3400


[INFO|configuration_utils.py:726] 2024-04-19 20:08:15,091 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 20:08:15,093 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 20:08:15,292 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3400/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 20:08:15,292 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3400/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 20:08:15,672 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 20:08:15,672 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 20:08:15,677 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1400] due to args.save_total_limit


 25%|████████                        | 3401/13570 [4:01:32<130:59:12, 46.37s/it]

 25%|████████                        | 3402/13570 [4:01:52<108:33:32, 38.44s/it]

 25%|████████▎                        | 3403/13570 [4:02:12<93:09:12, 32.98s/it]

 25%|████████▎                        | 3404/13570 [4:02:32<82:04:25, 29.06s/it]

 25%|████████▎                        | 3405/13570 [4:02:52<74:08:04, 26.26s/it]

 25%|████████▎                        | 3406/13570 [4:03:12<68:48:23, 24.37s/it]

 25%|████████▎                        | 3407/13570 [4:03:32<65:04:38, 23.05s/it]

 25%|████████▎                        | 3408/13570 [4:03:52<62:29:37, 22.14s/it]

 25%|████████▎                        | 3409/13570 [4:04:12<60:42:32, 21.51s/it]

 25%|████████▎                        | 3410/13570 [4:04:31<59:17:53, 21.01s/it]

 25%|████████▎                        | 3411/13570 [4:04:52<58:36:20, 20.77s/it]

 25%|████████▎                        | 3412/13570 [4:05:11<57:34:01, 20.40s/it]

 25%|████████▎                        | 3413/13570 [4:05:31<57:04:41, 20.23s/it]

 25%|████████▎                        | 3414/13570 [4:05:51<56:45:53, 20.12s/it]

 25%|████████▎                        | 3415/13570 [4:06:11<56:24:56, 20.00s/it]

 25%|████████▎                        | 3416/13570 [4:06:30<56:13:49, 19.94s/it]

 25%|████████▎                        | 3417/13570 [4:06:50<55:57:57, 19.84s/it]

 25%|████████▎                        | 3418/13570 [4:07:10<55:56:28, 19.84s/it]

 25%|████████▎                        | 3419/13570 [4:07:29<55:32:49, 19.70s/it]

 25%|████████▎                        | 3420/13570 [4:07:49<55:45:54, 19.78s/it]

 25%|████████▎                        | 3421/13570 [4:08:09<55:30:02, 19.69s/it]

 25%|████████▎                        | 3422/13570 [4:08:29<55:45:46, 19.78s/it]

 25%|████████▎                        | 3423/13570 [4:08:49<55:51:38, 19.82s/it]

 25%|████████▎                        | 3424/13570 [4:09:08<55:44:20, 19.78s/it]

 25%|████████▎                        | 3425/13570 [4:09:28<55:53:30, 19.83s/it]

 25%|████████▎                        | 3426/13570 [4:09:48<55:38:45, 19.75s/it]

 25%|████████▎                        | 3427/13570 [4:10:07<55:35:00, 19.73s/it]

 25%|████████▎                        | 3428/13570 [4:10:27<55:33:12, 19.72s/it]

 25%|████████▎                        | 3429/13570 [4:10:47<55:37:16, 19.75s/it]

 25%|████████▎                        | 3430/13570 [4:11:06<55:17:13, 19.63s/it]

 25%|████████▎                        | 3431/13570 [4:11:26<55:24:47, 19.68s/it]

 25%|████████▎                        | 3432/13570 [4:11:46<55:29:49, 19.71s/it]

 25%|████████▎                        | 3433/13570 [4:12:05<55:12:46, 19.61s/it]

 25%|████████▎                        | 3434/13570 [4:12:25<55:24:59, 19.68s/it]

 25%|████████▎                        | 3435/13570 [4:12:45<55:18:41, 19.65s/it]

 25%|████████▎                        | 3436/13570 [4:13:04<55:15:59, 19.63s/it]

 25%|████████▎                        | 3437/13570 [4:13:24<55:12:29, 19.61s/it]

 25%|████████▎                        | 3438/13570 [4:13:44<55:24:27, 19.69s/it]

 25%|████████▎                        | 3439/13570 [4:14:03<55:28:42, 19.71s/it]

 25%|████████▎                        | 3440/13570 [4:14:23<55:23:57, 19.69s/it]

 25%|████████▎                        | 3441/13570 [4:14:43<55:44:12, 19.81s/it]

 25%|████████▎                        | 3442/13570 [4:15:03<55:44:41, 19.81s/it]

 25%|████████▎                        | 3443/13570 [4:15:23<55:54:13, 19.87s/it]

 25%|████████▍                        | 3444/13570 [4:15:43<55:59:54, 19.91s/it]

 25%|████████▍                        | 3445/13570 [4:16:03<55:45:34, 19.83s/it]

 25%|████████▍                        | 3446/13570 [4:16:23<56:02:45, 19.93s/it]

 25%|████████▍                        | 3447/13570 [4:16:43<56:08:33, 19.97s/it]

 25%|████████▍                        | 3448/13570 [4:17:03<56:02:59, 19.93s/it]

 25%|████████▍                        | 3449/13570 [4:17:23<56:04:11, 19.94s/it]

 25%|████████▍                        | 3450/13570 [4:17:43<56:16:39, 20.02s/it]                                                                                {'loss': 1.6759, 'grad_norm': 0.5359129905700684, 'learning_rate': 0.0003, 'epoch': 1.27}
 25%|████████▍                        | 3450/13570 [4:17:43<56:16:39, 20.02s/it]

 25%|████████▍                        | 3451/13570 [4:18:03<56:19:09, 20.04s/it]

 25%|████████▍                        | 3452/13570 [4:18:23<56:06:01, 19.96s/it]

 25%|████████▍                        | 3453/13570 [4:18:43<56:06:09, 19.96s/it]

 25%|████████▍                        | 3454/13570 [4:19:02<55:48:10, 19.86s/it]

 25%|████████▍                        | 3455/13570 [4:19:22<55:42:34, 19.83s/it]

 25%|████████▍                        | 3456/13570 [4:19:42<55:30:58, 19.76s/it]

 25%|████████▍                        | 3457/13570 [4:20:01<55:13:18, 19.66s/it]

 25%|████████▍                        | 3458/13570 [4:20:21<55:23:14, 19.72s/it]

 25%|████████▍                        | 3459/13570 [4:20:41<55:38:34, 19.81s/it]

 25%|████████▍                        | 3460/13570 [4:21:01<55:48:43, 19.87s/it]

 26%|████████▍                        | 3461/13570 [4:21:21<56:06:17, 19.98s/it]

 26%|████████▍                        | 3462/13570 [4:21:41<55:41:44, 19.84s/it]

 26%|████████▍                        | 3463/13570 [4:22:00<55:38:32, 19.82s/it]

 26%|████████▍                        | 3464/13570 [4:22:20<55:24:07, 19.74s/it]

 26%|████████▍                        | 3465/13570 [4:22:40<55:36:09, 19.81s/it]

 26%|████████▍                        | 3466/13570 [4:23:00<55:46:30, 19.87s/it]

 26%|████████▍                        | 3467/13570 [4:23:19<55:22:42, 19.73s/it]

 26%|████████▍                        | 3468/13570 [4:23:39<55:13:19, 19.68s/it]

 26%|████████▍                        | 3469/13570 [4:23:59<55:04:44, 19.63s/it]

 26%|████████▍                        | 3470/13570 [4:24:18<55:10:06, 19.66s/it]

 26%|████████▍                        | 3471/13570 [4:24:38<54:58:54, 19.60s/it]

 26%|████████▍                        | 3472/13570 [4:24:58<55:17:20, 19.71s/it]

 26%|████████▍                        | 3473/13570 [4:25:18<55:34:34, 19.82s/it]

 26%|████████▍                        | 3474/13570 [4:25:37<55:27:22, 19.77s/it]

 26%|████████▍                        | 3475/13570 [4:25:57<55:33:21, 19.81s/it]

 26%|████████▍                        | 3476/13570 [4:26:17<55:33:07, 19.81s/it]

 26%|████████▍                        | 3477/13570 [4:26:37<55:15:33, 19.71s/it]

 26%|████████▍                        | 3478/13570 [4:26:57<55:34:00, 19.82s/it]

 26%|████████▍                        | 3479/13570 [4:27:16<55:33:22, 19.82s/it]

 26%|████████▍                        | 3480/13570 [4:27:36<55:29:45, 19.80s/it]

 26%|████████▍                        | 3481/13570 [4:27:56<55:27:18, 19.79s/it]

 26%|████████▍                        | 3482/13570 [4:28:15<55:05:57, 19.66s/it]

 26%|████████▍                        | 3483/13570 [4:28:35<55:06:43, 19.67s/it]

 26%|████████▍                        | 3484/13570 [4:28:55<55:21:31, 19.76s/it]

 26%|████████▍                        | 3485/13570 [4:29:15<55:33:06, 19.83s/it]

 26%|████████▍                        | 3486/13570 [4:29:35<55:16:55, 19.74s/it]

 26%|████████▍                        | 3487/13570 [4:29:54<55:20:12, 19.76s/it]

 26%|████████▍                        | 3488/13570 [4:30:14<55:09:23, 19.69s/it]

 26%|████████▍                        | 3489/13570 [4:30:34<55:31:37, 19.83s/it]

 26%|████████▍                        | 3490/13570 [4:30:54<55:21:30, 19.77s/it]

 26%|████████▍                        | 3491/13570 [4:31:14<55:24:54, 19.79s/it]

 26%|████████▍                        | 3492/13570 [4:31:33<55:07:07, 19.69s/it]

 26%|████████▍                        | 3493/13570 [4:31:53<55:24:33, 19.79s/it]

 26%|████████▍                        | 3494/13570 [4:32:13<55:11:32, 19.72s/it]

 26%|████████▍                        | 3495/13570 [4:32:32<55:17:35, 19.76s/it]

 26%|████████▌                        | 3496/13570 [4:32:52<55:15:38, 19.75s/it]

 26%|████████▌                        | 3497/13570 [4:33:12<55:31:17, 19.84s/it]

 26%|████████▌                        | 3498/13570 [4:33:32<55:28:38, 19.83s/it]

 26%|████████▌                        | 3499/13570 [4:33:52<55:31:25, 19.85s/it]

 26%|████████▌                        | 3500/13570 [4:34:12<55:37:49, 19.89s/it]                                                                                {'loss': 1.6811, 'grad_norm': 0.44531187415122986, 'learning_rate': 0.0003, 'epoch': 1.29}
 26%|████████▌                        | 3500/13570 [4:34:12<55:37:49, 19.89s/it][INFO|trainer.py:3512] 2024-04-19 20:41:15,603 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 20:41:15,603 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 20:41:15,603 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.54s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.18s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.52s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.72s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.84s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.92s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.97s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.03s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.05s/it][A


 43%|██████████████████▍                        | 12/28 [00:33<00:48,  3.06s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.07s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.08s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.08s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.08s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:33,  3.09s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:30,  3.09s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.09s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.09s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.09s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:04<00:18,  3.09s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:07<00:15,  3.09s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.10s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.10s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.09s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.09s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A                                                                                
                                                                                [A{'eval_loss': 1.812827467918396, 'eval_runtime': 86.8434, 'eval_samples_per_second': 1.267, 'eval_steps_per_second': 0.322, 'epoch': 1.29}
 26%|████████▌                        | 3500/13570 [4:35:39<55:37:49, 19.89s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 20:42:42,449 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3500


[INFO|configuration_utils.py:726] 2024-04-19 20:42:42,709 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 20:42:42,710 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 20:42:42,911 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 20:42:42,912 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3500/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 20:42:43,242 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 20:42:43,243 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json


[INFO|trainer.py:3295] 2024-04-19 20:42:43,248 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1500] due to args.save_total_limit


 26%|████████▎                       | 3501/13570 [4:35:59<129:08:09, 46.17s/it]

 26%|████████▎                       | 3502/13570 [4:36:19<107:08:15, 38.31s/it]

 26%|████████▌                        | 3503/13570 [4:36:39<91:26:33, 32.70s/it]

 26%|████████▌                        | 3504/13570 [4:36:59<80:32:36, 28.81s/it]

 26%|████████▌                        | 3505/13570 [4:37:19<73:06:11, 26.15s/it]

 26%|████████▌                        | 3506/13570 [4:37:38<67:45:28, 24.24s/it]

 26%|████████▌                        | 3507/13570 [4:37:58<64:01:44, 22.91s/it]

 26%|████████▌                        | 3508/13570 [4:38:18<61:42:33, 22.08s/it]

 26%|████████▌                        | 3509/13570 [4:38:38<59:53:48, 21.43s/it]

 26%|████████▌                        | 3510/13570 [4:38:58<58:30:02, 20.93s/it]

 26%|████████▌                        | 3511/13570 [4:39:18<57:44:37, 20.67s/it]

 26%|████████▌                        | 3512/13570 [4:39:38<56:59:25, 20.40s/it]

 26%|████████▌                        | 3513/13570 [4:39:58<56:26:39, 20.20s/it]

 26%|████████▌                        | 3514/13570 [4:40:17<56:00:13, 20.05s/it]

 26%|████████▌                        | 3515/13570 [4:40:37<55:43:00, 19.95s/it]

 26%|████████▌                        | 3516/13570 [4:40:57<55:31:49, 19.88s/it]

 26%|████████▌                        | 3517/13570 [4:41:17<55:36:50, 19.92s/it]

 26%|████████▌                        | 3518/13570 [4:41:36<55:27:37, 19.86s/it]

 26%|████████▌                        | 3519/13570 [4:41:56<55:28:14, 19.87s/it]

 26%|████████▌                        | 3520/13570 [4:42:16<55:09:24, 19.76s/it]

 26%|████████▌                        | 3521/13570 [4:42:35<54:58:11, 19.69s/it]

 26%|████████▌                        | 3522/13570 [4:42:55<55:02:49, 19.72s/it]

 26%|████████▌                        | 3523/13570 [4:43:16<55:37:54, 19.93s/it]

 26%|████████▌                        | 3524/13570 [4:43:36<55:56:54, 20.05s/it]

 26%|████████▌                        | 3525/13570 [4:43:56<55:48:33, 20.00s/it]

 26%|████████▌                        | 3526/13570 [4:44:16<55:48:20, 20.00s/it]

 26%|████████▌                        | 3527/13570 [4:44:36<55:50:19, 20.02s/it]

 26%|████████▌                        | 3528/13570 [4:44:55<55:29:52, 19.90s/it]

 26%|████████▌                        | 3529/13570 [4:45:16<55:38:44, 19.95s/it]

 26%|████████▌                        | 3530/13570 [4:45:35<55:36:17, 19.94s/it]

 26%|████████▌                        | 3531/13570 [4:45:55<55:15:40, 19.82s/it]

 26%|████████▌                        | 3532/13570 [4:46:14<54:58:04, 19.71s/it]

 26%|████████▌                        | 3533/13570 [4:46:34<55:07:15, 19.77s/it]

 26%|████████▌                        | 3534/13570 [4:46:55<55:27:35, 19.89s/it]

 26%|████████▌                        | 3535/13570 [4:47:14<55:17:44, 19.84s/it]

 26%|████████▌                        | 3536/13570 [4:47:34<55:18:30, 19.84s/it]

 26%|████████▌                        | 3537/13570 [4:47:54<55:23:54, 19.88s/it]

 26%|████████▌                        | 3538/13570 [4:48:14<55:16:42, 19.84s/it]

 26%|████████▌                        | 3539/13570 [4:48:34<55:20:20, 19.86s/it]

 26%|████████▌                        | 3540/13570 [4:48:54<55:37:37, 19.97s/it]

 26%|████████▌                        | 3541/13570 [4:49:14<55:25:08, 19.89s/it]

 26%|████████▌                        | 3542/13570 [4:49:34<55:42:27, 20.00s/it]

 26%|████████▌                        | 3543/13570 [4:49:54<55:53:23, 20.07s/it]

 26%|████████▌                        | 3544/13570 [4:50:14<55:31:40, 19.94s/it]

 26%|████████▌                        | 3545/13570 [4:50:33<55:14:59, 19.84s/it]

 26%|████████▌                        | 3546/13570 [4:50:53<55:04:53, 19.78s/it]

 26%|████████▋                        | 3547/13570 [4:51:13<55:14:45, 19.84s/it]

 26%|████████▋                        | 3548/13570 [4:51:33<55:05:38, 19.79s/it]

 26%|████████▋                        | 3549/13570 [4:51:53<55:25:15, 19.91s/it]

 26%|████████▋                        | 3550/13570 [4:52:13<55:17:30, 19.87s/it]                                                                                {'loss': 1.6772, 'grad_norm': 0.303440660238266, 'learning_rate': 0.0003, 'epoch': 1.31}
 26%|████████▋                        | 3550/13570 [4:52:13<55:17:30, 19.87s/it]

 26%|████████▋                        | 3551/13570 [4:52:32<55:06:17, 19.80s/it]

 26%|████████▋                        | 3552/13570 [4:52:52<55:04:18, 19.79s/it]

 26%|████████▋                        | 3553/13570 [4:53:12<55:12:01, 19.84s/it]

 26%|████████▋                        | 3554/13570 [4:53:32<55:34:38, 19.98s/it]

 26%|████████▋                        | 3555/13570 [4:53:52<55:33:26, 19.97s/it]

 26%|████████▋                        | 3556/13570 [4:54:12<55:06:48, 19.81s/it]

 26%|████████▋                        | 3557/13570 [4:54:31<54:55:40, 19.75s/it]

 26%|████████▋                        | 3558/13570 [4:54:51<54:48:42, 19.71s/it]

 26%|████████▋                        | 3559/13570 [4:55:10<54:41:09, 19.67s/it]

 26%|████████▋                        | 3560/13570 [4:55:30<54:45:03, 19.69s/it]

 26%|████████▋                        | 3561/13570 [4:55:50<54:48:34, 19.71s/it]

 26%|████████▋                        | 3562/13570 [4:56:10<54:48:31, 19.72s/it]

 26%|████████▋                        | 3563/13570 [4:56:29<54:48:00, 19.71s/it]

 26%|████████▋                        | 3564/13570 [4:56:49<54:32:39, 19.62s/it]

 26%|████████▋                        | 3565/13570 [4:57:08<54:28:32, 19.60s/it]

 26%|████████▋                        | 3566/13570 [4:57:28<54:43:06, 19.69s/it]

 26%|████████▋                        | 3567/13570 [4:57:48<54:48:47, 19.73s/it]

 26%|████████▋                        | 3568/13570 [4:58:08<54:42:37, 19.69s/it]

 26%|████████▋                        | 3569/13570 [4:58:27<54:33:41, 19.64s/it]

 26%|████████▋                        | 3570/13570 [4:58:47<55:03:40, 19.82s/it]

 26%|████████▋                        | 3571/13570 [4:59:07<55:04:32, 19.83s/it]

 26%|████████▋                        | 3572/13570 [4:59:27<54:59:57, 19.80s/it]

 26%|████████▋                        | 3573/13570 [4:59:47<54:51:08, 19.75s/it]

 26%|████████▋                        | 3574/13570 [5:00:06<54:40:03, 19.69s/it]

 26%|████████▋                        | 3575/13570 [5:00:26<54:47:18, 19.73s/it]

 26%|████████▋                        | 3576/13570 [5:00:46<54:41:52, 19.70s/it]

 26%|████████▋                        | 3577/13570 [5:01:06<54:52:03, 19.77s/it]

 26%|████████▋                        | 3578/13570 [5:01:26<55:09:11, 19.87s/it]

 26%|████████▋                        | 3579/13570 [5:01:46<55:12:21, 19.89s/it]

 26%|████████▋                        | 3580/13570 [5:02:05<55:04:00, 19.84s/it]

 26%|████████▋                        | 3581/13570 [5:02:25<54:58:18, 19.81s/it]

 26%|████████▋                        | 3582/13570 [5:02:45<55:04:18, 19.85s/it]

 26%|████████▋                        | 3583/13570 [5:03:05<55:18:06, 19.93s/it]

 26%|████████▋                        | 3584/13570 [5:03:25<55:23:16, 19.97s/it]

 26%|████████▋                        | 3585/13570 [5:03:45<55:03:51, 19.85s/it]

 26%|████████▋                        | 3586/13570 [5:04:05<55:11:37, 19.90s/it]

 26%|████████▋                        | 3587/13570 [5:04:25<55:09:06, 19.89s/it]

 26%|████████▋                        | 3588/13570 [5:04:44<55:02:30, 19.85s/it]

 26%|████████▋                        | 3589/13570 [5:05:04<55:10:25, 19.90s/it]

 26%|████████▋                        | 3590/13570 [5:05:25<55:19:39, 19.96s/it]

 26%|████████▋                        | 3591/13570 [5:05:44<54:53:31, 19.80s/it]

 26%|████████▋                        | 3592/13570 [5:06:04<55:05:06, 19.87s/it]

 26%|████████▋                        | 3593/13570 [5:06:24<55:04:10, 19.87s/it]

 26%|████████▋                        | 3594/13570 [5:06:44<54:49:11, 19.78s/it]

 26%|████████▋                        | 3595/13570 [5:07:03<54:32:31, 19.68s/it]

 26%|████████▋                        | 3596/13570 [5:07:23<54:47:27, 19.78s/it]

 27%|████████▋                        | 3597/13570 [5:07:43<55:03:23, 19.87s/it]

 27%|████████▋                        | 3598/13570 [5:08:03<55:17:50, 19.96s/it]

 27%|████████▊                        | 3599/13570 [5:08:23<54:55:23, 19.83s/it]

 27%|████████▊                        | 3600/13570 [5:08:43<54:54:07, 19.82s/it]                                                                                {'loss': 1.6565, 'grad_norm': 0.3197615146636963, 'learning_rate': 0.0003, 'epoch': 1.33}
 27%|████████▊                        | 3600/13570 [5:08:43<54:54:07, 19.82s/it][INFO|trainer.py:3512] 2024-04-19 21:15:46,298 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 21:15:46,298 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 21:15:46,298 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.56s/it][A


 11%|████▋                                       | 3/28 [00:06<00:55,  2.21s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:01,  2.55s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:03,  2.75s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:03,  2.87s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.95s/it][A


 29%|████████████▌                               | 8/28 [00:21<01:00,  3.00s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.04s/it][A


 36%|███████████████▎                           | 10/28 [00:28<00:55,  3.06s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.08s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.09s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.10s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.10s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.11s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.11s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.11s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.11s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:56<00:28,  3.11s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.11s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.11s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.12s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.12s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.12s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.12s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.11s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:21<00:03,  3.12s/it][A


100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.13s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8097385168075562, 'eval_runtime': 87.5535, 'eval_samples_per_second': 1.256, 'eval_steps_per_second': 0.32, 'epoch': 1.33}
 27%|████████▊                        | 3600/13570 [5:10:10<54:54:07, 19.82s/it]
100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.13s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 21:17:13,854 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3600


[INFO|configuration_utils.py:726] 2024-04-19 21:17:14,305 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 21:17:14,307 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 21:17:14,517 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3600/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 21:17:14,517 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3600/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 21:17:14,861 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 21:17:14,861 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 21:17:14,865 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1600] due to args.save_total_limit


 27%|████████▍                       | 3601/13570 [5:10:31<128:25:51, 46.38s/it]

 27%|████████▍                       | 3602/13570 [5:10:51<106:14:49, 38.37s/it]

 27%|████████▊                        | 3603/13570 [5:11:11<91:03:44, 32.89s/it]

 27%|████████▊                        | 3604/13570 [5:11:30<80:02:39, 28.91s/it]

 27%|████████▊                        | 3605/13570 [5:11:50<72:23:00, 26.15s/it]

 27%|████████▊                        | 3606/13570 [5:12:10<66:56:57, 24.19s/it]

 27%|████████▊                        | 3607/13570 [5:12:30<63:28:15, 22.93s/it]

 27%|████████▊                        | 3608/13570 [5:12:50<61:16:13, 22.14s/it]

 27%|████████▊                        | 3609/13570 [5:13:10<59:25:53, 21.48s/it]

 27%|████████▊                        | 3610/13570 [5:13:30<58:00:49, 20.97s/it]

 27%|████████▊                        | 3611/13570 [5:13:50<57:25:13, 20.76s/it]

 27%|████████▊                        | 3612/13570 [5:14:09<56:22:48, 20.38s/it]

 27%|████████▊                        | 3613/13570 [5:14:29<56:05:56, 20.28s/it]

 27%|████████▊                        | 3614/13570 [5:14:49<55:25:04, 20.04s/it]

 27%|████████▊                        | 3615/13570 [5:15:08<54:59:20, 19.89s/it]

 27%|████████▊                        | 3616/13570 [5:15:28<54:49:58, 19.83s/it]

 27%|████████▊                        | 3617/13570 [5:15:48<54:56:54, 19.87s/it]

 27%|████████▊                        | 3618/13570 [5:16:08<54:50:12, 19.84s/it]

 27%|████████▊                        | 3619/13570 [5:16:28<54:58:50, 19.89s/it]

 27%|████████▊                        | 3620/13570 [5:16:48<54:53:16, 19.86s/it]

 27%|████████▊                        | 3621/13570 [5:17:07<54:36:52, 19.76s/it]

 27%|████████▊                        | 3622/13570 [5:17:27<54:22:53, 19.68s/it]

 27%|████████▊                        | 3623/13570 [5:17:47<54:42:57, 19.80s/it]

 27%|████████▊                        | 3624/13570 [5:18:07<54:52:33, 19.86s/it]

 27%|████████▊                        | 3625/13570 [5:18:27<54:59:10, 19.90s/it]

 27%|████████▊                        | 3626/13570 [5:18:46<54:47:42, 19.84s/it]

 27%|████████▊                        | 3627/13570 [5:19:06<54:45:45, 19.83s/it]

 27%|████████▊                        | 3628/13570 [5:19:26<54:43:02, 19.81s/it]

 27%|████████▊                        | 3629/13570 [5:19:46<54:35:24, 19.77s/it]

 27%|████████▊                        | 3630/13570 [5:20:06<54:48:33, 19.85s/it]

 27%|████████▊                        | 3631/13570 [5:20:26<54:54:00, 19.89s/it]

 27%|████████▊                        | 3632/13570 [5:20:46<54:47:32, 19.85s/it]

 27%|████████▊                        | 3633/13570 [5:21:05<54:47:59, 19.85s/it]

 27%|████████▊                        | 3634/13570 [5:21:25<54:55:22, 19.90s/it]

 27%|████████▊                        | 3635/13570 [5:21:45<54:52:03, 19.88s/it]

 27%|████████▊                        | 3636/13570 [5:22:05<55:00:01, 19.93s/it]

 27%|████████▊                        | 3637/13570 [5:22:25<54:43:31, 19.83s/it]

 27%|████████▊                        | 3638/13570 [5:22:45<54:40:13, 19.82s/it]

 27%|████████▊                        | 3639/13570 [5:23:05<54:55:55, 19.91s/it]

 27%|████████▊                        | 3640/13570 [5:23:24<54:31:53, 19.77s/it]

 27%|████████▊                        | 3641/13570 [5:23:44<54:39:40, 19.82s/it]

 27%|████████▊                        | 3642/13570 [5:24:04<54:42:01, 19.83s/it]

 27%|████████▊                        | 3643/13570 [5:24:24<54:39:46, 19.82s/it]

 27%|████████▊                        | 3644/13570 [5:24:43<54:23:09, 19.72s/it]

 27%|████████▊                        | 3645/13570 [5:25:03<54:22:01, 19.72s/it]

 27%|████████▊                        | 3646/13570 [5:25:23<54:32:33, 19.79s/it]

 27%|████████▊                        | 3647/13570 [5:25:43<54:52:44, 19.91s/it]

 27%|████████▊                        | 3648/13570 [5:26:03<54:41:58, 19.85s/it]

 27%|████████▊                        | 3649/13570 [5:26:23<54:53:02, 19.92s/it]

 27%|████████▉                        | 3650/13570 [5:26:42<54:33:59, 19.80s/it]                                                                                {'loss': 1.6922, 'grad_norm': 0.3871593773365021, 'learning_rate': 0.0003, 'epoch': 1.34}
 27%|████████▉                        | 3650/13570 [5:26:42<54:33:59, 19.80s/it]

 27%|████████▉                        | 3651/13570 [5:27:02<54:27:37, 19.77s/it]

 27%|████████▉                        | 3652/13570 [5:27:22<54:24:24, 19.75s/it]

 27%|████████▉                        | 3653/13570 [5:27:42<54:22:24, 19.74s/it]

 27%|████████▉                        | 3654/13570 [5:28:02<54:34:15, 19.81s/it]

 27%|████████▉                        | 3655/13570 [5:28:22<55:02:22, 19.98s/it]

 27%|████████▉                        | 3656/13570 [5:28:42<54:52:09, 19.92s/it]

 27%|████████▉                        | 3657/13570 [5:29:02<54:51:45, 19.92s/it]

 27%|████████▉                        | 3658/13570 [5:29:21<54:32:38, 19.81s/it]

 27%|████████▉                        | 3659/13570 [5:29:41<54:44:44, 19.89s/it]

 27%|████████▉                        | 3660/13570 [5:30:01<54:28:38, 19.79s/it]

 27%|████████▉                        | 3661/13570 [5:30:21<54:30:03, 19.80s/it]

 27%|████████▉                        | 3662/13570 [5:30:40<54:27:47, 19.79s/it]

 27%|████████▉                        | 3663/13570 [5:31:00<54:30:10, 19.81s/it]

 27%|████████▉                        | 3664/13570 [5:31:20<54:44:25, 19.89s/it]

 27%|████████▉                        | 3665/13570 [5:31:40<54:37:50, 19.86s/it]

 27%|████████▉                        | 3666/13570 [5:32:00<54:18:17, 19.74s/it]

 27%|████████▉                        | 3667/13570 [5:32:19<54:02:22, 19.64s/it]

 27%|████████▉                        | 3668/13570 [5:32:39<54:26:34, 19.79s/it]

 27%|████████▉                        | 3669/13570 [5:32:59<54:28:58, 19.81s/it]

 27%|████████▉                        | 3670/13570 [5:33:19<54:24:21, 19.78s/it]

 27%|████████▉                        | 3671/13570 [5:33:38<54:13:15, 19.72s/it]

 27%|████████▉                        | 3672/13570 [5:33:58<54:32:51, 19.84s/it]

 27%|████████▉                        | 3673/13570 [5:34:18<54:30:44, 19.83s/it]

 27%|████████▉                        | 3674/13570 [5:34:38<54:21:45, 19.78s/it]

 27%|████████▉                        | 3675/13570 [5:34:58<54:33:35, 19.85s/it]

 27%|████████▉                        | 3676/13570 [5:35:18<54:26:12, 19.81s/it]

 27%|████████▉                        | 3677/13570 [5:35:37<54:27:12, 19.82s/it]

 27%|████████▉                        | 3678/13570 [5:35:57<54:11:12, 19.72s/it]

 27%|████████▉                        | 3679/13570 [5:36:17<54:39:14, 19.89s/it]

 27%|████████▉                        | 3680/13570 [5:36:37<54:27:55, 19.83s/it]

 27%|████████▉                        | 3681/13570 [5:36:57<54:29:46, 19.84s/it]

 27%|████████▉                        | 3682/13570 [5:37:17<54:46:46, 19.94s/it]

 27%|████████▉                        | 3683/13570 [5:37:37<54:32:24, 19.86s/it]

 27%|████████▉                        | 3684/13570 [5:37:56<54:30:18, 19.85s/it]

 27%|████████▉                        | 3685/13570 [5:38:17<54:43:15, 19.93s/it]

 27%|████████▉                        | 3686/13570 [5:38:36<54:23:18, 19.81s/it]

 27%|████████▉                        | 3687/13570 [5:38:56<54:33:36, 19.87s/it]

 27%|████████▉                        | 3688/13570 [5:39:16<54:46:23, 19.95s/it]

 27%|████████▉                        | 3689/13570 [5:39:36<54:29:38, 19.85s/it]

 27%|████████▉                        | 3690/13570 [5:39:56<54:19:00, 19.79s/it]

 27%|████████▉                        | 3691/13570 [5:40:16<54:35:02, 19.89s/it]

 27%|████████▉                        | 3692/13570 [5:40:35<54:18:29, 19.79s/it]

 27%|████████▉                        | 3693/13570 [5:40:55<54:05:05, 19.71s/it]

 27%|████████▉                        | 3694/13570 [5:41:15<54:08:35, 19.74s/it]

 27%|████████▉                        | 3695/13570 [5:41:34<54:08:02, 19.73s/it]

 27%|████████▉                        | 3696/13570 [5:41:54<54:08:59, 19.74s/it]

 27%|████████▉                        | 3697/13570 [5:42:13<53:52:28, 19.64s/it]

 27%|████████▉                        | 3698/13570 [5:42:33<53:39:02, 19.56s/it]

 27%|████████▉                        | 3699/13570 [5:42:53<53:51:38, 19.64s/it]

 27%|████████▉                        | 3700/13570 [5:43:12<53:48:18, 19.62s/it]                                                                                {'loss': 1.7127, 'grad_norm': 0.5771322846412659, 'learning_rate': 0.0003, 'epoch': 1.36}
 27%|████████▉                        | 3700/13570 [5:43:12<53:48:18, 19.62s/it][INFO|trainer.py:3512] 2024-04-19 21:50:15,973 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 21:50:15,974 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 21:50:15,974 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.54s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.19s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.53s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.72s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.84s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.92s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.97s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.03s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.05s/it][A


 43%|██████████████████▍                        | 12/28 [00:33<00:49,  3.06s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.07s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.08s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.09s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.09s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.09s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:30,  3.10s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.10s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.10s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.10s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.11s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.11s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.11s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.11s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.11s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.11s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.13s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8076400756835938, 'eval_runtime': 87.0728, 'eval_samples_per_second': 1.263, 'eval_steps_per_second': 0.322, 'epoch': 1.36}
 27%|████████▉                        | 3700/13570 [5:44:39<53:48:18, 19.62s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.13s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 21:51:43,049 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3700


[INFO|configuration_utils.py:726] 2024-04-19 21:51:43,331 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 21:51:43,332 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 21:51:43,529 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3700/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 21:51:43,530 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3700/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 21:51:43,877 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 21:51:43,878 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 21:51:43,880 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1700] due to args.save_total_limit


 27%|████████▋                       | 3701/13570 [5:45:00<126:20:54, 46.09s/it]

 27%|████████▋                       | 3702/13570 [5:45:20<104:50:40, 38.25s/it]

 27%|█████████                        | 3703/13570 [5:45:40<89:25:23, 32.63s/it]

 27%|█████████                        | 3704/13570 [5:46:00<79:13:27, 28.91s/it]

 27%|█████████                        | 3705/13570 [5:46:19<71:27:27, 26.08s/it]

 27%|█████████                        | 3706/13570 [5:46:39<66:25:35, 24.24s/it]

 27%|█████████                        | 3707/13570 [5:46:59<62:32:04, 22.83s/it]

 27%|█████████                        | 3708/13570 [5:47:19<60:02:45, 21.92s/it]

 27%|█████████                        | 3709/13570 [5:47:39<58:29:20, 21.35s/it]

 27%|█████████                        | 3710/13570 [5:47:59<57:19:59, 20.93s/it]

 27%|█████████                        | 3711/13570 [5:48:18<56:24:50, 20.60s/it]

 27%|█████████                        | 3712/13570 [5:48:38<55:44:41, 20.36s/it]

 27%|█████████                        | 3713/13570 [5:48:58<55:11:07, 20.15s/it]

 27%|█████████                        | 3714/13570 [5:49:18<55:06:23, 20.13s/it]

 27%|█████████                        | 3715/13570 [5:49:38<55:01:07, 20.10s/it]

 27%|█████████                        | 3716/13570 [5:49:58<54:59:24, 20.09s/it]

 27%|█████████                        | 3717/13570 [5:50:18<55:02:25, 20.11s/it]

 27%|█████████                        | 3718/13570 [5:50:38<54:30:40, 19.92s/it]

 27%|█████████                        | 3719/13570 [5:50:57<54:12:01, 19.81s/it]

 27%|█████████                        | 3720/13570 [5:51:17<54:14:06, 19.82s/it]

 27%|█████████                        | 3721/13570 [5:51:37<54:21:14, 19.87s/it]

 27%|█████████                        | 3722/13570 [5:51:57<54:19:48, 19.86s/it]

 27%|█████████                        | 3723/13570 [5:52:17<54:19:12, 19.86s/it]

 27%|█████████                        | 3724/13570 [5:52:36<54:07:53, 19.79s/it]

 27%|█████████                        | 3725/13570 [5:52:57<54:31:47, 19.94s/it]

 27%|█████████                        | 3726/13570 [5:53:16<54:13:09, 19.83s/it]

 27%|█████████                        | 3727/13570 [5:53:36<54:02:30, 19.77s/it]

 27%|█████████                        | 3728/13570 [5:53:56<54:06:49, 19.79s/it]

 27%|█████████                        | 3729/13570 [5:54:16<54:18:16, 19.87s/it]

 27%|█████████                        | 3730/13570 [5:54:36<54:19:37, 19.88s/it]

 27%|█████████                        | 3731/13570 [5:54:55<54:09:23, 19.82s/it]

 28%|█████████                        | 3732/13570 [5:55:15<54:15:01, 19.85s/it]

 28%|█████████                        | 3733/13570 [5:55:35<54:34:41, 19.97s/it]

 28%|█████████                        | 3734/13570 [5:55:55<54:27:19, 19.93s/it]

 28%|█████████                        | 3735/13570 [5:56:15<54:21:24, 19.90s/it]

 28%|█████████                        | 3736/13570 [5:56:35<54:17:53, 19.88s/it]

 28%|█████████                        | 3737/13570 [5:56:55<54:25:05, 19.92s/it]

 28%|█████████                        | 3738/13570 [5:57:15<54:29:35, 19.95s/it]

 28%|█████████                        | 3739/13570 [5:57:35<54:13:44, 19.86s/it]

 28%|█████████                        | 3740/13570 [5:57:55<54:37:01, 20.00s/it]

 28%|█████████                        | 3741/13570 [5:58:15<54:26:39, 19.94s/it]

 28%|█████████                        | 3742/13570 [5:58:35<54:26:40, 19.94s/it]

 28%|█████████                        | 3743/13570 [5:58:55<54:23:03, 19.92s/it]

 28%|█████████                        | 3744/13570 [5:59:14<54:01:23, 19.79s/it]

 28%|█████████                        | 3745/13570 [5:59:34<54:05:58, 19.82s/it]

 28%|█████████                        | 3746/13570 [5:59:54<54:28:11, 19.96s/it]

 28%|█████████                        | 3747/13570 [6:00:14<54:31:45, 19.98s/it]

 28%|█████████                        | 3748/13570 [6:00:34<54:17:18, 19.90s/it]

 28%|█████████                        | 3749/13570 [6:00:54<54:33:58, 20.00s/it]

 28%|█████████                        | 3750/13570 [6:01:14<54:17:25, 19.90s/it]                                                                                {'loss': 1.6926, 'grad_norm': 0.3645577132701874, 'learning_rate': 0.0003, 'epoch': 1.38}
 28%|█████████                        | 3750/13570 [6:01:14<54:17:25, 19.90s/it]

 28%|█████████                        | 3751/13570 [6:01:34<54:12:25, 19.87s/it]

 28%|█████████                        | 3752/13570 [6:01:53<54:03:00, 19.82s/it]

 28%|█████████▏                       | 3753/13570 [6:02:13<54:07:49, 19.85s/it]

 28%|█████████▏                       | 3754/13570 [6:02:33<54:10:44, 19.87s/it]

 28%|█████████▏                       | 3755/13570 [6:02:53<54:09:55, 19.87s/it]

 28%|█████████▏                       | 3756/13570 [6:03:13<54:26:19, 19.97s/it]

 28%|█████████▏                       | 3757/13570 [6:03:33<54:11:56, 19.88s/it]

 28%|█████████▏                       | 3758/13570 [6:03:53<54:03:01, 19.83s/it]

 28%|█████████▏                       | 3759/13570 [6:04:12<53:54:00, 19.78s/it]

 28%|█████████▏                       | 3760/13570 [6:04:32<54:09:50, 19.88s/it]

 28%|█████████▏                       | 3761/13570 [6:04:52<54:17:40, 19.93s/it]

 28%|█████████▏                       | 3762/13570 [6:05:13<54:23:53, 19.97s/it]

 28%|█████████▏                       | 3763/13570 [6:05:32<54:12:35, 19.90s/it]

 28%|█████████▏                       | 3764/13570 [6:05:52<54:07:49, 19.87s/it]

 28%|█████████▏                       | 3765/13570 [6:06:12<53:58:17, 19.82s/it]

 28%|█████████▏                       | 3766/13570 [6:06:31<53:42:37, 19.72s/it]

 28%|█████████▏                       | 3767/13570 [6:06:51<53:38:28, 19.70s/it]

 28%|█████████▏                       | 3768/13570 [6:07:10<53:27:34, 19.63s/it]

 28%|█████████▏                       | 3769/13570 [6:07:30<53:29:12, 19.65s/it]

 28%|█████████▏                       | 3770/13570 [6:07:50<53:36:57, 19.70s/it]

 28%|█████████▏                       | 3771/13570 [6:08:10<53:50:07, 19.78s/it]

 28%|█████████▏                       | 3772/13570 [6:08:29<53:31:47, 19.67s/it]

 28%|█████████▏                       | 3773/13570 [6:08:49<53:24:53, 19.63s/it]

 28%|█████████▏                       | 3774/13570 [6:09:08<53:23:28, 19.62s/it]

 28%|█████████▏                       | 3775/13570 [6:09:28<53:34:18, 19.69s/it]

 28%|█████████▏                       | 3776/13570 [6:09:48<53:23:26, 19.62s/it]

 28%|█████████▏                       | 3777/13570 [6:10:07<53:24:05, 19.63s/it]

 28%|█████████▏                       | 3778/13570 [6:10:27<53:09:48, 19.55s/it]

 28%|█████████▏                       | 3779/13570 [6:10:47<53:39:33, 19.73s/it]

 28%|█████████▏                       | 3780/13570 [6:11:07<53:45:15, 19.77s/it]

 28%|█████████▏                       | 3781/13570 [6:11:27<53:47:34, 19.78s/it]

 28%|█████████▏                       | 3782/13570 [6:11:46<53:46:14, 19.78s/it]

 28%|█████████▏                       | 3783/13570 [6:12:06<53:42:51, 19.76s/it]

 28%|█████████▏                       | 3784/13570 [6:12:26<53:52:10, 19.82s/it]

 28%|█████████▏                       | 3785/13570 [6:12:46<53:49:58, 19.81s/it]

 28%|█████████▏                       | 3786/13570 [6:13:06<53:51:10, 19.82s/it]

 28%|█████████▏                       | 3787/13570 [6:13:26<54:12:15, 19.95s/it]

 28%|█████████▏                       | 3788/13570 [6:13:46<54:12:39, 19.95s/it]

 28%|█████████▏                       | 3789/13570 [6:14:05<53:48:39, 19.81s/it]

 28%|█████████▏                       | 3790/13570 [6:14:25<53:54:13, 19.84s/it]

 28%|█████████▏                       | 3791/13570 [6:14:45<54:06:33, 19.92s/it]

 28%|█████████▏                       | 3792/13570 [6:15:06<54:18:37, 20.00s/it]

 28%|█████████▏                       | 3793/13570 [6:15:25<53:38:54, 19.75s/it]

 28%|█████████▏                       | 3794/13570 [6:15:45<53:41:29, 19.77s/it]

 28%|█████████▏                       | 3795/13570 [6:16:04<53:51:17, 19.83s/it]

 28%|█████████▏                       | 3796/13570 [6:16:24<53:54:41, 19.86s/it]

 28%|█████████▏                       | 3797/13570 [6:16:44<53:41:12, 19.78s/it]

 28%|█████████▏                       | 3798/13570 [6:17:04<53:34:47, 19.74s/it]

 28%|█████████▏                       | 3799/13570 [6:17:23<53:20:33, 19.65s/it]

 28%|█████████▏                       | 3800/13570 [6:17:42<53:07:09, 19.57s/it]                                                                                {'loss': 1.6857, 'grad_norm': 0.41821321845054626, 'learning_rate': 0.0003, 'epoch': 1.4}
 28%|█████████▏                       | 3800/13570 [6:17:42<53:07:09, 19.57s/it][INFO|trainer.py:3512] 2024-04-19 22:24:46,216 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 22:24:46,216 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 22:24:46,216 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.54s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.19s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.52s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.72s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.84s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.92s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.98s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.03s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.05s/it][A


 43%|██████████████████▍                        | 12/28 [00:33<00:49,  3.06s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.07s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.08s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.08s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.09s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:33,  3.09s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:30,  3.09s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.09s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.09s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.09s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:04<00:18,  3.09s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:07<00:15,  3.09s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.09s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.09s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.09s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.09s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.11s/it][A                                                                                
                                                                                [A{'eval_loss': 1.8100290298461914, 'eval_runtime': 86.8113, 'eval_samples_per_second': 1.267, 'eval_steps_per_second': 0.323, 'epoch': 1.4}
 28%|█████████▏                       | 3800/13570 [6:19:09<53:07:09, 19.57s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.11s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 22:26:13,030 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3800


[INFO|configuration_utils.py:726] 2024-04-19 22:26:13,273 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 22:26:13,274 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 22:26:13,501 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3800/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 22:26:13,501 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3800/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 22:26:13,869 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 22:26:13,870 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 22:26:13,873 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1800] due to args.save_total_limit


 28%|████████▉                       | 3801/13570 [6:19:30<124:42:12, 45.95s/it]

 28%|████████▉                       | 3802/13570 [6:19:50<103:15:15, 38.05s/it]

 28%|█████████▏                       | 3803/13570 [6:20:09<88:15:38, 32.53s/it]

 28%|█████████▎                       | 3804/13570 [6:20:29<77:48:32, 28.68s/it]

 28%|█████████▎                       | 3805/13570 [6:20:49<70:25:26, 25.96s/it]

 28%|█████████▎                       | 3806/13570 [6:21:08<65:24:31, 24.12s/it]

 28%|█████████▎                       | 3807/13570 [6:21:28<62:01:14, 22.87s/it]

 28%|█████████▎                       | 3808/13570 [6:21:48<59:33:30, 21.96s/it]

 28%|█████████▎                       | 3809/13570 [6:22:08<57:41:49, 21.28s/it]

 28%|█████████▎                       | 3810/13570 [6:22:27<56:14:01, 20.74s/it]

 28%|█████████▎                       | 3811/13570 [6:22:47<55:11:57, 20.36s/it]

 28%|█████████▎                       | 3812/13570 [6:23:07<54:42:51, 20.19s/it]

 28%|█████████▎                       | 3813/13570 [6:23:26<54:10:48, 19.99s/it]

 28%|█████████▎                       | 3814/13570 [6:23:46<54:13:49, 20.01s/it]

 28%|█████████▎                       | 3815/13570 [6:24:06<53:43:54, 19.83s/it]

 28%|█████████▎                       | 3816/13570 [6:24:25<53:32:38, 19.76s/it]

 28%|█████████▎                       | 3817/13570 [6:24:45<53:25:29, 19.72s/it]

 28%|█████████▎                       | 3818/13570 [6:25:04<53:22:07, 19.70s/it]

 28%|█████████▎                       | 3819/13570 [6:25:24<53:13:39, 19.65s/it]

 28%|█████████▎                       | 3820/13570 [6:25:43<53:02:44, 19.59s/it]

 28%|█████████▎                       | 3821/13570 [6:26:03<53:20:29, 19.70s/it]

 28%|█████████▎                       | 3822/13570 [6:26:23<53:22:56, 19.71s/it]

 28%|█████████▎                       | 3823/13570 [6:26:43<53:26:09, 19.74s/it]

 28%|█████████▎                       | 3824/13570 [6:27:03<53:20:54, 19.71s/it]

 28%|█████████▎                       | 3825/13570 [6:27:22<53:15:38, 19.68s/it]

 28%|█████████▎                       | 3826/13570 [6:27:42<53:20:26, 19.71s/it]

 28%|█████████▎                       | 3827/13570 [6:28:02<53:16:07, 19.68s/it]

 28%|█████████▎                       | 3828/13570 [6:28:22<53:28:45, 19.76s/it]

 28%|█████████▎                       | 3829/13570 [6:28:42<53:52:11, 19.91s/it]

 28%|█████████▎                       | 3830/13570 [6:29:02<53:51:03, 19.90s/it]

 28%|█████████▎                       | 3831/13570 [6:29:22<53:48:06, 19.89s/it]

 28%|█████████▎                       | 3832/13570 [6:29:41<53:24:46, 19.75s/it]

 28%|█████████▎                       | 3833/13570 [6:30:01<53:21:11, 19.73s/it]

 28%|█████████▎                       | 3834/13570 [6:30:21<53:35:58, 19.82s/it]

 28%|█████████▎                       | 3835/13570 [6:30:40<53:18:16, 19.71s/it]

 28%|█████████▎                       | 3836/13570 [6:31:00<53:09:47, 19.66s/it]

 28%|█████████▎                       | 3837/13570 [6:31:20<53:29:39, 19.79s/it]

 28%|█████████▎                       | 3838/13570 [6:31:40<53:27:23, 19.77s/it]

 28%|█████████▎                       | 3839/13570 [6:31:59<53:21:20, 19.74s/it]

 28%|█████████▎                       | 3840/13570 [6:32:19<53:14:10, 19.70s/it]

 28%|█████████▎                       | 3841/13570 [6:32:38<53:04:38, 19.64s/it]

 28%|█████████▎                       | 3842/13570 [6:32:58<53:10:55, 19.68s/it]

 28%|█████████▎                       | 3843/13570 [6:33:18<53:29:04, 19.79s/it]

 28%|█████████▎                       | 3844/13570 [6:33:38<53:37:05, 19.85s/it]

 28%|█████████▎                       | 3845/13570 [6:33:57<53:10:27, 19.68s/it]

 28%|█████████▎                       | 3846/13570 [6:34:17<53:09:31, 19.68s/it]

 28%|█████████▎                       | 3847/13570 [6:34:37<53:02:00, 19.64s/it]

 28%|█████████▎                       | 3848/13570 [6:34:56<53:04:25, 19.65s/it]

 28%|█████████▎                       | 3849/13570 [6:35:16<52:50:26, 19.57s/it]

 28%|█████████▎                       | 3850/13570 [6:35:36<53:06:23, 19.67s/it]                                                                                {'loss': 1.6847, 'grad_norm': 0.4348907768726349, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.42}
 28%|█████████▎                       | 3850/13570 [6:35:36<53:06:23, 19.67s/it]

 28%|█████████▎                       | 3851/13570 [6:35:55<53:08:20, 19.68s/it]

 28%|█████████▎                       | 3852/13570 [6:36:15<53:15:54, 19.73s/it]

 28%|█████████▎                       | 3853/13570 [6:36:35<53:06:37, 19.68s/it]

 28%|█████████▎                       | 3854/13570 [6:36:54<53:11:03, 19.71s/it]

 28%|█████████▎                       | 3855/13570 [6:37:14<53:13:23, 19.72s/it]

 28%|█████████▍                       | 3856/13570 [6:37:34<53:25:20, 19.80s/it]

 28%|█████████▍                       | 3857/13570 [6:37:54<53:22:17, 19.78s/it]

 28%|█████████▍                       | 3858/13570 [6:38:14<53:15:35, 19.74s/it]

 28%|█████████▍                       | 3859/13570 [6:38:33<53:10:57, 19.72s/it]

 28%|█████████▍                       | 3860/13570 [6:38:53<53:11:14, 19.72s/it]

 28%|█████████▍                       | 3861/13570 [6:39:13<53:24:32, 19.80s/it]

 28%|█████████▍                       | 3862/13570 [6:39:32<53:04:03, 19.68s/it]

 28%|█████████▍                       | 3863/13570 [6:39:52<53:12:55, 19.74s/it]

 28%|█████████▍                       | 3864/13570 [6:40:12<53:01:35, 19.67s/it]

 28%|█████████▍                       | 3865/13570 [6:40:31<53:01:57, 19.67s/it]

 28%|█████████▍                       | 3866/13570 [6:40:51<53:11:22, 19.73s/it]

 28%|█████████▍                       | 3867/13570 [6:41:11<53:20:12, 19.79s/it]

 29%|█████████▍                       | 3868/13570 [6:41:31<53:10:13, 19.73s/it]

 29%|█████████▍                       | 3869/13570 [6:41:50<53:07:06, 19.71s/it]

 29%|█████████▍                       | 3870/13570 [6:42:10<53:17:49, 19.78s/it]

 29%|█████████▍                       | 3871/13570 [6:42:30<53:29:51, 19.86s/it]

 29%|█████████▍                       | 3872/13570 [6:42:50<53:27:10, 19.84s/it]

 29%|█████████▍                       | 3873/13570 [6:43:10<53:29:41, 19.86s/it]

 29%|█████████▍                       | 3874/13570 [6:43:30<53:18:57, 19.80s/it]

 29%|█████████▍                       | 3875/13570 [6:43:50<53:26:49, 19.85s/it]

 29%|█████████▍                       | 3876/13570 [6:44:09<53:20:58, 19.81s/it]

 29%|█████████▍                       | 3877/13570 [6:44:29<53:17:51, 19.79s/it]

 29%|█████████▍                       | 3878/13570 [6:44:49<53:17:16, 19.79s/it]

 29%|█████████▍                       | 3879/13570 [6:45:09<53:09:07, 19.74s/it]

 29%|█████████▍                       | 3880/13570 [6:45:28<53:08:33, 19.74s/it]

 29%|█████████▍                       | 3881/13570 [6:45:48<53:12:49, 19.77s/it]

 29%|█████████▍                       | 3882/13570 [6:46:08<52:59:41, 19.69s/it]

 29%|█████████▍                       | 3883/13570 [6:46:28<53:09:16, 19.75s/it]

 29%|█████████▍                       | 3884/13570 [6:46:48<53:14:58, 19.79s/it]

 29%|█████████▍                       | 3885/13570 [6:47:07<53:19:28, 19.82s/it]

 29%|█████████▍                       | 3886/13570 [6:47:27<53:03:09, 19.72s/it]

 29%|█████████▍                       | 3887/13570 [6:47:47<53:08:54, 19.76s/it]

 29%|█████████▍                       | 3888/13570 [6:48:07<53:30:53, 19.90s/it]

 29%|█████████▍                       | 3889/13570 [6:48:27<53:19:24, 19.83s/it]

 29%|█████████▍                       | 3890/13570 [6:48:46<53:09:20, 19.77s/it]

 29%|█████████▍                       | 3891/13570 [6:49:06<53:10:31, 19.78s/it]

 29%|█████████▍                       | 3892/13570 [6:49:26<53:13:05, 19.80s/it]

 29%|█████████▍                       | 3893/13570 [6:49:45<52:55:33, 19.69s/it]

 29%|█████████▍                       | 3894/13570 [6:50:05<53:02:16, 19.73s/it]

 29%|█████████▍                       | 3895/13570 [6:50:25<53:06:09, 19.76s/it]

 29%|█████████▍                       | 3896/13570 [6:50:45<53:22:35, 19.86s/it]

 29%|█████████▍                       | 3897/13570 [6:51:05<53:04:10, 19.75s/it]

 29%|█████████▍                       | 3898/13570 [6:51:25<53:14:33, 19.82s/it]

 29%|█████████▍                       | 3899/13570 [6:51:44<53:15:47, 19.83s/it]

 29%|█████████▍                       | 3900/13570 [6:52:04<53:00:14, 19.73s/it]                                                                                {'loss': 1.6842, 'grad_norm': 0.5230944752693176, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.44}
 29%|█████████▍                       | 3900/13570 [6:52:04<53:00:14, 19.73s/it][INFO|trainer.py:3512] 2024-04-19 22:59:07,663 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 22:59:07,663 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 22:59:07,663 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.56s/it][A


 11%|████▋                                       | 3/28 [00:06<00:55,  2.21s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:01,  2.55s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:03,  2.75s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:03,  2.87s/it][A


 25%|███████████                                 | 7/28 [00:18<01:02,  2.95s/it][A


 29%|████████████▌                               | 8/28 [00:21<01:00,  3.01s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.04s/it][A


 36%|███████████████▎                           | 10/28 [00:28<00:55,  3.06s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.08s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.09s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.10s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.10s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.11s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.11s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.11s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.11s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:56<00:27,  3.11s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.11s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.11s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.11s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.11s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.11s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.11s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.11s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.11s/it][A


100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.13s/it][A                                                                                
                                                                                [A{'eval_loss': 1.802942156791687, 'eval_runtime': 87.4763, 'eval_samples_per_second': 1.257, 'eval_steps_per_second': 0.32, 'epoch': 1.44}
 29%|█████████▍                       | 3900/13570 [6:53:31<53:00:14, 19.73s/it]
100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.13s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 23:00:35,142 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-3900


[INFO|configuration_utils.py:726] 2024-04-19 23:00:35,394 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 23:00:35,395 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 23:00:35,617 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-3900/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 23:00:35,618 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-3900/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 23:00:35,978 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 23:00:35,979 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 23:00:35,983 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-1900] due to args.save_total_limit


 29%|█████████▏                      | 3901/13570 [6:53:52<124:08:31, 46.22s/it]

 29%|█████████▏                      | 3902/13570 [6:54:12<102:47:22, 38.28s/it]

 29%|█████████▍                       | 3903/13570 [6:54:31<87:46:01, 32.68s/it]

 29%|█████████▍                       | 3904/13570 [6:54:51<77:31:33, 28.87s/it]

 29%|█████████▍                       | 3905/13570 [6:55:11<70:22:55, 26.22s/it]

 29%|█████████▍                       | 3906/13570 [6:55:31<64:59:49, 24.21s/it]

 29%|█████████▌                       | 3907/13570 [6:55:50<61:17:29, 22.83s/it]

 29%|█████████▌                       | 3908/13570 [6:56:10<58:43:32, 21.88s/it]

 29%|█████████▌                       | 3909/13570 [6:56:30<57:15:56, 21.34s/it]

 29%|█████████▌                       | 3910/13570 [6:56:50<55:45:34, 20.78s/it]

 29%|█████████▌                       | 3911/13570 [6:57:09<54:56:09, 20.48s/it]

 29%|█████████▌                       | 3912/13570 [6:57:29<54:28:34, 20.31s/it]

 29%|█████████▌                       | 3913/13570 [6:57:49<54:04:29, 20.16s/it]

 29%|█████████▌                       | 3914/13570 [6:58:09<53:44:40, 20.04s/it]

 29%|█████████▌                       | 3915/13570 [6:58:28<53:20:12, 19.89s/it]

 29%|█████████▌                       | 3916/13570 [6:58:48<53:06:42, 19.81s/it]

 29%|█████████▌                       | 3917/13570 [6:59:08<53:10:44, 19.83s/it]

 29%|█████████▌                       | 3918/13570 [6:59:28<53:13:20, 19.85s/it]

 29%|█████████▌                       | 3919/13570 [6:59:48<53:16:03, 19.87s/it]

 29%|█████████▌                       | 3920/13570 [7:00:07<52:56:47, 19.75s/it]

 29%|█████████▌                       | 3921/13570 [7:00:27<53:02:45, 19.79s/it]

 29%|█████████▌                       | 3922/13570 [7:00:47<53:21:31, 19.91s/it]

 29%|█████████▌                       | 3923/13570 [7:01:07<53:10:58, 19.85s/it]

 29%|█████████▌                       | 3924/13570 [7:01:27<53:07:29, 19.83s/it]

 29%|█████████▌                       | 3925/13570 [7:01:47<53:00:41, 19.79s/it]

 29%|█████████▌                       | 3926/13570 [7:02:06<52:56:48, 19.76s/it]

 29%|█████████▌                       | 3927/13570 [7:02:26<53:02:33, 19.80s/it]

 29%|█████████▌                       | 3928/13570 [7:02:46<52:44:13, 19.69s/it]

 29%|█████████▌                       | 3929/13570 [7:03:05<52:55:00, 19.76s/it]

 29%|█████████▌                       | 3930/13570 [7:03:25<52:55:16, 19.76s/it]

 29%|█████████▌                       | 3931/13570 [7:03:45<52:38:13, 19.66s/it]

 29%|█████████▌                       | 3932/13570 [7:04:04<52:42:55, 19.69s/it]

 29%|█████████▌                       | 3933/13570 [7:04:24<52:46:52, 19.72s/it]

 29%|█████████▌                       | 3934/13570 [7:04:44<52:47:29, 19.72s/it]

 29%|█████████▌                       | 3935/13570 [7:05:04<52:58:13, 19.79s/it]

 29%|█████████▌                       | 3936/13570 [7:05:24<53:04:03, 19.83s/it]

 29%|█████████▌                       | 3937/13570 [7:05:44<53:12:55, 19.89s/it]

 29%|█████████▌                       | 3938/13570 [7:06:04<53:24:58, 19.96s/it]

 29%|█████████▌                       | 3939/13570 [7:06:24<53:12:40, 19.89s/it]

 29%|█████████▌                       | 3940/13570 [7:06:43<52:59:28, 19.81s/it]

 29%|█████████▌                       | 3941/13570 [7:07:03<52:55:28, 19.79s/it]

 29%|█████████▌                       | 3942/13570 [7:07:23<52:58:02, 19.80s/it]

 29%|█████████▌                       | 3943/13570 [7:07:43<52:51:29, 19.77s/it]

 29%|█████████▌                       | 3944/13570 [7:08:02<52:43:28, 19.72s/it]

 29%|█████████▌                       | 3945/13570 [7:08:22<52:36:09, 19.67s/it]

 29%|█████████▌                       | 3946/13570 [7:08:41<52:25:18, 19.61s/it]

 29%|█████████▌                       | 3947/13570 [7:09:01<52:10:36, 19.52s/it]

 29%|█████████▌                       | 3948/13570 [7:09:20<52:24:38, 19.61s/it]

 29%|█████████▌                       | 3949/13570 [7:09:40<52:37:04, 19.69s/it]

 29%|█████████▌                       | 3950/13570 [7:10:00<52:44:34, 19.74s/it]                                                                                {'loss': 1.6795, 'grad_norm': 0.3055601716041565, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.46}
 29%|█████████▌                       | 3950/13570 [7:10:00<52:44:34, 19.74s/it]

 29%|█████████▌                       | 3951/13570 [7:10:20<52:50:12, 19.77s/it]

 29%|█████████▌                       | 3952/13570 [7:10:40<52:54:28, 19.80s/it]

 29%|█████████▌                       | 3953/13570 [7:10:59<52:45:21, 19.75s/it]

 29%|█████████▌                       | 3954/13570 [7:11:19<52:38:44, 19.71s/it]

 29%|█████████▌                       | 3955/13570 [7:11:39<52:38:31, 19.71s/it]

 29%|█████████▌                       | 3956/13570 [7:11:59<52:59:12, 19.84s/it]

 29%|█████████▌                       | 3957/13570 [7:12:19<53:01:04, 19.85s/it]

 29%|█████████▋                       | 3958/13570 [7:12:38<52:42:05, 19.74s/it]

 29%|█████████▋                       | 3959/13570 [7:12:58<52:38:11, 19.72s/it]

 29%|█████████▋                       | 3960/13570 [7:13:18<52:39:23, 19.73s/it]

 29%|█████████▋                       | 3961/13570 [7:13:37<52:38:54, 19.72s/it]

 29%|█████████▋                       | 3962/13570 [7:13:57<52:33:32, 19.69s/it]

 29%|█████████▋                       | 3963/13570 [7:14:17<52:33:36, 19.70s/it]

 29%|█████████▋                       | 3964/13570 [7:14:36<52:36:22, 19.71s/it]

 29%|█████████▋                       | 3965/13570 [7:14:56<52:51:39, 19.81s/it]

 29%|█████████▋                       | 3966/13570 [7:15:16<52:49:39, 19.80s/it]

 29%|█████████▋                       | 3967/13570 [7:15:36<52:41:54, 19.76s/it]

 29%|█████████▋                       | 3968/13570 [7:15:56<52:36:49, 19.73s/it]

 29%|█████████▋                       | 3969/13570 [7:16:15<52:36:05, 19.72s/it]

 29%|█████████▋                       | 3970/13570 [7:16:35<52:21:00, 19.63s/it]

 29%|█████████▋                       | 3971/13570 [7:16:54<52:26:42, 19.67s/it]

 29%|█████████▋                       | 3972/13570 [7:17:14<52:17:15, 19.61s/it]

 29%|█████████▋                       | 3973/13570 [7:17:34<52:34:17, 19.72s/it]

 29%|█████████▋                       | 3974/13570 [7:17:54<52:34:47, 19.73s/it]

 29%|█████████▋                       | 3975/13570 [7:18:13<52:32:24, 19.71s/it]

 29%|█████████▋                       | 3976/13570 [7:18:33<52:32:07, 19.71s/it]

 29%|█████████▋                       | 3977/13570 [7:18:53<52:44:24, 19.79s/it]

 29%|█████████▋                       | 3978/13570 [7:19:13<52:31:42, 19.71s/it]

 29%|█████████▋                       | 3979/13570 [7:19:32<52:32:22, 19.72s/it]

 29%|█████████▋                       | 3980/13570 [7:19:52<52:37:05, 19.75s/it]

 29%|█████████▋                       | 3981/13570 [7:20:12<52:31:48, 19.72s/it]

 29%|█████████▋                       | 3982/13570 [7:20:32<52:53:32, 19.86s/it]

 29%|█████████▋                       | 3983/13570 [7:20:51<52:36:57, 19.76s/it]

 29%|█████████▋                       | 3984/13570 [7:21:11<52:34:47, 19.75s/it]

 29%|█████████▋                       | 3985/13570 [7:21:31<52:52:00, 19.86s/it]

 29%|█████████▋                       | 3986/13570 [7:21:51<53:00:01, 19.91s/it]

 29%|█████████▋                       | 3987/13570 [7:22:11<53:02:16, 19.92s/it]

 29%|█████████▋                       | 3988/13570 [7:22:31<53:00:21, 19.91s/it]

 29%|█████████▋                       | 3989/13570 [7:22:51<52:57:11, 19.90s/it]

 29%|█████████▋                       | 3990/13570 [7:23:11<52:37:08, 19.77s/it]

 29%|█████████▋                       | 3991/13570 [7:23:30<52:28:08, 19.72s/it]

 29%|█████████▋                       | 3992/13570 [7:23:50<52:32:28, 19.75s/it]

 29%|█████████▋                       | 3993/13570 [7:24:10<52:35:50, 19.77s/it]

 29%|█████████▋                       | 3994/13570 [7:24:29<52:24:16, 19.70s/it]

 29%|█████████▋                       | 3995/13570 [7:24:49<52:25:47, 19.71s/it]

 29%|█████████▋                       | 3996/13570 [7:25:09<52:20:14, 19.68s/it]

 29%|█████████▋                       | 3997/13570 [7:25:28<52:15:02, 19.65s/it]

 29%|█████████▋                       | 3998/13570 [7:25:48<52:07:19, 19.60s/it]

 29%|█████████▋                       | 3999/13570 [7:26:07<52:04:07, 19.58s/it]

 29%|█████████▋                       | 4000/13570 [7:26:27<52:29:51, 19.75s/it]                                                                                {'loss': 1.6762, 'grad_norm': 0.34449824690818787, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.47}
 29%|█████████▋                       | 4000/13570 [7:26:27<52:29:51, 19.75s/it][INFO|trainer.py:3512] 2024-04-19 23:33:31,121 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-19 23:33:31,121 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-19 23:33:31,121 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.54s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.19s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.53s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.72s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.85s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.93s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.98s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.04s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.05s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.06s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.07s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.08s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.08s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:36,  3.08s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:33,  3.08s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:30,  3.09s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.09s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.09s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.09s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:04<00:18,  3.09s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:07<00:15,  3.09s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.09s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.09s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.09s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.09s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.11s/it][A                                                                                
                                                                                [A{'eval_loss': 1.801178216934204, 'eval_runtime': 86.8319, 'eval_samples_per_second': 1.267, 'eval_steps_per_second': 0.322, 'epoch': 1.47}
 29%|█████████▋                       | 4000/13570 [7:27:54<52:29:51, 19.75s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.11s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-19 23:34:57,956 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-4000


[INFO|configuration_utils.py:726] 2024-04-19 23:34:58,213 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-19 23:34:58,215 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-19 23:34:58,416 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-4000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 23:34:58,416 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-4000/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-19 23:34:59,281 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-19 23:34:59,281 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-19 23:34:59,285 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-2000] due to args.save_total_limit


 29%|█████████▍                      | 4001/13570 [7:28:15<122:37:07, 46.13s/it]

 29%|█████████▍                      | 4002/13570 [7:28:35<101:22:46, 38.14s/it]

 29%|█████████▋                       | 4003/13570 [7:28:54<86:30:50, 32.55s/it]

 30%|█████████▋                       | 4004/13570 [7:29:14<76:14:30, 28.69s/it]

 30%|█████████▋                       | 4005/13570 [7:29:34<69:05:53, 26.01s/it]

 30%|█████████▋                       | 4006/13570 [7:29:53<64:05:05, 24.12s/it]

 30%|█████████▋                       | 4007/13570 [7:30:13<60:37:27, 22.82s/it]

 30%|█████████▋                       | 4008/13570 [7:30:33<58:02:46, 21.85s/it]

 30%|█████████▋                       | 4009/13570 [7:30:53<56:31:05, 21.28s/it]

 30%|█████████▊                       | 4010/13570 [7:31:12<54:59:32, 20.71s/it]

 30%|█████████▊                       | 4011/13570 [7:31:32<54:21:14, 20.47s/it]

 30%|█████████▊                       | 4012/13570 [7:31:52<53:49:37, 20.27s/it]

 30%|█████████▊                       | 4013/13570 [7:32:11<53:12:11, 20.04s/it]

 30%|█████████▊                       | 4014/13570 [7:32:31<53:04:48, 20.00s/it]

 30%|█████████▊                       | 4015/13570 [7:32:51<52:52:22, 19.92s/it]

 30%|█████████▊                       | 4016/13570 [7:33:11<52:43:05, 19.86s/it]

 30%|█████████▊                       | 4017/13570 [7:33:30<52:29:14, 19.78s/it]

 30%|█████████▊                       | 4018/13570 [7:33:50<52:12:54, 19.68s/it]

 30%|█████████▊                       | 4019/13570 [7:34:09<52:22:59, 19.74s/it]

 30%|█████████▊                       | 4020/13570 [7:34:29<52:17:34, 19.71s/it]

 30%|█████████▊                       | 4021/13570 [7:34:49<52:05:20, 19.64s/it]

 30%|█████████▊                       | 4022/13570 [7:35:09<52:21:34, 19.74s/it]

 30%|█████████▊                       | 4023/13570 [7:35:28<52:27:27, 19.78s/it]

 30%|█████████▊                       | 4024/13570 [7:35:48<52:29:56, 19.80s/it]

 30%|█████████▊                       | 4025/13570 [7:36:08<52:34:28, 19.83s/it]

 30%|█████████▊                       | 4026/13570 [7:36:28<52:30:08, 19.80s/it]

 30%|█████████▊                       | 4027/13570 [7:36:48<52:28:54, 19.80s/it]

 30%|█████████▊                       | 4028/13570 [7:37:08<52:40:04, 19.87s/it]

 30%|█████████▊                       | 4029/13570 [7:37:27<52:32:37, 19.83s/it]

 30%|█████████▊                       | 4030/13570 [7:37:47<52:34:26, 19.84s/it]

 30%|█████████▊                       | 4031/13570 [7:38:07<52:09:49, 19.69s/it]

 30%|█████████▊                       | 4032/13570 [7:38:26<51:56:02, 19.60s/it]

 30%|█████████▊                       | 4033/13570 [7:38:46<52:00:49, 19.63s/it]

 30%|█████████▊                       | 4034/13570 [7:39:05<51:59:26, 19.63s/it]

 30%|█████████▊                       | 4035/13570 [7:39:25<51:59:50, 19.63s/it]

 30%|█████████▊                       | 4036/13570 [7:39:45<52:08:11, 19.69s/it]

 30%|█████████▊                       | 4037/13570 [7:40:05<52:18:01, 19.75s/it]

 30%|█████████▊                       | 4038/13570 [7:40:24<52:07:07, 19.68s/it]

 30%|█████████▊                       | 4039/13570 [7:40:44<51:56:35, 19.62s/it]

 30%|█████████▊                       | 4040/13570 [7:41:03<51:49:45, 19.58s/it]

 30%|█████████▊                       | 4041/13570 [7:41:23<52:15:52, 19.75s/it]

 30%|█████████▊                       | 4042/13570 [7:41:43<52:20:57, 19.78s/it]

 30%|█████████▊                       | 4043/13570 [7:42:03<52:09:40, 19.71s/it]

 30%|█████████▊                       | 4044/13570 [7:42:23<52:14:21, 19.74s/it]

 30%|█████████▊                       | 4045/13570 [7:42:42<52:16:25, 19.76s/it]

 30%|█████████▊                       | 4046/13570 [7:43:02<52:10:52, 19.72s/it]

 30%|█████████▊                       | 4047/13570 [7:43:22<52:18:28, 19.77s/it]

 30%|█████████▊                       | 4048/13570 [7:43:42<52:17:58, 19.77s/it]

 30%|█████████▊                       | 4049/13570 [7:44:01<52:19:27, 19.78s/it]

 30%|█████████▊                       | 4050/13570 [7:44:21<52:27:35, 19.84s/it]                                                                                {'loss': 1.6855, 'grad_norm': 0.36664265394210815, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.49}
 30%|█████████▊                       | 4050/13570 [7:44:21<52:27:35, 19.84s/it]

 30%|█████████▊                       | 4051/13570 [7:44:42<52:43:22, 19.94s/it]

 30%|█████████▊                       | 4052/13570 [7:45:02<52:50:32, 19.99s/it]

 30%|█████████▊                       | 4053/13570 [7:45:22<53:01:00, 20.05s/it]

 30%|█████████▊                       | 4054/13570 [7:45:42<53:02:08, 20.06s/it]

 30%|█████████▊                       | 4055/13570 [7:46:02<52:55:23, 20.02s/it]

 30%|█████████▊                       | 4056/13570 [7:46:22<53:05:25, 20.09s/it]

 30%|█████████▊                       | 4057/13570 [7:46:42<52:50:09, 19.99s/it]

 30%|█████████▊                       | 4058/13570 [7:47:02<52:47:33, 19.98s/it]

 30%|█████████▊                       | 4059/13570 [7:47:22<52:41:41, 19.95s/it]

 30%|█████████▊                       | 4060/13570 [7:47:42<52:38:36, 19.93s/it]

 30%|█████████▉                       | 4061/13570 [7:48:02<52:45:50, 19.98s/it]

 30%|█████████▉                       | 4062/13570 [7:48:22<52:46:15, 19.98s/it]

 30%|█████████▉                       | 4063/13570 [7:48:41<52:33:19, 19.90s/it]

 30%|█████████▉                       | 4064/13570 [7:49:01<52:33:52, 19.91s/it]

 30%|█████████▉                       | 4065/13570 [7:49:21<52:12:53, 19.78s/it]

 30%|█████████▉                       | 4066/13570 [7:49:41<52:23:34, 19.85s/it]

 30%|█████████▉                       | 4067/13570 [7:50:00<52:07:14, 19.74s/it]

 30%|█████████▉                       | 4068/13570 [7:50:21<52:25:40, 19.86s/it]

 30%|█████████▉                       | 4069/13570 [7:50:40<52:15:39, 19.80s/it]

 30%|█████████▉                       | 4070/13570 [7:51:00<52:08:03, 19.76s/it]

 30%|█████████▉                       | 4071/13570 [7:51:20<52:17:52, 19.82s/it]

 30%|█████████▉                       | 4072/13570 [7:51:40<52:16:20, 19.81s/it]

 30%|█████████▉                       | 4073/13570 [7:51:59<52:20:47, 19.84s/it]

 30%|█████████▉                       | 4074/13570 [7:52:19<52:06:22, 19.75s/it]

 30%|█████████▉                       | 4075/13570 [7:52:39<52:06:57, 19.76s/it]

 30%|█████████▉                       | 4076/13570 [7:52:59<52:14:39, 19.81s/it]

 30%|█████████▉                       | 4077/13570 [7:53:19<52:25:03, 19.88s/it]

 30%|█████████▉                       | 4078/13570 [7:53:39<52:25:16, 19.88s/it]

 30%|█████████▉                       | 4079/13570 [7:53:58<52:17:49, 19.84s/it]

 30%|█████████▉                       | 4080/13570 [7:54:18<52:11:11, 19.80s/it]

 30%|█████████▉                       | 4081/13570 [7:54:38<52:05:49, 19.76s/it]

 30%|█████████▉                       | 4082/13570 [7:54:57<52:02:37, 19.75s/it]

 30%|█████████▉                       | 4083/13570 [7:55:18<52:14:43, 19.83s/it]

 30%|█████████▉                       | 4084/13570 [7:55:37<52:16:55, 19.84s/it]

 30%|█████████▉                       | 4085/13570 [7:55:57<52:29:43, 19.92s/it]

 30%|█████████▉                       | 4086/13570 [7:56:18<52:34:00, 19.95s/it]

 30%|█████████▉                       | 4087/13570 [7:56:37<52:30:14, 19.93s/it]

 30%|█████████▉                       | 4088/13570 [7:56:57<52:30:18, 19.93s/it]

 30%|█████████▉                       | 4089/13570 [7:57:17<52:23:11, 19.89s/it]

 30%|█████████▉                       | 4090/13570 [7:57:37<52:18:10, 19.86s/it]

 30%|█████████▉                       | 4091/13570 [7:57:57<52:10:12, 19.81s/it]

 30%|█████████▉                       | 4092/13570 [7:58:16<51:49:43, 19.69s/it]

 30%|█████████▉                       | 4093/13570 [7:58:36<51:51:13, 19.70s/it]

 30%|█████████▉                       | 4094/13570 [7:58:56<52:06:31, 19.80s/it]

 30%|█████████▉                       | 4095/13570 [7:59:15<51:57:58, 19.74s/it]

 30%|█████████▉                       | 4096/13570 [7:59:35<52:01:16, 19.77s/it]

 30%|█████████▉                       | 4097/13570 [7:59:55<52:08:30, 19.82s/it]

 30%|█████████▉                       | 4098/13570 [8:00:15<52:25:10, 19.92s/it]

 30%|█████████▉                       | 4099/13570 [8:00:35<52:15:20, 19.86s/it]

 30%|█████████▉                       | 4100/13570 [8:00:55<51:57:18, 19.75s/it]                                                                                {'loss': 1.6635, 'grad_norm': 0.4350913166999817, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.51}
 30%|█████████▉                       | 4100/13570 [8:00:55<51:57:18, 19.75s/it][INFO|trainer.py:3512] 2024-04-20 00:07:58,264 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-20 00:07:58,264 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-20 00:07:58,264 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.56s/it][A


 11%|████▋                                       | 3/28 [00:06<00:55,  2.21s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:01,  2.55s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:03,  2.75s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:03,  2.87s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.95s/it][A


 29%|████████████▌                               | 8/28 [00:21<01:00,  3.01s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.04s/it][A


 36%|███████████████▎                           | 10/28 [00:28<00:55,  3.07s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.09s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.10s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.11s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.11s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.11s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.11s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.11s/it][A


 64%|███████████████████████████▋               | 18/28 [00:53<00:31,  3.11s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:56<00:28,  3.11s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.11s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.12s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.12s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.11s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.11s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.11s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.11s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:21<00:03,  3.11s/it][A


100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.14s/it][A                                                                                
                                                                                [A{'eval_loss': 1.79993736743927, 'eval_runtime': 87.6196, 'eval_samples_per_second': 1.255, 'eval_steps_per_second': 0.32, 'epoch': 1.51}
 30%|█████████▉                       | 4100/13570 [8:02:22<51:57:18, 19.75s/it]
100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.14s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-20 00:09:25,886 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-4100


[INFO|configuration_utils.py:726] 2024-04-20 00:09:26,272 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-20 00:09:26,273 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-20 00:09:26,469 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-4100/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 00:09:26,469 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-4100/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-20 00:09:26,810 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 00:09:26,811 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-20 00:09:26,815 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-2100] due to args.save_total_limit


 30%|█████████▋                      | 4101/13570 [8:02:43<121:53:25, 46.34s/it]

 30%|█████████▋                      | 4102/13570 [8:03:02<100:45:00, 38.31s/it]

 30%|█████████▉                       | 4103/13570 [8:03:22<85:53:16, 32.66s/it]

 30%|█████████▉                       | 4104/13570 [8:03:42<75:57:50, 28.89s/it]

 30%|█████████▉                       | 4105/13570 [8:04:01<68:25:42, 26.03s/it]

 30%|█████████▉                       | 4106/13570 [8:04:21<63:43:31, 24.24s/it]

 30%|█████████▉                       | 4107/13570 [8:04:41<60:20:41, 22.96s/it]

 30%|█████████▉                       | 4108/13570 [8:05:01<57:42:48, 21.96s/it]

 30%|█████████▉                       | 4109/13570 [8:05:21<56:11:08, 21.38s/it]

 30%|█████████▉                       | 4110/13570 [8:05:41<54:46:48, 20.85s/it]

 30%|█████████▉                       | 4111/13570 [8:06:01<54:21:56, 20.69s/it]

 30%|█████████▉                       | 4112/13570 [8:06:21<53:41:52, 20.44s/it]

 30%|██████████                       | 4113/13570 [8:06:40<53:00:22, 20.18s/it]

 30%|██████████                       | 4114/13570 [8:07:00<52:40:24, 20.05s/it]

 30%|██████████                       | 4115/13570 [8:07:20<52:20:15, 19.93s/it]

 30%|██████████                       | 4116/13570 [8:07:39<52:07:18, 19.85s/it]

 30%|██████████                       | 4117/13570 [8:07:59<52:10:42, 19.87s/it]

 30%|██████████                       | 4118/13570 [8:08:19<51:52:08, 19.76s/it]

 30%|██████████                       | 4119/13570 [8:08:39<52:13:30, 19.89s/it]

 30%|██████████                       | 4120/13570 [8:08:59<52:23:54, 19.96s/it]

 30%|██████████                       | 4121/13570 [8:09:19<52:22:09, 19.95s/it]

 30%|██████████                       | 4122/13570 [8:09:39<52:07:28, 19.86s/it]

 30%|██████████                       | 4123/13570 [8:09:59<52:05:25, 19.85s/it]

 30%|██████████                       | 4124/13570 [8:10:18<52:04:30, 19.85s/it]

 30%|██████████                       | 4125/13570 [8:10:38<52:05:05, 19.85s/it]

 30%|██████████                       | 4126/13570 [8:10:58<51:49:29, 19.76s/it]

 30%|██████████                       | 4127/13570 [8:11:17<51:41:41, 19.71s/it]

 30%|██████████                       | 4128/13570 [8:11:37<51:40:14, 19.70s/it]

 30%|██████████                       | 4129/13570 [8:11:57<51:43:45, 19.73s/it]

 30%|██████████                       | 4130/13570 [8:12:17<52:00:47, 19.84s/it]

 30%|██████████                       | 4131/13570 [8:12:37<51:55:21, 19.80s/it]

 30%|██████████                       | 4132/13570 [8:12:57<52:05:11, 19.87s/it]

 30%|██████████                       | 4133/13570 [8:13:17<52:11:03, 19.91s/it]

 30%|██████████                       | 4134/13570 [8:13:37<52:07:38, 19.89s/it]

 30%|██████████                       | 4135/13570 [8:13:56<51:49:43, 19.78s/it]

 30%|██████████                       | 4136/13570 [8:14:16<52:04:33, 19.87s/it]

 30%|██████████                       | 4137/13570 [8:14:36<51:48:13, 19.77s/it]

 30%|██████████                       | 4138/13570 [8:14:56<52:02:09, 19.86s/it]

 31%|██████████                       | 4139/13570 [8:15:16<51:54:25, 19.81s/it]

 31%|██████████                       | 4140/13570 [8:15:35<51:40:34, 19.73s/it]

 31%|██████████                       | 4141/13570 [8:15:55<52:07:31, 19.90s/it]

 31%|██████████                       | 4142/13570 [8:16:15<52:10:00, 19.92s/it]

 31%|██████████                       | 4143/13570 [8:16:35<52:10:26, 19.92s/it]

 31%|██████████                       | 4144/13570 [8:16:55<52:07:35, 19.91s/it]

 31%|██████████                       | 4145/13570 [8:17:15<51:48:47, 19.79s/it]

 31%|██████████                       | 4146/13570 [8:17:34<51:45:28, 19.77s/it]

 31%|██████████                       | 4147/13570 [8:17:54<51:50:33, 19.81s/it]

 31%|██████████                       | 4148/13570 [8:18:14<51:47:30, 19.79s/it]

 31%|██████████                       | 4149/13570 [8:18:34<51:35:42, 19.72s/it]

 31%|██████████                       | 4150/13570 [8:18:53<51:27:34, 19.67s/it]                                                                                {'loss': 1.6635, 'grad_norm': 0.43372225761413574, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.53}
 31%|██████████                       | 4150/13570 [8:18:53<51:27:34, 19.67s/it]

 31%|██████████                       | 4151/13570 [8:19:13<51:30:29, 19.69s/it]

 31%|██████████                       | 4152/13570 [8:19:33<51:44:36, 19.78s/it]

 31%|██████████                       | 4153/13570 [8:19:53<51:47:06, 19.80s/it]

 31%|██████████                       | 4154/13570 [8:20:13<51:47:41, 19.80s/it]

 31%|██████████                       | 4155/13570 [8:20:32<51:56:29, 19.86s/it]

 31%|██████████                       | 4156/13570 [8:20:52<51:47:22, 19.80s/it]

 31%|██████████                       | 4157/13570 [8:21:12<51:46:18, 19.80s/it]

 31%|██████████                       | 4158/13570 [8:21:31<51:32:39, 19.72s/it]

 31%|██████████                       | 4159/13570 [8:21:51<51:17:59, 19.62s/it]

 31%|██████████                       | 4160/13570 [8:22:10<51:12:44, 19.59s/it]

 31%|██████████                       | 4161/13570 [8:22:30<51:29:26, 19.70s/it]

 31%|██████████                       | 4162/13570 [8:22:50<51:36:01, 19.75s/it]

 31%|██████████                       | 4163/13570 [8:23:10<51:37:24, 19.76s/it]

 31%|██████████▏                      | 4164/13570 [8:23:29<51:21:46, 19.66s/it]

 31%|██████████▏                      | 4165/13570 [8:23:49<51:19:04, 19.64s/it]

 31%|██████████▏                      | 4166/13570 [8:24:09<51:27:30, 19.70s/it]

 31%|██████████▏                      | 4167/13570 [8:24:29<51:30:48, 19.72s/it]

 31%|██████████▏                      | 4168/13570 [8:24:48<51:18:21, 19.64s/it]

 31%|██████████▏                      | 4169/13570 [8:25:07<51:02:52, 19.55s/it]

 31%|██████████▏                      | 4170/13570 [8:25:27<51:12:01, 19.61s/it]

 31%|██████████▏                      | 4171/13570 [8:25:47<51:16:25, 19.64s/it]

 31%|██████████▏                      | 4172/13570 [8:26:07<51:23:23, 19.69s/it]

 31%|██████████▏                      | 4173/13570 [8:26:26<51:29:29, 19.73s/it]

 31%|██████████▏                      | 4174/13570 [8:26:46<51:18:10, 19.66s/it]

 31%|██████████▏                      | 4175/13570 [8:27:06<51:20:26, 19.67s/it]

 31%|██████████▏                      | 4176/13570 [8:27:25<51:24:27, 19.70s/it]

 31%|██████████▏                      | 4177/13570 [8:27:45<51:18:07, 19.66s/it]

 31%|██████████▏                      | 4178/13570 [8:28:05<51:23:09, 19.70s/it]

 31%|██████████▏                      | 4179/13570 [8:28:25<51:23:08, 19.70s/it]

 31%|██████████▏                      | 4180/13570 [8:28:44<51:27:24, 19.73s/it]

 31%|██████████▏                      | 4181/13570 [8:29:04<51:31:36, 19.76s/it]

 31%|██████████▏                      | 4182/13570 [8:29:24<51:26:41, 19.73s/it]

 31%|██████████▏                      | 4183/13570 [8:29:44<51:33:00, 19.77s/it]

 31%|██████████▏                      | 4184/13570 [8:30:04<51:39:29, 19.81s/it]

 31%|██████████▏                      | 4185/13570 [8:30:24<51:54:34, 19.91s/it]

 31%|██████████▏                      | 4186/13570 [8:30:43<51:26:34, 19.74s/it]

 31%|██████████▏                      | 4187/13570 [8:31:03<51:15:26, 19.67s/it]

 31%|██████████▏                      | 4188/13570 [8:31:22<51:02:37, 19.59s/it]

 31%|██████████▏                      | 4189/13570 [8:31:42<51:08:06, 19.62s/it]

 31%|██████████▏                      | 4190/13570 [8:32:01<50:53:04, 19.53s/it]

 31%|██████████▏                      | 4191/13570 [8:32:21<50:55:25, 19.55s/it]

 31%|██████████▏                      | 4192/13570 [8:32:40<50:48:48, 19.51s/it]

 31%|██████████▏                      | 4193/13570 [8:33:00<51:00:29, 19.58s/it]

 31%|██████████▏                      | 4194/13570 [8:33:19<51:02:03, 19.60s/it]

 31%|██████████▏                      | 4195/13570 [8:33:39<50:52:13, 19.53s/it]

 31%|██████████▏                      | 4196/13570 [8:33:58<50:47:29, 19.51s/it]

 31%|██████████▏                      | 4197/13570 [8:34:18<50:55:22, 19.56s/it]

 31%|██████████▏                      | 4198/13570 [8:34:38<51:00:19, 19.59s/it]

 31%|██████████▏                      | 4199/13570 [8:34:57<51:00:33, 19.60s/it]

 31%|██████████▏                      | 4200/13570 [8:35:17<51:08:41, 19.65s/it]                                                                                {'loss': 1.6976, 'grad_norm': 0.32453060150146484, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.55}
 31%|██████████▏                      | 4200/13570 [8:35:17<51:08:41, 19.65s/it][INFO|trainer.py:3512] 2024-04-20 00:42:20,665 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-20 00:42:20,665 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-20 00:42:20,665 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.54s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.18s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.52s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.72s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.84s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.92s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.97s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.03s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.05s/it][A


 43%|██████████████████▍                        | 12/28 [00:33<00:48,  3.06s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.07s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.07s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.08s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.08s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:33,  3.09s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:30,  3.09s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.09s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.09s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.09s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:04<00:18,  3.09s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:07<00:15,  3.09s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.09s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.09s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.10s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.09s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A                                                                                
                                                                                [A{'eval_loss': 1.7980643510818481, 'eval_runtime': 86.8403, 'eval_samples_per_second': 1.267, 'eval_steps_per_second': 0.322, 'epoch': 1.55}
 31%|██████████▏                      | 4200/13570 [8:36:44<51:08:41, 19.65s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-20 00:43:47,508 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-4200


[INFO|configuration_utils.py:726] 2024-04-20 00:43:47,770 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-20 00:43:47,772 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-20 00:43:47,972 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-4200/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 00:43:47,973 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-4200/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-20 00:43:48,311 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 00:43:48,312 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json


[INFO|trainer.py:3295] 2024-04-20 00:43:48,315 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-2200] due to args.save_total_limit


 31%|█████████▉                      | 4201/13570 [8:37:05<119:52:06, 46.06s/it]

 31%|██████████▏                      | 4202/13570 [8:37:25<99:36:39, 38.28s/it]

 31%|██████████▏                      | 4203/13570 [8:37:44<84:52:41, 32.62s/it]

 31%|██████████▏                      | 4204/13570 [8:38:04<74:36:45, 28.68s/it]

 31%|██████████▏                      | 4205/13570 [8:38:23<67:41:12, 26.02s/it]

 31%|██████████▏                      | 4206/13570 [8:38:43<62:45:15, 24.13s/it]

 31%|██████████▏                      | 4207/13570 [8:39:03<59:05:53, 22.72s/it]

 31%|██████████▏                      | 4208/13570 [8:39:23<56:58:00, 21.91s/it]

 31%|██████████▏                      | 4209/13570 [8:39:42<54:59:56, 21.15s/it]

 31%|██████████▏                      | 4210/13570 [8:40:02<53:47:14, 20.69s/it]

 31%|██████████▏                      | 4211/13570 [8:40:21<52:49:25, 20.32s/it]

 31%|██████████▏                      | 4212/13570 [8:40:41<52:21:39, 20.14s/it]

 31%|██████████▏                      | 4213/13570 [8:41:01<52:05:03, 20.04s/it]

 31%|██████████▏                      | 4214/13570 [8:41:20<51:48:11, 19.93s/it]

 31%|██████████▎                      | 4215/13570 [8:41:40<51:33:52, 19.84s/it]

 31%|██████████▎                      | 4216/13570 [8:42:00<51:30:14, 19.82s/it]

 31%|██████████▎                      | 4217/13570 [8:42:19<51:21:06, 19.77s/it]

 31%|██████████▎                      | 4218/13570 [8:42:39<51:24:55, 19.79s/it]

 31%|██████████▎                      | 4219/13570 [8:42:59<51:24:20, 19.79s/it]

 31%|██████████▎                      | 4220/13570 [8:43:18<51:11:52, 19.71s/it]

 31%|██████████▎                      | 4221/13570 [8:43:38<50:49:39, 19.57s/it]

 31%|██████████▎                      | 4222/13570 [8:43:57<50:54:32, 19.61s/it]

 31%|██████████▎                      | 4223/13570 [8:44:17<50:55:15, 19.61s/it]

 31%|██████████▎                      | 4224/13570 [8:44:37<51:05:01, 19.68s/it]

 31%|██████████▎                      | 4225/13570 [8:44:57<51:21:27, 19.78s/it]

 31%|██████████▎                      | 4226/13570 [8:45:17<51:22:27, 19.79s/it]

 31%|██████████▎                      | 4227/13570 [8:45:36<51:21:05, 19.79s/it]

 31%|██████████▎                      | 4228/13570 [8:45:56<51:06:48, 19.70s/it]

 31%|██████████▎                      | 4229/13570 [8:46:16<51:01:36, 19.67s/it]

 31%|██████████▎                      | 4230/13570 [8:46:35<51:11:23, 19.73s/it]

 31%|██████████▎                      | 4231/13570 [8:46:55<51:08:00, 19.71s/it]

 31%|██████████▎                      | 4232/13570 [8:47:15<51:16:00, 19.76s/it]

 31%|██████████▎                      | 4233/13570 [8:47:35<51:22:00, 19.81s/it]

 31%|██████████▎                      | 4234/13570 [8:47:55<51:28:08, 19.85s/it]

 31%|██████████▎                      | 4235/13570 [8:48:14<51:18:17, 19.79s/it]

 31%|██████████▎                      | 4236/13570 [8:48:34<51:24:12, 19.83s/it]

 31%|██████████▎                      | 4237/13570 [8:48:54<51:21:57, 19.81s/it]

 31%|██████████▎                      | 4238/13570 [8:49:14<51:17:26, 19.79s/it]

 31%|██████████▎                      | 4239/13570 [8:49:34<51:16:30, 19.78s/it]

 31%|██████████▎                      | 4240/13570 [8:49:54<51:21:59, 19.82s/it]

 31%|██████████▎                      | 4241/13570 [8:50:13<51:24:40, 19.84s/it]

 31%|██████████▎                      | 4242/13570 [8:50:33<51:23:53, 19.84s/it]

 31%|██████████▎                      | 4243/13570 [8:50:53<51:28:07, 19.87s/it]

 31%|██████████▎                      | 4244/13570 [8:51:13<51:21:40, 19.83s/it]

 31%|██████████▎                      | 4245/13570 [8:51:33<51:38:45, 19.94s/it]

 31%|██████████▎                      | 4246/13570 [8:51:53<51:30:54, 19.89s/it]

 31%|██████████▎                      | 4247/13570 [8:52:13<51:16:21, 19.80s/it]

 31%|██████████▎                      | 4248/13570 [8:52:32<51:14:56, 19.79s/it]

 31%|██████████▎                      | 4249/13570 [8:52:52<51:29:55, 19.89s/it]

 31%|██████████▎                      | 4250/13570 [8:53:12<51:33:11, 19.91s/it]                                                                                {'loss': 1.6739, 'grad_norm': 0.3846580386161804, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.57}
 31%|██████████▎                      | 4250/13570 [8:53:12<51:33:11, 19.91s/it]

 31%|██████████▎                      | 4251/13570 [8:53:32<51:19:03, 19.82s/it]

 31%|██████████▎                      | 4252/13570 [8:53:52<51:27:03, 19.88s/it]

 31%|██████████▎                      | 4253/13570 [8:54:12<51:23:31, 19.86s/it]

 31%|██████████▎                      | 4254/13570 [8:54:32<51:37:32, 19.95s/it]

 31%|██████████▎                      | 4255/13570 [8:54:52<51:37:19, 19.95s/it]

 31%|██████████▎                      | 4256/13570 [8:55:11<51:16:29, 19.82s/it]

 31%|██████████▎                      | 4257/13570 [8:55:31<51:19:58, 19.84s/it]

 31%|██████████▎                      | 4258/13570 [8:55:51<51:26:01, 19.88s/it]

 31%|██████████▎                      | 4259/13570 [8:56:11<51:26:35, 19.89s/it]

 31%|██████████▎                      | 4260/13570 [8:56:31<51:23:22, 19.87s/it]

 31%|██████████▎                      | 4261/13570 [8:56:51<51:28:26, 19.91s/it]

 31%|██████████▎                      | 4262/13570 [8:57:11<51:38:51, 19.98s/it]

 31%|██████████▎                      | 4263/13570 [8:57:31<51:52:29, 20.07s/it]

 31%|██████████▎                      | 4264/13570 [8:57:51<51:40:03, 19.99s/it]

 31%|██████████▎                      | 4265/13570 [8:58:11<51:44:32, 20.02s/it]

 31%|██████████▎                      | 4266/13570 [8:58:31<51:44:36, 20.02s/it]

 31%|██████████▍                      | 4267/13570 [8:58:51<51:35:10, 19.96s/it]

 31%|██████████▍                      | 4268/13570 [8:59:11<51:35:50, 19.97s/it]

 31%|██████████▍                      | 4269/13570 [8:59:31<51:30:30, 19.94s/it]

 31%|██████████▍                      | 4270/13570 [8:59:51<51:38:13, 19.99s/it]

 31%|██████████▍                      | 4271/13570 [9:00:11<51:48:57, 20.06s/it]

 31%|██████████▍                      | 4272/13570 [9:00:31<51:34:42, 19.97s/it]

 31%|██████████▍                      | 4273/13570 [9:00:51<51:21:08, 19.88s/it]

 31%|██████████▍                      | 4274/13570 [9:01:10<51:08:18, 19.80s/it]

 32%|██████████▍                      | 4275/13570 [9:01:30<50:58:59, 19.75s/it]

 32%|██████████▍                      | 4276/13570 [9:01:50<50:45:47, 19.66s/it]

 32%|██████████▍                      | 4277/13570 [9:02:10<51:04:10, 19.78s/it]

 32%|██████████▍                      | 4278/13570 [9:02:29<50:59:23, 19.76s/it]

 32%|██████████▍                      | 4279/13570 [9:02:49<50:58:26, 19.75s/it]

 32%|██████████▍                      | 4280/13570 [9:03:09<51:16:00, 19.87s/it]

 32%|██████████▍                      | 4281/13570 [9:03:29<51:31:21, 19.97s/it]

 32%|██████████▍                      | 4282/13570 [9:03:50<51:39:03, 20.02s/it]

 32%|██████████▍                      | 4283/13570 [9:04:09<51:25:05, 19.93s/it]

 32%|██████████▍                      | 4284/13570 [9:04:30<51:40:55, 20.04s/it]

 32%|██████████▍                      | 4285/13570 [9:04:49<51:23:52, 19.93s/it]

 32%|██████████▍                      | 4286/13570 [9:05:09<51:15:04, 19.87s/it]

 32%|██████████▍                      | 4287/13570 [9:05:29<51:07:23, 19.83s/it]

 32%|██████████▍                      | 4288/13570 [9:05:48<51:03:05, 19.80s/it]

 32%|██████████▍                      | 4289/13570 [9:06:08<50:59:50, 19.78s/it]

 32%|██████████▍                      | 4290/13570 [9:06:28<51:12:16, 19.86s/it]

 32%|██████████▍                      | 4291/13570 [9:06:48<51:07:59, 19.84s/it]

 32%|██████████▍                      | 4292/13570 [9:07:08<51:04:48, 19.82s/it]

 32%|██████████▍                      | 4293/13570 [9:07:28<51:09:44, 19.85s/it]

 32%|██████████▍                      | 4294/13570 [9:07:48<51:18:13, 19.91s/it]

 32%|██████████▍                      | 4295/13570 [9:08:07<51:10:02, 19.86s/it]

 32%|██████████▍                      | 4296/13570 [9:08:27<51:07:58, 19.85s/it]

 32%|██████████▍                      | 4297/13570 [9:08:47<50:53:06, 19.75s/it]

 32%|██████████▍                      | 4298/13570 [9:09:06<50:46:02, 19.71s/it]

 32%|██████████▍                      | 4299/13570 [9:09:26<50:37:24, 19.66s/it]

 32%|██████████▍                      | 4300/13570 [9:09:46<50:53:32, 19.76s/it]                                                                                {'loss': 1.6749, 'grad_norm': 0.3468688130378723, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.58}
 32%|██████████▍                      | 4300/13570 [9:09:46<50:53:32, 19.76s/it]

[INFO|trainer.py:3512] 2024-04-20 01:16:49,724 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-20 01:16:49,724 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-20 01:16:49,724 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.54s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.19s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.52s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.72s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.84s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.92s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.98s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.04s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.05s/it][A


 43%|██████████████████▍                        | 12/28 [00:33<00:49,  3.06s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.07s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.08s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.08s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.08s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:33,  3.09s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:30,  3.09s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.09s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.09s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.09s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:04<00:18,  3.09s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:07<00:15,  3.09s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.09s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.09s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.09s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.09s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A                                                                                
                                                                                [A{'eval_loss': 1.7970198392868042, 'eval_runtime': 86.8901, 'eval_samples_per_second': 1.266, 'eval_steps_per_second': 0.322, 'epoch': 1.58}
 32%|██████████▍                      | 4300/13570 [9:11:13<50:53:32, 19.76s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-20 01:18:16,617 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-4300


[INFO|configuration_utils.py:726] 2024-04-20 01:18:16,925 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-20 01:18:16,926 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-20 01:18:17,124 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-4300/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 01:18:17,124 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-4300/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-20 01:18:17,472 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 01:18:17,473 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-20 01:18:17,477 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-2300] due to args.save_total_limit


 32%|██████████▏                     | 4301/13570 [9:11:34<118:44:20, 46.12s/it]

 32%|██████████▍                      | 4302/13570 [9:11:53<98:24:25, 38.22s/it]

 32%|██████████▍                      | 4303/13570 [9:12:13<84:09:10, 32.69s/it]

 32%|██████████▍                      | 4304/13570 [9:12:33<74:01:45, 28.76s/it]

 32%|██████████▍                      | 4305/13570 [9:12:52<66:57:16, 26.02s/it]

 32%|██████████▍                      | 4306/13570 [9:13:12<61:57:50, 24.08s/it]

 32%|██████████▍                      | 4307/13570 [9:13:31<58:26:11, 22.71s/it]

 32%|██████████▍                      | 4308/13570 [9:13:51<56:11:08, 21.84s/it]

 32%|██████████▍                      | 4309/13570 [9:14:11<54:39:38, 21.25s/it]

 32%|██████████▍                      | 4310/13570 [9:14:31<53:24:36, 20.76s/it]

 32%|██████████▍                      | 4311/13570 [9:14:51<52:56:32, 20.58s/it]

 32%|██████████▍                      | 4312/13570 [9:15:11<52:27:18, 20.40s/it]

 32%|██████████▍                      | 4313/13570 [9:15:31<51:56:10, 20.20s/it]

 32%|██████████▍                      | 4314/13570 [9:15:51<51:48:39, 20.15s/it]

 32%|██████████▍                      | 4315/13570 [9:16:10<51:33:18, 20.05s/it]

 32%|██████████▍                      | 4316/13570 [9:16:30<51:05:55, 19.88s/it]

 32%|██████████▍                      | 4317/13570 [9:16:50<51:17:09, 19.95s/it]

 32%|██████████▌                      | 4318/13570 [9:17:10<51:10:03, 19.91s/it]

 32%|██████████▌                      | 4319/13570 [9:17:30<51:04:01, 19.87s/it]

 32%|██████████▌                      | 4320/13570 [9:17:50<51:15:10, 19.95s/it]

 32%|██████████▌                      | 4321/13570 [9:18:10<51:13:54, 19.94s/it]

 32%|██████████▌                      | 4322/13570 [9:18:29<50:47:54, 19.77s/it]

 32%|██████████▌                      | 4323/13570 [9:18:49<50:44:55, 19.76s/it]

 32%|██████████▌                      | 4324/13570 [9:19:09<50:57:15, 19.84s/it]

 32%|██████████▌                      | 4325/13570 [9:19:29<51:10:58, 19.93s/it]

 32%|██████████▌                      | 4326/13570 [9:19:49<51:17:39, 19.98s/it]

 32%|██████████▌                      | 4327/13570 [9:20:09<50:59:53, 19.86s/it]

 32%|██████████▌                      | 4328/13570 [9:20:28<50:40:11, 19.74s/it]

 32%|██████████▌                      | 4329/13570 [9:20:48<50:31:28, 19.68s/it]

 32%|██████████▌                      | 4330/13570 [9:21:08<50:39:14, 19.74s/it]

 32%|██████████▌                      | 4331/13570 [9:21:27<50:38:40, 19.73s/it]

 32%|██████████▌                      | 4332/13570 [9:21:47<50:34:13, 19.71s/it]

 32%|██████████▌                      | 4333/13570 [9:22:07<50:41:58, 19.76s/it]

 32%|██████████▌                      | 4334/13570 [9:22:27<50:41:57, 19.76s/it]

 32%|██████████▌                      | 4335/13570 [9:22:46<50:48:06, 19.80s/it]

 32%|██████████▌                      | 4336/13570 [9:23:07<51:03:27, 19.91s/it]

 32%|██████████▌                      | 4337/13570 [9:23:27<51:14:01, 19.98s/it]

 32%|██████████▌                      | 4338/13570 [9:23:47<51:26:29, 20.06s/it]

 32%|██████████▌                      | 4339/13570 [9:24:07<51:09:38, 19.95s/it]

 32%|██████████▌                      | 4340/13570 [9:24:27<51:17:32, 20.01s/it]

 32%|██████████▌                      | 4341/13570 [9:24:47<51:10:52, 19.96s/it]

 32%|██████████▌                      | 4342/13570 [9:25:06<51:00:49, 19.90s/it]

 32%|██████████▌                      | 4343/13570 [9:25:26<50:54:20, 19.86s/it]

 32%|██████████▌                      | 4344/13570 [9:25:46<50:30:11, 19.71s/it]

 32%|██████████▌                      | 4345/13570 [9:26:05<50:20:38, 19.65s/it]

 32%|██████████▌                      | 4346/13570 [9:26:25<50:14:35, 19.61s/it]

 32%|██████████▌                      | 4347/13570 [9:26:44<50:25:33, 19.68s/it]

 32%|██████████▌                      | 4348/13570 [9:27:04<50:35:48, 19.75s/it]

 32%|██████████▌                      | 4349/13570 [9:27:24<50:25:40, 19.69s/it]

 32%|██████████▌                      | 4350/13570 [9:27:44<50:32:26, 19.73s/it]                                                                                {'loss': 1.6794, 'grad_norm': 0.3626391291618347, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.6}
 32%|██████████▌                      | 4350/13570 [9:27:44<50:32:26, 19.73s/it]

 32%|██████████▌                      | 4351/13570 [9:28:04<50:45:52, 19.82s/it]

 32%|██████████▌                      | 4352/13570 [9:28:23<50:31:42, 19.73s/it]

 32%|██████████▌                      | 4353/13570 [9:28:43<50:27:05, 19.71s/it]

 32%|██████████▌                      | 4354/13570 [9:29:03<50:34:03, 19.75s/it]

 32%|██████████▌                      | 4355/13570 [9:29:22<50:25:43, 19.70s/it]

 32%|██████████▌                      | 4356/13570 [9:29:42<50:40:44, 19.80s/it]

 32%|██████████▌                      | 4357/13570 [9:30:02<50:30:15, 19.73s/it]

 32%|██████████▌                      | 4358/13570 [9:30:22<50:46:47, 19.84s/it]

 32%|██████████▌                      | 4359/13570 [9:30:42<51:03:57, 19.96s/it]

 32%|██████████▌                      | 4360/13570 [9:31:02<50:48:37, 19.86s/it]

 32%|██████████▌                      | 4361/13570 [9:31:22<50:39:30, 19.80s/it]

 32%|██████████▌                      | 4362/13570 [9:31:41<50:21:52, 19.69s/it]

 32%|██████████▌                      | 4363/13570 [9:32:01<50:11:27, 19.63s/it]

 32%|██████████▌                      | 4364/13570 [9:32:20<50:21:33, 19.69s/it]

 32%|██████████▌                      | 4365/13570 [9:32:40<50:17:15, 19.67s/it]

 32%|██████████▌                      | 4366/13570 [9:33:00<50:24:08, 19.71s/it]

 32%|██████████▌                      | 4367/13570 [9:33:19<50:21:58, 19.70s/it]

 32%|██████████▌                      | 4368/13570 [9:33:40<50:40:05, 19.82s/it]

 32%|██████████▌                      | 4369/13570 [9:33:59<50:24:54, 19.73s/it]

 32%|██████████▋                      | 4370/13570 [9:34:19<50:36:34, 19.80s/it]

 32%|██████████▋                      | 4371/13570 [9:34:39<50:35:45, 19.80s/it]

 32%|██████████▋                      | 4372/13570 [9:34:58<50:20:16, 19.70s/it]

 32%|██████████▋                      | 4373/13570 [9:35:18<50:16:27, 19.68s/it]

 32%|██████████▋                      | 4374/13570 [9:35:38<50:22:51, 19.72s/it]

 32%|██████████▋                      | 4375/13570 [9:35:57<50:12:17, 19.66s/it]

 32%|██████████▋                      | 4376/13570 [9:36:17<50:16:55, 19.69s/it]

 32%|██████████▋                      | 4377/13570 [9:36:37<50:22:14, 19.73s/it]

 32%|██████████▋                      | 4378/13570 [9:36:57<50:22:57, 19.73s/it]

 32%|██████████▋                      | 4379/13570 [9:37:16<50:18:34, 19.71s/it]

 32%|██████████▋                      | 4380/13570 [9:37:36<50:30:05, 19.78s/it]

 32%|██████████▋                      | 4381/13570 [9:37:56<50:16:50, 19.70s/it]

 32%|██████████▋                      | 4382/13570 [9:38:16<50:29:41, 19.78s/it]

 32%|██████████▋                      | 4383/13570 [9:38:35<50:24:52, 19.76s/it]

 32%|██████████▋                      | 4384/13570 [9:38:55<50:29:15, 19.79s/it]

 32%|██████████▋                      | 4385/13570 [9:39:15<50:11:37, 19.67s/it]

 32%|██████████▋                      | 4386/13570 [9:39:34<50:10:50, 19.67s/it]

 32%|██████████▋                      | 4387/13570 [9:39:54<50:27:15, 19.78s/it]

 32%|██████████▋                      | 4388/13570 [9:40:14<50:30:34, 19.80s/it]

 32%|██████████▋                      | 4389/13570 [9:40:34<50:34:38, 19.83s/it]

 32%|██████████▋                      | 4390/13570 [9:40:54<50:30:51, 19.81s/it]

 32%|██████████▋                      | 4391/13570 [9:41:14<50:26:35, 19.78s/it]

 32%|██████████▋                      | 4392/13570 [9:41:33<50:29:25, 19.80s/it]

 32%|██████████▋                      | 4393/13570 [9:41:53<50:33:45, 19.83s/it]

 32%|██████████▋                      | 4394/13570 [9:42:13<50:17:49, 19.73s/it]

 32%|██████████▋                      | 4395/13570 [9:42:33<50:13:19, 19.71s/it]

 32%|██████████▋                      | 4396/13570 [9:42:52<50:12:43, 19.70s/it]

 32%|██████████▋                      | 4397/13570 [9:43:12<50:11:59, 19.70s/it]

 32%|██████████▋                      | 4398/13570 [9:43:32<50:08:38, 19.68s/it]

 32%|██████████▋                      | 4399/13570 [9:43:51<50:17:41, 19.74s/it]

 32%|██████████▋                      | 4400/13570 [9:44:11<50:07:02, 19.68s/it]                                                                                {'loss': 1.6663, 'grad_norm': 0.444447785615921, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.62}
 32%|██████████▋                      | 4400/13570 [9:44:11<50:07:02, 19.68s/it][INFO|trainer.py:3512] 2024-04-20 01:51:14,680 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-20 01:51:14,680 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-20 01:51:14,680 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.55s/it][A




 11%|████▋                                       | 3/28 [00:06<00:54,  2.20s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.53s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.73s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.86s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.94s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.99s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.03s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.05s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.07s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.08s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.09s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.10s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.10s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.11s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.11s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.11s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:28,  3.11s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.12s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.12s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.12s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.12s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.12s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.12s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.12s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.12s/it][A


100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.14s/it][A                                                                                
                                                                                [A{'eval_loss': 1.7959309816360474, 'eval_runtime': 87.4751, 'eval_samples_per_second': 1.258, 'eval_steps_per_second': 0.32, 'epoch': 1.62}
 32%|██████████▋                      | 4400/13570 [9:45:38<50:07:02, 19.68s/it]
100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.14s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-20 01:52:42,158 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-4400


[INFO|configuration_utils.py:726] 2024-04-20 01:52:42,420 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-20 01:52:42,422 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-20 01:52:42,617 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-4400/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 01:52:42,617 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-4400/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-20 01:52:42,965 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 01:52:42,966 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-20 01:52:42,969 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-2400] due to args.save_total_limit


 32%|██████████▍                     | 4401/13570 [9:45:59<117:32:04, 46.15s/it]

 32%|██████████▋                      | 4402/13570 [9:46:19<97:16:43, 38.20s/it]

 32%|██████████▋                      | 4403/13570 [9:46:38<83:21:10, 32.73s/it]

 32%|██████████▋                      | 4404/13570 [9:46:58<73:24:49, 28.83s/it]

 32%|██████████▋                      | 4405/13570 [9:47:18<66:42:46, 26.20s/it]

 32%|██████████▋                      | 4406/13570 [9:47:38<61:48:29, 24.28s/it]

 32%|██████████▋                      | 4407/13570 [9:47:58<58:19:05, 22.91s/it]

 32%|██████████▋                      | 4408/13570 [9:48:18<55:54:20, 21.97s/it]

 32%|██████████▋                      | 4409/13570 [9:48:37<54:14:57, 21.32s/it]

 32%|██████████▋                      | 4410/13570 [9:48:57<53:16:46, 20.94s/it]

 33%|██████████▋                      | 4411/13570 [9:49:17<52:30:34, 20.64s/it]

 33%|██████████▋                      | 4412/13570 [9:49:37<51:34:59, 20.28s/it]

 33%|██████████▋                      | 4413/13570 [9:49:57<51:22:03, 20.19s/it]

 33%|██████████▋                      | 4414/13570 [9:50:17<51:18:55, 20.18s/it]

 33%|██████████▋                      | 4415/13570 [9:50:37<50:54:43, 20.02s/it]

 33%|██████████▋                      | 4416/13570 [9:50:56<50:47:15, 19.97s/it]

 33%|██████████▋                      | 4417/13570 [9:51:16<50:43:37, 19.95s/it]

 33%|██████████▋                      | 4418/13570 [9:51:36<50:40:26, 19.93s/it]

 33%|██████████▋                      | 4419/13570 [9:51:56<50:52:10, 20.01s/it]

 33%|██████████▋                      | 4420/13570 [9:52:16<50:39:01, 19.93s/it]

 33%|██████████▊                      | 4421/13570 [9:52:36<50:23:06, 19.83s/it]

 33%|██████████▊                      | 4422/13570 [9:52:56<50:31:04, 19.88s/it]

 33%|██████████▊                      | 4423/13570 [9:53:16<50:35:18, 19.91s/it]

 33%|██████████▊                      | 4424/13570 [9:53:36<50:32:31, 19.89s/it]

 33%|██████████▊                      | 4425/13570 [9:53:55<50:26:49, 19.86s/it]

 33%|██████████▊                      | 4426/13570 [9:54:15<50:17:35, 19.80s/it]

 33%|██████████▊                      | 4427/13570 [9:54:35<50:19:57, 19.82s/it]

 33%|██████████▊                      | 4428/13570 [9:54:55<50:23:37, 19.84s/it]

 33%|██████████▊                      | 4429/13570 [9:55:15<50:39:33, 19.95s/it]

 33%|██████████▊                      | 4430/13570 [9:55:35<50:38:05, 19.94s/it]

 33%|██████████▊                      | 4431/13570 [9:55:55<50:36:30, 19.94s/it]

 33%|██████████▊                      | 4432/13570 [9:56:15<50:34:19, 19.92s/it]

 33%|██████████▊                      | 4433/13570 [9:56:35<50:27:10, 19.88s/it]

 33%|██████████▊                      | 4434/13570 [9:56:54<50:12:00, 19.78s/it]

 33%|██████████▊                      | 4435/13570 [9:57:14<50:05:56, 19.74s/it]

 33%|██████████▊                      | 4436/13570 [9:57:34<50:11:50, 19.78s/it]

 33%|██████████▊                      | 4437/13570 [9:57:54<50:28:26, 19.90s/it]

 33%|██████████▊                      | 4438/13570 [9:58:14<50:29:28, 19.90s/it]

 33%|██████████▊                      | 4439/13570 [9:58:34<50:38:30, 19.97s/it]

 33%|██████████▊                      | 4440/13570 [9:58:54<50:47:39, 20.03s/it]

 33%|██████████▊                      | 4441/13570 [9:59:14<50:52:30, 20.06s/it]

 33%|██████████▊                      | 4442/13570 [9:59:34<50:42:59, 20.00s/it]

 33%|██████████▊                      | 4443/13570 [9:59:54<50:51:30, 20.06s/it]

 33%|██████████▍                     | 4444/13570 [10:00:14<50:37:13, 19.97s/it]

 33%|██████████▍                     | 4445/13570 [10:00:33<50:18:20, 19.85s/it]

 33%|██████████▍                     | 4446/13570 [10:00:53<50:16:03, 19.83s/it]

 33%|██████████▍                     | 4447/13570 [10:01:13<50:13:34, 19.82s/it]

 33%|██████████▍                     | 4448/13570 [10:01:33<49:59:20, 19.73s/it]

 33%|██████████▍                     | 4449/13570 [10:01:52<49:58:15, 19.72s/it]

 33%|██████████▍                     | 4450/13570 [10:02:12<49:58:35, 19.73s/it]                                                                                {'loss': 1.6592, 'grad_norm': 0.36630871891975403, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.64}
 33%|██████████▍                     | 4450/13570 [10:02:12<49:58:35, 19.73s/it]

 33%|██████████▍                     | 4451/13570 [10:02:32<50:05:27, 19.77s/it]

 33%|██████████▍                     | 4452/13570 [10:02:52<50:04:18, 19.77s/it]

 33%|██████████▌                     | 4453/13570 [10:03:12<50:11:55, 19.82s/it]

 33%|██████████▌                     | 4454/13570 [10:03:32<50:19:02, 19.87s/it]

 33%|██████████▌                     | 4455/13570 [10:03:51<50:04:43, 19.78s/it]

 33%|██████████▌                     | 4456/13570 [10:04:11<50:18:20, 19.87s/it]

 33%|██████████▌                     | 4457/13570 [10:04:31<50:19:32, 19.88s/it]

 33%|██████████▌                     | 4458/13570 [10:04:51<50:21:09, 19.89s/it]

 33%|██████████▌                     | 4459/13570 [10:05:11<50:20:14, 19.89s/it]

 33%|██████████▌                     | 4460/13570 [10:05:31<50:22:07, 19.90s/it]

 33%|██████████▌                     | 4461/13570 [10:05:51<50:17:20, 19.87s/it]

 33%|██████████▌                     | 4462/13570 [10:06:11<50:18:25, 19.88s/it]

 33%|██████████▌                     | 4463/13570 [10:06:31<50:21:30, 19.91s/it]

 33%|██████████▌                     | 4464/13570 [10:06:50<50:09:22, 19.83s/it]

 33%|██████████▌                     | 4465/13570 [10:07:10<50:10:25, 19.84s/it]

 33%|██████████▌                     | 4466/13570 [10:07:30<50:10:42, 19.84s/it]

 33%|██████████▌                     | 4467/13570 [10:07:49<49:56:32, 19.75s/it]

 33%|██████████▌                     | 4468/13570 [10:08:09<49:46:46, 19.69s/it]

 33%|██████████▌                     | 4469/13570 [10:08:29<49:59:35, 19.78s/it]

 33%|██████████▌                     | 4470/13570 [10:08:49<49:49:53, 19.71s/it]

 33%|██████████▌                     | 4471/13570 [10:09:09<50:06:26, 19.82s/it]

 33%|██████████▌                     | 4472/13570 [10:09:29<50:11:49, 19.86s/it]

 33%|██████████▌                     | 4473/13570 [10:09:48<49:57:16, 19.77s/it]

 33%|██████████▌                     | 4474/13570 [10:10:08<49:54:58, 19.76s/it]

 33%|██████████▌                     | 4475/13570 [10:10:28<50:01:16, 19.80s/it]

 33%|██████████▌                     | 4476/13570 [10:10:48<49:59:58, 19.79s/it]

 33%|██████████▌                     | 4477/13570 [10:11:07<49:55:04, 19.76s/it]

 33%|██████████▌                     | 4478/13570 [10:11:27<49:58:43, 19.79s/it]

 33%|██████████▌                     | 4479/13570 [10:11:47<50:00:55, 19.81s/it]

 33%|██████████▌                     | 4480/13570 [10:12:07<49:50:36, 19.74s/it]

 33%|██████████▌                     | 4481/13570 [10:12:26<49:42:32, 19.69s/it]

 33%|██████████▌                     | 4482/13570 [10:12:46<49:33:40, 19.63s/it]

 33%|██████████▌                     | 4483/13570 [10:13:05<49:28:40, 19.60s/it]

 33%|██████████▌                     | 4484/13570 [10:13:25<49:36:11, 19.65s/it]

 33%|██████████▌                     | 4485/13570 [10:13:45<49:39:23, 19.68s/it]

 33%|██████████▌                     | 4486/13570 [10:14:05<49:51:38, 19.76s/it]

 33%|██████████▌                     | 4487/13570 [10:14:24<49:39:15, 19.68s/it]

 33%|██████████▌                     | 4488/13570 [10:14:44<50:02:45, 19.84s/it]

 33%|██████████▌                     | 4489/13570 [10:15:04<50:06:36, 19.87s/it]

 33%|██████████▌                     | 4490/13570 [10:15:24<50:11:24, 19.90s/it]

 33%|██████████▌                     | 4491/13570 [10:15:44<49:58:33, 19.82s/it]

 33%|██████████▌                     | 4492/13570 [10:16:03<49:48:36, 19.75s/it]

 33%|██████████▌                     | 4493/13570 [10:16:23<49:52:58, 19.78s/it]

 33%|██████████▌                     | 4494/13570 [10:16:43<49:32:48, 19.65s/it]

 33%|██████████▌                     | 4495/13570 [10:17:03<49:43:43, 19.73s/it]

 33%|██████████▌                     | 4496/13570 [10:17:23<49:59:38, 19.83s/it]

 33%|██████████▌                     | 4497/13570 [10:17:43<50:04:59, 19.87s/it]

 33%|██████████▌                     | 4498/13570 [10:18:02<49:59:37, 19.84s/it]

 33%|██████████▌                     | 4499/13570 [10:18:22<49:51:27, 19.79s/it]

 33%|██████████▌                     | 4500/13570 [10:18:42<49:47:26, 19.76s/it]                                                                                {'loss': 1.6933, 'grad_norm': 0.38507190346717834, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.66}
 33%|██████████▌                     | 4500/13570 [10:18:42<49:47:26, 19.76s/it][INFO|trainer.py:3512] 2024-04-20 02:25:45,450 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-20 02:25:45,450 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-20 02:25:45,450 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.54s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.19s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.52s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.72s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.84s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.92s/it][A




 29%|████████████▌                               | 8/28 [00:21<00:59,  2.97s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.03s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.05s/it][A


 43%|██████████████████▍                        | 12/28 [00:33<00:49,  3.07s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.08s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.08s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.09s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.09s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:33,  3.09s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:30,  3.09s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.09s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.09s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.09s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:04<00:18,  3.09s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.09s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.09s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.09s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.10s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.10s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A                                                                                
                                                                                [A{'eval_loss': 1.7950669527053833, 'eval_runtime': 86.9391, 'eval_samples_per_second': 1.265, 'eval_steps_per_second': 0.322, 'epoch': 1.66}
 33%|██████████▌                     | 4500/13570 [10:20:09<49:47:26, 19.76s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.12s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-20 02:27:12,392 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-4500


[INFO|configuration_utils.py:726] 2024-04-20 02:27:12,897 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-20 02:27:12,898 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-20 02:27:13,099 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-4500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 02:27:13,100 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-4500/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-20 02:27:13,442 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 02:27:13,443 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-20 02:27:13,446 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-2500] due to args.save_total_limit


 33%|██████████▎                    | 4501/13570 [10:20:30<116:25:38, 46.22s/it]

 33%|██████████▌                     | 4502/13570 [10:20:49<96:14:51, 38.21s/it]

 33%|██████████▌                     | 4503/13570 [10:21:09<82:10:54, 32.63s/it]

 33%|██████████▌                     | 4504/13570 [10:21:29<72:36:42, 28.83s/it]

 33%|██████████▌                     | 4505/13570 [10:21:48<65:42:43, 26.10s/it]

 33%|██████████▋                     | 4506/13570 [10:22:08<60:39:07, 24.09s/it]

 33%|██████████▋                     | 4507/13570 [10:22:28<57:21:49, 22.79s/it]

 33%|██████████▋                     | 4508/13570 [10:22:48<55:13:50, 21.94s/it]

 33%|██████████▋                     | 4509/13570 [10:23:07<53:24:48, 21.22s/it]

 33%|██████████▋                     | 4510/13570 [10:23:27<52:18:56, 20.79s/it]

 33%|██████████▋                     | 4511/13570 [10:23:47<51:43:32, 20.56s/it]

 33%|██████████▋                     | 4512/13570 [10:24:07<51:06:03, 20.31s/it]

 33%|██████████▋                     | 4513/13570 [10:24:26<50:27:11, 20.05s/it]

 33%|██████████▋                     | 4514/13570 [10:24:46<49:58:18, 19.87s/it]

 33%|██████████▋                     | 4515/13570 [10:25:05<49:59:42, 19.88s/it]

 33%|██████████▋                     | 4516/13570 [10:25:25<49:52:36, 19.83s/it]

 33%|██████████▋                     | 4517/13570 [10:25:45<49:42:46, 19.77s/it]

 33%|██████████▋                     | 4518/13570 [10:26:05<49:45:41, 19.79s/it]

 33%|██████████▋                     | 4519/13570 [10:26:24<49:43:22, 19.78s/it]

 33%|██████████▋                     | 4520/13570 [10:26:45<50:01:19, 19.90s/it]

 33%|██████████▋                     | 4521/13570 [10:27:04<49:36:13, 19.73s/it]

 33%|██████████▋                     | 4522/13570 [10:27:23<49:28:27, 19.68s/it]

 33%|██████████▋                     | 4523/13570 [10:27:43<49:20:36, 19.63s/it]

 33%|██████████▋                     | 4524/13570 [10:28:03<49:15:16, 19.60s/it]

 33%|██████████▋                     | 4525/13570 [10:28:22<49:26:26, 19.68s/it]

 33%|██████████▋                     | 4526/13570 [10:28:42<49:18:47, 19.63s/it]

 33%|██████████▋                     | 4527/13570 [10:29:02<49:37:23, 19.75s/it]

 33%|██████████▋                     | 4528/13570 [10:29:22<49:39:37, 19.77s/it]

 33%|██████████▋                     | 4529/13570 [10:29:42<49:50:10, 19.84s/it]

 33%|██████████▋                     | 4530/13570 [10:30:02<49:47:04, 19.83s/it]

 33%|██████████▋                     | 4531/13570 [10:30:21<49:21:44, 19.66s/it]

 33%|██████████▋                     | 4532/13570 [10:30:40<49:18:47, 19.64s/it]

 33%|██████████▋                     | 4533/13570 [10:31:00<49:16:02, 19.63s/it]

 33%|██████████▋                     | 4534/13570 [10:31:20<49:16:13, 19.63s/it]

 33%|██████████▋                     | 4535/13570 [10:31:40<49:27:02, 19.70s/it]

 33%|██████████▋                     | 4536/13570 [10:31:59<49:26:35, 19.70s/it]

 33%|██████████▋                     | 4537/13570 [10:32:19<49:18:12, 19.65s/it]

 33%|██████████▋                     | 4538/13570 [10:32:38<49:03:02, 19.55s/it]

 33%|██████████▋                     | 4539/13570 [10:32:58<49:07:42, 19.58s/it]

 33%|██████████▋                     | 4540/13570 [10:33:18<49:16:36, 19.65s/it]

 33%|██████████▋                     | 4541/13570 [10:33:37<49:28:58, 19.73s/it]

 33%|██████████▋                     | 4542/13570 [10:33:57<49:23:49, 19.70s/it]

 33%|██████████▋                     | 4543/13570 [10:34:17<49:24:25, 19.70s/it]

 33%|██████████▋                     | 4544/13570 [10:34:37<49:30:31, 19.75s/it]

 33%|██████████▋                     | 4545/13570 [10:34:56<49:33:19, 19.77s/it]

 34%|██████████▋                     | 4546/13570 [10:35:16<49:20:13, 19.68s/it]

 34%|██████████▋                     | 4547/13570 [10:35:35<49:11:37, 19.63s/it]

 34%|██████████▋                     | 4548/13570 [10:35:55<49:24:55, 19.72s/it]

 34%|██████████▋                     | 4549/13570 [10:36:15<49:10:20, 19.62s/it]

 34%|██████████▋                     | 4550/13570 [10:36:35<49:21:42, 19.70s/it]                                                                                {'loss': 1.6605, 'grad_norm': 0.36175772547721863, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.68}
 34%|██████████▋                     | 4550/13570 [10:36:35<49:21:42, 19.70s/it]

 34%|██████████▋                     | 4551/13570 [10:36:54<49:22:06, 19.71s/it]

 34%|██████████▋                     | 4552/13570 [10:37:14<49:13:26, 19.65s/it]

 34%|██████████▋                     | 4553/13570 [10:37:33<49:09:11, 19.62s/it]

 34%|██████████▋                     | 4554/13570 [10:37:53<49:11:15, 19.64s/it]

 34%|██████████▋                     | 4555/13570 [10:38:13<49:31:17, 19.78s/it]

 34%|██████████▋                     | 4556/13570 [10:38:33<49:26:26, 19.75s/it]

 34%|██████████▋                     | 4557/13570 [10:38:52<49:12:23, 19.65s/it]

 34%|██████████▋                     | 4558/13570 [10:39:12<49:07:35, 19.62s/it]

 34%|██████████▊                     | 4559/13570 [10:39:32<49:22:29, 19.73s/it]

 34%|██████████▊                     | 4560/13570 [10:39:51<49:13:28, 19.67s/it]

 34%|██████████▊                     | 4561/13570 [10:40:11<49:00:43, 19.59s/it]

 34%|██████████▊                     | 4562/13570 [10:40:30<48:55:01, 19.55s/it]

 34%|██████████▊                     | 4563/13570 [10:40:50<48:46:49, 19.50s/it]

 34%|██████████▊                     | 4564/13570 [10:41:09<48:46:20, 19.50s/it]

 34%|██████████▊                     | 4565/13570 [10:41:29<48:57:10, 19.57s/it]

 34%|██████████▊                     | 4566/13570 [10:41:49<49:01:04, 19.60s/it]

 34%|██████████▊                     | 4567/13570 [10:42:08<49:14:16, 19.69s/it]

 34%|██████████▊                     | 4568/13570 [10:42:28<49:12:07, 19.68s/it]

 34%|██████████▊                     | 4569/13570 [10:42:48<49:02:46, 19.62s/it]

 34%|██████████▊                     | 4570/13570 [10:43:07<49:05:09, 19.63s/it]

 34%|██████████▊                     | 4571/13570 [10:43:27<49:05:57, 19.64s/it]

 34%|██████████▊                     | 4572/13570 [10:43:46<48:47:59, 19.52s/it]

 34%|██████████▊                     | 4573/13570 [10:44:06<48:58:08, 19.59s/it]

 34%|██████████▊                     | 4574/13570 [10:44:26<49:28:44, 19.80s/it]

 34%|██████████▊                     | 4575/13570 [10:44:46<49:11:41, 19.69s/it]

 34%|██████████▊                     | 4576/13570 [10:45:05<49:08:30, 19.67s/it]

 34%|██████████▊                     | 4577/13570 [10:45:25<49:32:39, 19.83s/it]

 34%|██████████▊                     | 4578/13570 [10:45:45<49:22:17, 19.77s/it]

 34%|██████████▊                     | 4579/13570 [10:46:05<49:30:52, 19.83s/it]

 34%|██████████▊                     | 4580/13570 [10:46:25<49:36:01, 19.86s/it]

 34%|██████████▊                     | 4581/13570 [10:46:45<49:23:27, 19.78s/it]

 34%|██████████▊                     | 4582/13570 [10:47:05<49:41:45, 19.90s/it]

 34%|██████████▊                     | 4583/13570 [10:47:25<49:44:50, 19.93s/it]

 34%|██████████▊                     | 4584/13570 [10:47:45<49:41:37, 19.91s/it]

 34%|██████████▊                     | 4585/13570 [10:48:04<49:26:00, 19.81s/it]

 34%|██████████▊                     | 4586/13570 [10:48:24<49:25:16, 19.80s/it]

 34%|██████████▊                     | 4587/13570 [10:48:44<49:40:38, 19.91s/it]

 34%|██████████▊                     | 4588/13570 [10:49:04<49:56:50, 20.02s/it]

 34%|██████████▊                     | 4589/13570 [10:49:24<49:56:15, 20.02s/it]

 34%|██████████▊                     | 4590/13570 [10:49:44<49:35:05, 19.88s/it]

 34%|██████████▊                     | 4591/13570 [10:50:04<49:25:13, 19.81s/it]

 34%|██████████▊                     | 4592/13570 [10:50:23<49:07:37, 19.70s/it]

 34%|██████████▊                     | 4593/13570 [10:50:43<49:13:38, 19.74s/it]

 34%|██████████▊                     | 4594/13570 [10:51:03<49:23:07, 19.81s/it]

 34%|██████████▊                     | 4595/13570 [10:51:23<49:33:50, 19.88s/it]

 34%|██████████▊                     | 4596/13570 [10:51:43<49:23:01, 19.81s/it]

 34%|██████████▊                     | 4597/13570 [10:52:03<49:40:51, 19.93s/it]

 34%|██████████▊                     | 4598/13570 [10:52:23<49:51:20, 20.00s/it]

 34%|██████████▊                     | 4599/13570 [10:52:43<49:41:54, 19.94s/it]

 34%|██████████▊                     | 4600/13570 [10:53:03<49:54:21, 20.03s/it]                                                                                {'loss': 1.6643, 'grad_norm': 0.3474549651145935, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.69}
 34%|██████████▊                     | 4600/13570 [10:53:03<49:54:21, 20.03s/it][INFO|trainer.py:3512] 2024-04-20 03:00:06,710 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-20 03:00:06,710 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-20 03:00:06,710 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.56s/it][A


 11%|████▋                                       | 3/28 [00:06<00:55,  2.20s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.54s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.74s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.86s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.94s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  3.00s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.03s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.05s/it][A


 39%|████████████████▉                          | 11/28 [00:31<00:52,  3.07s/it][A


 43%|██████████████████▍                        | 12/28 [00:34<00:49,  3.08s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.09s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.09s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.10s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.10s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.11s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.11s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.11s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:59<00:24,  3.11s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:02<00:21,  3.11s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:05<00:18,  3.11s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.11s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.11s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.11s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.11s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.11s/it][A


100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.13s/it][A                                                                                
                                                                                [A{'eval_loss': 1.794425368309021, 'eval_runtime': 87.3843, 'eval_samples_per_second': 1.259, 'eval_steps_per_second': 0.32, 'epoch': 1.69}
 34%|██████████▊                     | 4600/13570 [10:54:30<49:54:21, 20.03s/it]
100%|███████████████████████████████████████████| 28/28 [01:24<00:00,  3.13s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-20 03:01:34,097 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-4600


[INFO|configuration_utils.py:726] 2024-04-20 03:01:34,352 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-20 03:01:34,354 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-20 03:01:34,549 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-4600/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 03:01:34,550 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-4600/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-20 03:01:34,897 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 03:01:34,897 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-20 03:01:34,902 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-2600] due to args.save_total_limit


 34%|██████████▌                    | 4601/13570 [10:54:51<115:40:38, 46.43s/it]

 34%|██████████▊                     | 4602/13570 [10:55:11<95:42:44, 38.42s/it]

 34%|██████████▊                     | 4603/13570 [10:55:31<81:58:33, 32.91s/it]

 34%|██████████▊                     | 4604/13570 [10:55:50<72:03:06, 28.93s/it]

 34%|██████████▊                     | 4605/13570 [10:56:10<65:21:48, 26.25s/it]

 34%|██████████▊                     | 4606/13570 [10:56:30<60:34:19, 24.33s/it]

 34%|██████████▊                     | 4607/13570 [10:56:50<57:08:39, 22.95s/it]

 34%|██████████▊                     | 4608/13570 [10:57:09<54:29:58, 21.89s/it]

 34%|██████████▊                     | 4609/13570 [10:57:29<52:37:49, 21.14s/it]

 34%|██████████▊                     | 4610/13570 [10:57:49<51:32:29, 20.71s/it]

 34%|██████████▊                     | 4611/13570 [10:58:08<50:43:08, 20.38s/it]

 34%|██████████▉                     | 4612/13570 [10:58:28<50:01:12, 20.10s/it]

 34%|██████████▉                     | 4613/13570 [10:58:47<49:43:43, 19.99s/it]

 34%|██████████▉                     | 4614/13570 [10:59:07<49:37:11, 19.95s/it]

 34%|██████████▉                     | 4615/13570 [10:59:27<49:40:15, 19.97s/it]

 34%|██████████▉                     | 4616/13570 [10:59:47<49:37:37, 19.95s/it]

 34%|██████████▉                     | 4617/13570 [11:00:07<49:38:01, 19.96s/it]

 34%|██████████▉                     | 4618/13570 [11:00:27<49:44:30, 20.00s/it]

 34%|██████████▉                     | 4619/13570 [11:00:47<49:47:01, 20.02s/it]

 34%|██████████▉                     | 4620/13570 [11:01:07<49:38:40, 19.97s/it]

 34%|██████████▉                     | 4621/13570 [11:01:27<49:39:36, 19.98s/it]

 34%|██████████▉                     | 4622/13570 [11:01:47<49:46:24, 20.03s/it]

 34%|██████████▉                     | 4623/13570 [11:02:07<49:50:19, 20.05s/it]

 34%|██████████▉                     | 4624/13570 [11:02:27<49:44:14, 20.02s/it]

 34%|██████████▉                     | 4625/13570 [11:02:47<49:16:23, 19.83s/it]

 34%|██████████▉                     | 4626/13570 [11:03:06<49:09:51, 19.79s/it]

 34%|██████████▉                     | 4627/13570 [11:03:27<49:29:48, 19.92s/it]

 34%|██████████▉                     | 4628/13570 [11:03:46<49:26:41, 19.91s/it]

 34%|██████████▉                     | 4629/13570 [11:04:06<49:07:00, 19.78s/it]

 34%|██████████▉                     | 4630/13570 [11:04:26<48:57:30, 19.71s/it]

 34%|██████████▉                     | 4631/13570 [11:04:46<49:15:48, 19.84s/it]

 34%|██████████▉                     | 4632/13570 [11:05:05<49:04:12, 19.76s/it]

 34%|██████████▉                     | 4633/13570 [11:05:25<49:09:11, 19.80s/it]

 34%|██████████▉                     | 4634/13570 [11:05:45<49:05:20, 19.78s/it]

 34%|██████████▉                     | 4635/13570 [11:06:04<48:50:39, 19.68s/it]

 34%|██████████▉                     | 4636/13570 [11:06:24<48:40:31, 19.61s/it]

 34%|██████████▉                     | 4637/13570 [11:06:43<48:37:18, 19.59s/it]

 34%|██████████▉                     | 4638/13570 [11:07:03<48:35:46, 19.59s/it]

 34%|██████████▉                     | 4639/13570 [11:07:22<48:27:51, 19.54s/it]

 34%|██████████▉                     | 4640/13570 [11:07:42<48:43:38, 19.64s/it]

 34%|██████████▉                     | 4641/13570 [11:08:02<48:48:53, 19.68s/it]

 34%|██████████▉                     | 4642/13570 [11:08:22<48:56:13, 19.73s/it]

 34%|██████████▉                     | 4643/13570 [11:08:42<49:14:12, 19.86s/it]

 34%|██████████▉                     | 4644/13570 [11:09:02<49:10:58, 19.84s/it]

 34%|██████████▉                     | 4645/13570 [11:09:22<49:14:48, 19.86s/it]

 34%|██████████▉                     | 4646/13570 [11:09:41<49:07:34, 19.82s/it]

 34%|██████████▉                     | 4647/13570 [11:10:01<49:06:42, 19.81s/it]

 34%|██████████▉                     | 4648/13570 [11:10:21<49:03:19, 19.79s/it]

 34%|██████████▉                     | 4649/13570 [11:10:41<48:58:29, 19.76s/it]

 34%|██████████▉                     | 4650/13570 [11:11:00<48:50:31, 19.71s/it]                                                                                {'loss': 1.677, 'grad_norm': 0.29836419224739075, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.71}
 34%|██████████▉                     | 4650/13570 [11:11:00<48:50:31, 19.71s/it]

 34%|██████████▉                     | 4651/13570 [11:11:20<49:03:06, 19.80s/it]

 34%|██████████▉                     | 4652/13570 [11:11:40<48:52:46, 19.73s/it]

 34%|██████████▉                     | 4653/13570 [11:12:00<48:52:10, 19.73s/it]

 34%|██████████▉                     | 4654/13570 [11:12:19<49:01:56, 19.80s/it]

 34%|██████████▉                     | 4655/13570 [11:12:40<49:14:40, 19.89s/it]

 34%|██████████▉                     | 4656/13570 [11:12:59<49:06:50, 19.84s/it]

 34%|██████████▉                     | 4657/13570 [11:13:19<48:44:44, 19.69s/it]

 34%|██████████▉                     | 4658/13570 [11:13:38<48:46:06, 19.70s/it]

 34%|██████████▉                     | 4659/13570 [11:13:58<48:54:31, 19.76s/it]

 34%|██████████▉                     | 4660/13570 [11:14:18<48:57:00, 19.78s/it]

 34%|██████████▉                     | 4661/13570 [11:14:38<48:48:13, 19.72s/it]

 34%|██████████▉                     | 4662/13570 [11:14:57<48:32:56, 19.62s/it]

 34%|██████████▉                     | 4663/13570 [11:15:17<48:38:25, 19.66s/it]

 34%|██████████▉                     | 4664/13570 [11:15:36<48:31:34, 19.62s/it]

 34%|███████████                     | 4665/13570 [11:15:56<48:36:20, 19.65s/it]

 34%|███████████                     | 4666/13570 [11:16:16<48:41:31, 19.69s/it]

 34%|███████████                     | 4667/13570 [11:16:35<48:41:33, 19.69s/it]

 34%|███████████                     | 4668/13570 [11:16:55<48:47:50, 19.73s/it]

 34%|███████████                     | 4669/13570 [11:17:15<48:53:55, 19.78s/it]

 34%|███████████                     | 4670/13570 [11:17:35<48:47:46, 19.74s/it]

 34%|███████████                     | 4671/13570 [11:17:55<48:56:44, 19.80s/it]

 34%|███████████                     | 4672/13570 [11:18:14<48:46:33, 19.73s/it]

 34%|███████████                     | 4673/13570 [11:18:34<48:40:00, 19.69s/it]

 34%|███████████                     | 4674/13570 [11:18:54<48:39:45, 19.69s/it]

 34%|███████████                     | 4675/13570 [11:19:13<48:24:44, 19.59s/it]

 34%|███████████                     | 4676/13570 [11:19:33<48:20:02, 19.56s/it]

 34%|███████████                     | 4677/13570 [11:19:53<48:40:10, 19.70s/it]

 34%|███████████                     | 4678/13570 [11:20:12<48:40:53, 19.71s/it]

 34%|███████████                     | 4679/13570 [11:20:32<48:37:38, 19.69s/it]

 34%|███████████                     | 4680/13570 [11:20:52<48:35:22, 19.68s/it]

 34%|███████████                     | 4681/13570 [11:21:12<48:47:39, 19.76s/it]

 35%|███████████                     | 4682/13570 [11:21:31<48:41:34, 19.72s/it]

 35%|███████████                     | 4683/13570 [11:21:51<48:44:55, 19.75s/it]

 35%|███████████                     | 4684/13570 [11:22:11<48:37:58, 19.70s/it]

 35%|███████████                     | 4685/13570 [11:22:30<48:36:39, 19.70s/it]

 35%|███████████                     | 4686/13570 [11:22:50<48:31:32, 19.66s/it]

 35%|███████████                     | 4687/13570 [11:23:09<48:26:01, 19.63s/it]

 35%|███████████                     | 4688/13570 [11:23:29<48:22:51, 19.61s/it]

 35%|███████████                     | 4689/13570 [11:23:49<48:26:36, 19.64s/it]

 35%|███████████                     | 4690/13570 [11:24:08<48:21:02, 19.60s/it]

 35%|███████████                     | 4691/13570 [11:24:28<48:17:25, 19.58s/it]

 35%|███████████                     | 4692/13570 [11:24:47<48:26:24, 19.64s/it]

 35%|███████████                     | 4693/13570 [11:25:07<48:22:46, 19.62s/it]

 35%|███████████                     | 4694/13570 [11:25:27<48:27:05, 19.65s/it]

 35%|███████████                     | 4695/13570 [11:25:47<48:41:38, 19.75s/it]

 35%|███████████                     | 4696/13570 [11:26:07<48:56:17, 19.85s/it]

 35%|███████████                     | 4697/13570 [11:26:27<48:57:25, 19.86s/it]

 35%|███████████                     | 4698/13570 [11:26:47<49:03:04, 19.90s/it]

 35%|███████████                     | 4699/13570 [11:27:06<48:54:19, 19.85s/it]

 35%|███████████                     | 4700/13570 [11:27:26<48:43:51, 19.78s/it]                                                                                {'loss': 1.6825, 'grad_norm': 0.308776319026947, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.73}
 35%|███████████                     | 4700/13570 [11:27:26<48:43:51, 19.78s/it][INFO|trainer.py:3512] 2024-04-20 03:34:29,803 >> ***** Running Evaluation *****
[INFO|trainer.py:3514] 2024-04-20 03:34:29,803 >>   Num examples = 110
[INFO|trainer.py:3517] 2024-04-20 03:34:29,803 >>   Batch size = 2



  0%|                                                    | 0/28 [00:00<?, ?it/s][A


  7%|███▏                                        | 2/28 [00:03<00:40,  1.54s/it][A


 11%|████▋                                       | 3/28 [00:06<00:54,  2.18s/it][A


 14%|██████▎                                     | 4/28 [00:09<01:00,  2.52s/it][A


 18%|███████▊                                    | 5/28 [00:12<01:02,  2.71s/it][A


 21%|█████████▍                                  | 6/28 [00:15<01:02,  2.84s/it][A


 25%|███████████                                 | 7/28 [00:18<01:01,  2.92s/it][A


 29%|████████████▌                               | 8/28 [00:21<00:59,  2.97s/it][A


 32%|██████████████▏                             | 9/28 [00:24<00:57,  3.01s/it][A


 36%|███████████████▎                           | 10/28 [00:27<00:54,  3.03s/it][A


 39%|████████████████▉                          | 11/28 [00:30<00:51,  3.05s/it][A


 43%|██████████████████▍                        | 12/28 [00:33<00:48,  3.06s/it][A


 46%|███████████████████▉                       | 13/28 [00:37<00:46,  3.07s/it][A


 50%|█████████████████████▌                     | 14/28 [00:40<00:43,  3.08s/it][A


 54%|███████████████████████                    | 15/28 [00:43<00:40,  3.09s/it][A


 57%|████████████████████████▌                  | 16/28 [00:46<00:37,  3.09s/it][A


 61%|██████████████████████████                 | 17/28 [00:49<00:34,  3.10s/it][A


 64%|███████████████████████████▋               | 18/28 [00:52<00:31,  3.10s/it][A


 68%|█████████████████████████████▏             | 19/28 [00:55<00:27,  3.10s/it][A


 71%|██████████████████████████████▋            | 20/28 [00:58<00:24,  3.10s/it][A


 75%|████████████████████████████████▎          | 21/28 [01:01<00:21,  3.10s/it][A


 79%|█████████████████████████████████▊         | 22/28 [01:04<00:18,  3.10s/it][A


 82%|███████████████████████████████████▎       | 23/28 [01:08<00:15,  3.10s/it][A


 86%|████████████████████████████████████▊      | 24/28 [01:11<00:12,  3.09s/it][A


 89%|██████████████████████████████████████▍    | 25/28 [01:14<00:09,  3.09s/it][A


 93%|███████████████████████████████████████▉   | 26/28 [01:17<00:06,  3.09s/it][A


 96%|█████████████████████████████████████████▍ | 27/28 [01:20<00:03,  3.08s/it][A


100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.10s/it][A                                                                                
                                                                                [A{'eval_loss': 1.7936995029449463, 'eval_runtime': 86.8079, 'eval_samples_per_second': 1.267, 'eval_steps_per_second': 0.323, 'epoch': 1.73}
 35%|███████████                     | 4700/13570 [11:28:53<48:43:51, 19.78s/it]
100%|███████████████████████████████████████████| 28/28 [01:23<00:00,  3.10s/it][A
                                                                                [A[INFO|trainer.py:3203] 2024-04-20 03:35:56,614 >> Saving model checkpoint to /kaggle/working/llama2/checkpoint-4700


[INFO|configuration_utils.py:726] 2024-04-20 03:35:57,268 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--unsloth--llama-2-7b-bnb-4bit/snapshots/33c7b76e4203224fa1f148467e90a4a37f0f3604/config.json
[INFO|configuration_utils.py:789] 2024-04-20 03:35:57,269 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_e

[INFO|tokenization_utils_base.py:2502] 2024-04-20 03:35:57,464 >> tokenizer config file saved in /kaggle/working/llama2/checkpoint-4700/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 03:35:57,464 >> Special tokens file saved in /kaggle/working/llama2/checkpoint-4700/special_tokens_map.json


[INFO|tokenization_utils_base.py:2502] 2024-04-20 03:35:57,809 >> tokenizer config file saved in /kaggle/working/llama2/tokenizer_config.json
[INFO|tokenization_utils_base.py:2511] 2024-04-20 03:35:57,809 >> Special tokens file saved in /kaggle/working/llama2/special_tokens_map.json
[INFO|trainer.py:3295] 2024-04-20 03:35:57,813 >> Deleting older checkpoint [/kaggle/working/llama2/checkpoint-2700] due to args.save_total_limit


 35%|██████████▋                    | 4701/13570 [11:29:14<113:58:22, 46.26s/it]

 35%|███████████                     | 4702/13570 [11:29:34<94:24:25, 38.32s/it]

 35%|███████████                     | 4703/13570 [11:29:53<80:26:56, 32.66s/it]

 35%|███████████                     | 4704/13570 [11:30:13<70:51:57, 28.77s/it]

 35%|███████████                     | 4705/13570 [11:30:33<64:15:04, 26.09s/it]

 35%|███████████                     | 4706/13570 [11:30:52<59:23:42, 24.12s/it]

 35%|███████████                     | 4707/13570 [11:31:12<56:01:38, 22.76s/it]

 35%|███████████                     | 4708/13570 [11:31:32<53:36:53, 21.78s/it]

 35%|███████████                     | 4709/13570 [11:31:51<52:02:07, 21.14s/it]

 35%|███████████                     | 4710/13570 [11:32:11<50:56:26, 20.70s/it]

 35%|███████████                     | 4711/13570 [11:32:31<50:14:10, 20.41s/it]

 35%|███████████                     | 4712/13570 [11:32:50<49:38:54, 20.18s/it]

 35%|███████████                     | 4713/13570 [11:33:09<48:58:57, 19.91s/it]

 35%|███████████                     | 4714/13570 [11:33:29<48:34:11, 19.74s/it]

 35%|███████████                     | 4715/13570 [11:33:48<48:28:35, 19.71s/it]

 35%|███████████                     | 4716/13570 [11:34:08<48:23:40, 19.68s/it]

 35%|███████████                     | 4717/13570 [11:34:28<48:19:55, 19.65s/it]

 35%|███████████▏                    | 4718/13570 [11:34:47<48:17:03, 19.64s/it]

 35%|███████████▏                    | 4719/13570 [11:35:07<48:12:23, 19.61s/it]

 35%|███████████▏                    | 4720/13570 [11:35:27<48:20:20, 19.66s/it]

 35%|███████████▏                    | 4721/13570 [11:35:46<48:11:40, 19.61s/it]

 35%|███████████▏                    | 4722/13570 [11:36:06<48:14:16, 19.63s/it]

 35%|███████████▏                    | 4723/13570 [11:36:25<48:06:12, 19.57s/it]

 35%|███████████▏                    | 4724/13570 [11:36:45<47:55:47, 19.51s/it]

 35%|███████████▏                    | 4725/13570 [11:37:04<47:54:11, 19.50s/it]

 35%|███████████▏                    | 4726/13570 [11:37:24<48:15:28, 19.64s/it]

 35%|███████████▏                    | 4727/13570 [11:37:44<48:17:44, 19.66s/it]

 35%|███████████▏                    | 4728/13570 [11:38:03<48:19:43, 19.68s/it]

 35%|███████████▏                    | 4729/13570 [11:38:23<48:16:15, 19.66s/it]

 35%|███████████▏                    | 4730/13570 [11:38:43<48:26:32, 19.73s/it]

 35%|███████████▏                    | 4731/13570 [11:39:03<48:34:14, 19.78s/it]

 35%|███████████▏                    | 4732/13570 [11:39:23<48:45:17, 19.86s/it]

 35%|███████████▏                    | 4733/13570 [11:39:43<48:44:33, 19.86s/it]

 35%|███████████▏                    | 4734/13570 [11:40:02<48:36:53, 19.81s/it]

 35%|███████████▏                    | 4735/13570 [11:40:22<48:22:47, 19.71s/it]

 35%|███████████▏                    | 4736/13570 [11:40:41<48:15:49, 19.67s/it]

 35%|███████████▏                    | 4737/13570 [11:41:01<48:13:06, 19.65s/it]

 35%|███████████▏                    | 4738/13570 [11:41:20<47:48:54, 19.49s/it]

 35%|███████████▏                    | 4739/13570 [11:41:40<48:03:28, 19.59s/it]

 35%|███████████▏                    | 4740/13570 [11:42:00<47:59:44, 19.57s/it]

 35%|███████████▏                    | 4741/13570 [11:42:19<47:49:02, 19.50s/it]

 35%|███████████▏                    | 4742/13570 [11:42:38<47:45:25, 19.47s/it]

 35%|███████████▏                    | 4743/13570 [11:42:58<47:45:12, 19.48s/it]

 35%|███████████▏                    | 4744/13570 [11:43:18<48:00:07, 19.58s/it]

 35%|███████████▏                    | 4745/13570 [11:43:37<48:02:35, 19.60s/it]

 35%|███████████▏                    | 4746/13570 [11:43:57<48:16:34, 19.70s/it]

 35%|███████████▏                    | 4747/13570 [11:44:17<48:08:05, 19.64s/it]

 35%|███████████▏                    | 4748/13570 [11:44:37<48:23:32, 19.75s/it]

 35%|███████████▏                    | 4749/13570 [11:44:57<48:27:27, 19.78s/it]

 35%|███████████▏                    | 4750/13570 [11:45:16<48:13:02, 19.68s/it]                                                                                {'loss': 1.6994, 'grad_norm': 0.5032163262367249, 'learning_rate': 2.9999999999999997e-05, 'epoch': 1.75}
 35%|███████████▏                    | 4750/13570 [11:45:16<48:13:02, 19.68s/it]

 35%|███████████▏                    | 4751/13570 [11:45:36<48:10:11, 19.66s/it]

 35%|███████████▏                    | 4752/13570 [11:45:56<48:22:42, 19.75s/it]

 35%|███████████▏                    | 4753/13570 [11:46:16<48:31:49, 19.82s/it]

 35%|███████████▏                    | 4754/13570 [11:46:35<48:38:25, 19.86s/it]

 35%|███████████▏                    | 4755/13570 [11:46:55<48:38:14, 19.86s/it]

 35%|███████████▏                    | 4756/13570 [11:47:15<48:34:50, 19.84s/it]

 35%|███████████▏                    | 4757/13570 [11:47:35<48:45:18, 19.92s/it]

 35%|███████████▏                    | 4758/13570 [11:47:55<48:42:48, 19.90s/it]

 35%|███████████▏                    | 4759/13570 [11:48:15<48:28:26, 19.81s/it]

 35%|███████████▏                    | 4760/13570 [11:48:34<48:10:42, 19.69s/it]

 35%|███████████▏                    | 4761/13570 [11:48:54<48:02:57, 19.64s/it]

 35%|███████████▏                    | 4762/13570 [11:49:14<48:18:13, 19.74s/it]

 35%|███████████▏                    | 4763/13570 [11:49:33<48:09:58, 19.69s/it]

 35%|███████████▏                    | 4764/13570 [11:49:53<48:19:22, 19.76s/it]

 35%|███████████▏                    | 4765/13570 [11:50:13<48:21:47, 19.77s/it]

 35%|███████████▏                    | 4766/13570 [11:50:33<48:31:23, 19.84s/it]

 35%|███████████▏                    | 4767/13570 [11:50:52<48:20:28, 19.77s/it]

### Run inference with the fine-tuned model

In [None]:
# Disabled example (every line below is commented out): an interactive chat
# loop that streams replies from a model loaded with a LoRA adapter.
# NOTE(review): the values still name the upstream Qwen demo
# ("Qwen/Qwen1.5-0.5B-Chat", "test_identity", template "qwen"), whereas this
# notebook sets model_name = "unsloth/llama-2-7b-bnb-4bit" and the training log
# saves checkpoints under /kaggle/working/llama2. Presumably
# model_name_or_path, adapter_name_or_path and template must be changed to
# match before uncommenting -- confirm against the training cell.
# NOTE(review): the loop blocks on input(), so enabling it would stall a
# non-interactive "Run All".
# from llmtuner import ChatModel
# chat_model = ChatModel(dict(
#   model_name_or_path="Qwen/Qwen1.5-0.5B-Chat",
#   adapter_name_or_path="test_identity", # output dir of our training
#   finetuning_type="lora",
#   template="qwen", # change to alpaca
# ))
# Conversation history passed to stream_chat on every turn.
# messages = []
# while True:
#   query = input("\nUser: ")
#   Typing "exit" ends the session; typing "clear" drops the history.
#   if query.strip() == "exit":
#     break
#   if query.strip() == "clear":
#     messages = []
#     continue

#   messages.append({"role": "user", "content": query})
#   print("Assistant: ", end="", flush=True)
#   Print each streamed piece as it arrives and accumulate the full reply so
#   it can be appended to the history afterwards.
#   response = ""
#   for new_text in chat_model.stream_chat(messages):
#     print(new_text, end="", flush=True)
#     response += new_text
#   print()
#   messages.append({"role": "assistant", "content": response})

### Merge LoRA weights

In [None]:
# Disabled example (every line below is commented out) for the "Merge LoRA
# weights" step: passes a base model, a LoRA adapter and an export directory to
# llmtuner's export_model.
# NOTE(review): the values still name the upstream Qwen demo
# ("Qwen/Qwen1.5-0.5B-Chat", "test_identity", template "qwen"), whereas this
# notebook sets model_name = "unsloth/llama-2-7b-bnb-4bit" and the training log
# saves checkpoints under /kaggle/working/llama2. Presumably these must be
# changed to match before uncommenting -- confirm against the training cell.
# from llmtuner import export_model
# export_model(dict(
#   model_name_or_path="Qwen/Qwen1.5-0.5B-Chat",
#   adapter_name_or_path="test_identity",
#   finetuning_type="lora",
#   template="qwen",
#   export_dir="test_exported",
#   Optional Hub upload target, left disabled in the original example.
#   # export_hub_model_id="your_hf_id/test_identity",
# ))