# LLM JP instruction-tuning Finetuning with HuggingFace and Weights and Biases
<!--- @wandbcode{llm-finetune-hf} -->
- Fine-tune a lightweight LLM with LoRA and 8-bit quantization
- Checkpoint the LoRA adapter weights as artifacts

# Setting

In [1]:
import bitsandbytes as bnb
import copy
import glob
import os
import wandb
import json
from tqdm import tqdm
from types import SimpleNamespace
import datasets
from datasets import Dataset
import random
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["WANDB_ENTITY"] = "japan-demo"
os.environ["WANDB_PROJECT"] = "jp-instruction-tuning"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["WANDB_WATCH"] = "gradients"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb=1024"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mkeisuke-kamata[0m ([33mwandb[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

## Install Model and Dataset

In [3]:
config = {
    "BASE_MODEL":"cyberagent/open-calm-3b",
    "lora_config":{
        "r":32,
        "lora_alpha":16,
        "target_modules":["query_key_value"],
        "lora_dropout":.1,
        "bias":"none",
        "task_type":"CAUSAL_LM"
    },
    "training_args":{
        "dataloader_num_workers":16,
        "evaluation_strategy":"steps",
        "per_device_train_batch_size":8,
        "max_steps": 100,
        "gradient_accumulation_steps":2,
        "report_to":"wandb",#wandb integration
        "warmup_steps":10,
        "num_train_epochs":1,
        "learning_rate":2e-4,
        "fp16":True,
        "logging_steps":10,
        "save_steps":25,
        "output_dir":'./outputs'
    }
}

In [4]:
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(
    config["BASE_MODEL"],
    load_in_8bit=True,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(config["BASE_MODEL"])

In [5]:
dolly_ja = datasets.load_dataset("kunishou/databricks-dolly-15k-ja")
dolly_ja = list(dolly_ja['train'])
split_index = int(len(dolly_ja) * 0.8)  # 例として80%をトレーニングデータとして使用
dolly_ja_train = dolly_ja[:split_index]
dolly_ja_val = dolly_ja[split_index:]

## Dataset

In [6]:
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request."
        "### Instruction:{instruction} \n\n Input:{input} \n\n ###Response"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request."
        "### Instruction:{instruction} \n\n ###Response"
    )
}

class InstructDataset(Dataset):
    def __init__(self, json_list, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        self.ignore_index = ignore_index
        self.features = []
        
        for j in tqdm(json_list):
            if 'input' in j:
                source_text = PROMPT_DICT['prompt_input'].format_map(j)
            else:
                source_text = PROMPT_DICT['prompt_no_input'].format_map(j)
            example_text = source_text + j['output'] + self.tokenizer.eos_token
            
            source_tokenized = self.tokenizer(
                source_text,
                padding='longest',
                truncation=True,
                max_length=512,
                return_length=True,
                return_tensors='pt'
            )
            
            example_tokenized = self.tokenizer(
                example_text, 
                padding='longest', 
                truncation=True, 
                max_length=512, 
                return_tensors='pt'
            )
            
            input_ids = example_tokenized['input_ids'][0]
            labels = copy.deepcopy(input_ids)
            source_len = source_tokenized['length'][0]
            labels[:source_len] = self.ignore_index
            
            self.features.append({
                'input_ids': input_ids,
                'labels': labels
            })
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx]


class InstructCollator():
    def __init__(self, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        self.ignore_index = -100

    def __call__(self, examples):
        input_batch = []
        label_batch = []
        for example in examples:
            input_batch.append(example['input_ids'])
            label_batch.append(example['labels'])
        input_ids = pad_sequence(
            input_batch, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = pad_sequence(
            label_batch, batch_first=True, padding_value=self.ignore_index
        )
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)   
        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask
        }

In [7]:
train_dataset = InstructDataset(dolly_ja_train, tokenizer)
val_dataset = InstructDataset(dolly_ja_val, tokenizer)
collator = InstructCollator(tokenizer)

100%|██████████| 12012/12012 [00:06<00:00, 1913.98it/s]
100%|██████████| 3003/3003 [00:01<00:00, 1823.92it/s]


## Post processing on the model

In [None]:
# モデルの中身を確認
print(model.gpt_neox.layers[0].attention)
#GPTNeoXAttention(
#  (rotary_emb): RotaryEmbedding()
#  (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)
#  (dense): Linear(in_features=4096, out_features=4096, bias=True)

# cast the small parameters (e.g. layernorm) to fp32 for stability
for param in model.parameters():
    param.requires_grad = False # freeze the model - train adapters later
    if param.ndim == 1:
        param.data = param.data.to(torch.float32)
model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()
class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)
model.embed_out = CastOutputToFloat(model.embed_out)

## Training

In [None]:
with wandb.init(config=config, job_type="training") as run:
    # Setup for LoRa
    lora_config = LoraConfig(**wandb.config["lora_config"])
    model_peft = get_peft_model(model, lora_config)    
    model_peft.print_trainable_parameters()
    model_peft.config.use_cache = False

    trainer = transformers.Trainer(
        model=model_peft,
        data_collator=collator,
        args=transformers.TrainingArguments(**wandb.config["training_args"]),
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()
    model_peft.save_pretrained("./output")
    model_ft = wandb.Artifact(f"finetuned-model", type="model")
    model_ft.add_dir("./output")
    run.log_artifact(model_ft)
    run.log_code()

## (Advanced) Sweep

In [8]:
sweep_configuration= {
    "method": "random",
    "metric": {"goal": "minimize", "name": "eval/loss"},
    "parameters": {
        "r":{"values": [2,4,8,16,32]},
        "lora_alpha":{"values": [2,4,8,16]},
        "learning_rate":{'max': 2e-3, 'min': 2e-4}
    }
}

default_config = {
    "BASE_MODEL":"cyberagent/open-calm-3b",
    "lora_config":{
        "r":32,
        "lora_alpha":16,
        "target_modules":["query_key_value"],
        "lora_dropout":.1,
        "bias":"none",
        "task_type":"CAUSAL_LM"
    },
    "training_args":{
        "dataloader_num_workers":16,
        "evaluation_strategy":"steps",
        "per_device_train_batch_size":8,
        "max_steps": 100,
        "gradient_accumulation_steps":2,
        "report_to":"wandb",#wandb integration
        "warmup_steps":10,
        "num_train_epochs":1,
        "learning_rate":2e-4,
        "fp16":True,
        "logging_steps":10,
        "save_steps":25,
        "output_dir":'./outputs'
    }
}

# cast the small parameters (e.g. layernorm) to fp32 for stability
for param in model.parameters():
    param.requires_grad = False # freeze the model - train adapters later
    if param.ndim == 1:
        param.data = param.data.to(torch.float32)
model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()
class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)
model.embed_out = CastOutputToFloat(model.embed_out)


def train_func():
    with wandb.init(config=config, job_type="training") as run:
        # Setup for LoRa
        default_config["lora_config"]["r"] = wandb.config["r"]
        default_config["lora_config"]["lora_alpha"] = wandb.config["lora_alpha"]
        default_config["training_args"]["learning_rate"] = wandb.config["learning_rate"]


        lora_config = LoraConfig(**default_config["lora_config"])
        model_peft = get_peft_model(model, lora_config)    
        model_peft.print_trainable_parameters()
        model_peft.config.use_cache = False
    
        trainer = transformers.Trainer(
            model=model_peft,
            data_collator=collator,
            args=transformers.TrainingArguments(**default_config["training_args"]),
            train_dataset=train_dataset,
            eval_dataset=val_dataset
        )
        trainer.train()
        model_peft.save_pretrained("./output")
        model_ft = wandb.Artifact(f"finetuned-model", type="model")
        model_ft.add_dir("./output")
        run.log_artifact(model_ft)
        run.log_code()

sweep_id = wandb.sweep(sweep=sweep_configuration)
wandb.agent(sweep_id, function=train_func, count=20)

Create sweep with ID: 8e17pj90
Sweep URL: https://wandb.ai/japan-demo/jp-instruction-tuning/sweeps/8e17pj90


[34m[1mwandb[0m: Agent Starting Run: g63ga8cb with config:
[34m[1mwandb[0m: 	learning_rate: 0.001107952583922356
[34m[1mwandb[0m: 	lora_alpha: 2
[34m[1mwandb[0m: 	r: 4
[34m[1mwandb[0m: Currently logged in as: [33mkeisuke-kamata[0m ([33mjapan-demo[0m). Use [1m`wandb login --relogin`[0m to force relogin




trainable params: 1,310,720 || all params: 2,786,350,080 || trainable%: 0.04704075088798605




Step,Training Loss,Validation Loss
10,2.9837,2.822979
20,2.8145,2.642505
30,2.6489,2.59368
40,2.6345,2.574844
50,2.6343,2.56066
60,2.4035,2.553237
70,2.7465,2.548208
80,2.6422,2.544298
90,2.6644,2.542321
100,2.5772,2.5414


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▂▂▁▁▁▁▁▁
eval/runtime,▅▁▃▇▇▄▂▃██
eval/samples_per_second,▄█▆▃▃▅▇▆▁▁
eval/steps_per_second,▅█▅▁▁▅▅▅▁▁
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.5414
eval/runtime,120.0953
eval/samples_per_second,25.005
eval/steps_per_second,3.131
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5772
train/total_flos,1.015513358303232e+16
train/train_loss,2.67496


[34m[1mwandb[0m: Agent Starting Run: 7v7v6wva with config:
[34m[1mwandb[0m: 	learning_rate: 0.0009351076911857216
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	r: 32




trainable params: 10,485,760 || all params: 2,795,525,120 || trainable%: 0.37509088811192653




Step,Training Loss,Validation Loss
10,2.9475,2.710118
20,2.7577,2.603467
30,2.6226,2.574393
40,2.6195,2.557958
50,2.6169,2.546619
60,2.3929,2.539884
70,2.7325,2.535327
80,2.6299,2.53178
90,2.6528,2.529044
100,2.5614,2.52801


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.1s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▄▂▁█▃▇█▃▄▃
eval/samples_per_second,▅▇█▁▆▂▁▆▅▆
eval/steps_per_second,▅██▁▅▁▁▅▅▅
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.52801
eval/runtime,120.1162
eval/samples_per_second,25.001
eval/steps_per_second,3.13
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5614
train/total_flos,1.019025827856384e+16
train/train_loss,2.65338


[34m[1mwandb[0m: Agent Starting Run: 2osa068c with config:
[34m[1mwandb[0m: 	learning_rate: 0.0006140942853396663
[34m[1mwandb[0m: 	lora_alpha: 2
[34m[1mwandb[0m: 	r: 32




trainable params: 10,485,760 || all params: 2,795,525,120 || trainable%: 0.37509088811192653




Step,Training Loss,Validation Loss
10,3.0006,2.928456
20,2.9185,2.690914
30,2.6959,2.638189
40,2.6669,2.605998
50,2.6668,2.588048
60,2.4275,2.578076
70,2.7689,2.571414
80,2.6641,2.567323
90,2.6823,2.565076
100,2.5986,2.563603


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.1s


0,1
eval/loss,█▃▂▂▁▁▁▁▁▁
eval/runtime,▂▂▃▄█▁▃▁█▅
eval/samples_per_second,▇▇▆▅▁█▅█▁▄
eval/steps_per_second,██▆▆▁█▆█▁▆
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▇▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.5636
eval/runtime,120.1768
eval/samples_per_second,24.988
eval/steps_per_second,3.129
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5986
train/total_flos,1.019025827856384e+16
train/train_loss,2.70902


[34m[1mwandb[0m: Agent Starting Run: kl2cjicp with config:
[34m[1mwandb[0m: 	learning_rate: 0.0012960543474547276
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	r: 4




trainable params: 1,310,720 || all params: 2,786,350,080 || trainable%: 0.04704075088798605




Step,Training Loss,Validation Loss
10,2.9258,2.680261
20,2.7375,2.592384
30,2.6128,2.573225
40,2.6162,2.55228
50,2.6128,2.541932
60,2.3894,2.534371
70,2.7278,2.530124
80,2.625,2.526201
90,2.6505,2.523867
100,2.5623,2.52334


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▇▇▇▆▆▄▆▁▅█
eval/samples_per_second,▁▂▂▃▃▅▃█▄▁
eval/steps_per_second,▁▁▁▅▅▅▅█▅▁
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.52334
eval/runtime,120.0868
eval/samples_per_second,25.007
eval/steps_per_second,3.131
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5623
train/total_flos,1.015513358303232e+16
train/train_loss,2.64602


[34m[1mwandb[0m: Agent Starting Run: qo94zzt6 with config:
[34m[1mwandb[0m: 	learning_rate: 0.0007772866904158016
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	r: 4




trainable params: 1,310,720 || all params: 2,786,350,080 || trainable%: 0.04704075088798605




Step,Training Loss,Validation Loss
10,2.9561,2.734229
20,2.7714,2.612916
30,2.6285,2.579769
40,2.6231,2.563002
50,2.6227,2.550567
60,2.3978,2.543941
70,2.7364,2.539079
80,2.6342,2.535614
90,2.6559,2.533225
100,2.5695,2.532444


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▄▂▆▄█▃▁▂▄▄
eval/samples_per_second,▄▆▃▅▁▆█▆▅▅
eval/steps_per_second,▅█▅▅▁█████
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.53244
eval/runtime,120.1072
eval/samples_per_second,25.003
eval/steps_per_second,3.131
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5695
train/total_flos,1.015513358303232e+16
train/train_loss,2.65956


[34m[1mwandb[0m: Agent Starting Run: 11fqwymk with config:
[34m[1mwandb[0m: 	learning_rate: 0.0005188422513377607
[34m[1mwandb[0m: 	lora_alpha: 2
[34m[1mwandb[0m: 	r: 2




trainable params: 655,360 || all params: 2,785,694,720 || trainable%: 0.02352590882607553




Step,Training Loss,Validation Loss
10,3.0024,2.940428
20,2.9414,2.712111
30,2.7108,2.647865
40,2.6766,2.615963
50,2.6759,2.598461
60,2.4401,2.587663
70,2.7754,2.580106
80,2.6732,2.57537
90,2.6941,2.572934
100,2.606,2.572294


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▂▂▁▁▁▁▁▁
eval/runtime,▅▅▅▄▃█▁▄▁▃
eval/samples_per_second,▄▄▄▅▆▁█▅█▆
eval/steps_per_second,▅▅▅▅▅▁█▅██
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▇▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.57229
eval/runtime,120.1078
eval/samples_per_second,25.003
eval/steps_per_second,3.131
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.606
train/total_flos,1.015262467620864e+16
train/train_loss,2.7196


[34m[1mwandb[0m: Agent Starting Run: 3vf4bcdf with config:
[34m[1mwandb[0m: 	learning_rate: 0.0006101233068819079
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	r: 2




trainable params: 655,360 || all params: 2,785,694,720 || trainable%: 0.02352590882607553




Step,Training Loss,Validation Loss
10,2.9359,2.702642
20,2.7596,2.609017
30,2.6299,2.577864
40,2.6215,2.56105
50,2.6208,2.548752
60,2.3962,2.542703
70,2.7365,2.538094
80,2.6324,2.534559
90,2.6559,2.532296
100,2.5676,2.531905


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▃▁▂█▃▃▂▅▂▅
eval/samples_per_second,▆█▇▁▇▆▇▄▇▄
eval/steps_per_second,▆██▁▆▆█▃█▃
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.53191
eval/runtime,120.1543
eval/samples_per_second,24.993
eval/steps_per_second,3.129
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5676
train/total_flos,1.015262467620864e+16
train/train_loss,2.65564


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: yeq43dar with config:
[34m[1mwandb[0m: 	learning_rate: 0.0007143254916769785
[34m[1mwandb[0m: 	lora_alpha: 4
[34m[1mwandb[0m: 	r: 32




trainable params: 10,485,760 || all params: 2,795,525,120 || trainable%: 0.37509088811192653




Step,Training Loss,Validation Loss
10,2.9836,2.831631
20,2.8214,2.646711
30,2.6561,2.597736
40,2.6375,2.577672
50,2.6379,2.562819
60,2.4045,2.556147
70,2.7467,2.550408
80,2.6423,2.546877
90,2.6658,2.544974
100,2.5789,2.544265


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.1s


0,1
eval/loss,█▃▂▂▁▁▁▁▁▁
eval/runtime,▆▇▃▅█▂▃▄▁▂
eval/samples_per_second,▃▂▆▄▁▇▆▅█▇
eval/steps_per_second,▁▁█▅▁██▅██
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.54427
eval/runtime,120.1627
eval/samples_per_second,24.991
eval/steps_per_second,3.129
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5789
train/total_flos,1.019025827856384e+16
train/train_loss,2.67747


[34m[1mwandb[0m: Agent Starting Run: t3a4xwvl with config:
[34m[1mwandb[0m: 	learning_rate: 0.0016240452856319432
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	r: 16


trainable params: 5,242,880 || all params: 2,790,282,240 || trainable%: 0.18789783789040637




Step,Training Loss,Validation Loss
10,2.9138,2.670833
20,2.724,2.583606
30,2.6043,2.569542
40,2.6122,2.548271
50,2.6079,2.536728
60,2.3828,2.530697
70,2.7247,2.525887
80,2.6202,2.522869
90,2.6459,2.519764
100,2.5593,2.518792


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▂▁▁▁▁
eval/runtime,▁▆█▄▃▅▄▅▃▃
eval/samples_per_second,█▃▁▅▆▄▅▄▆▆
eval/steps_per_second,█▁▁▅▅▅▅▅▅▅
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▅▄▄▄▁▆▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.51879
eval/runtime,120.0763
eval/samples_per_second,25.009
eval/steps_per_second,3.131
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5593
train/total_flos,1.01701870239744e+16
train/train_loss,2.6395


[34m[1mwandb[0m: Agent Starting Run: nqz5tl7p with config:
[34m[1mwandb[0m: 	learning_rate: 0.00033230850505124813
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	r: 8




trainable params: 2,621,440 || all params: 2,787,660,800 || trainable%: 0.09403726593995941




Step,Training Loss,Validation Loss
10,2.9932,2.890548
20,2.8783,2.670422
30,2.6819,2.627282
40,2.6578,2.599611
50,2.6571,2.581806
60,2.4227,2.573025
70,2.7641,2.567003
80,2.6586,2.563356
90,2.6806,2.560772
100,2.5955,2.559992


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▃▂▂▁▁▁▁▁▁
eval/runtime,▁▅▆█▃▆▃▃▃▅
eval/samples_per_second,█▅▄▁▆▃▆▆▆▄
eval/steps_per_second,█▄▄▁▄▁█▄█▄
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▇▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.55999
eval/runtime,120.1739
eval/samples_per_second,24.989
eval/steps_per_second,3.129
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5955
train/total_flos,1.016015139667968e+16
train/train_loss,2.69897


[34m[1mwandb[0m: Agent Starting Run: gz6rcgzw with config:
[34m[1mwandb[0m: 	learning_rate: 0.0003884976318730308
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	r: 2




trainable params: 655,360 || all params: 2,785,694,720 || trainable%: 0.02352590882607553




Step,Training Loss,Validation Loss
10,2.9625,2.766867
20,2.7913,2.637496
30,2.6484,2.593835
40,2.634,2.57509
50,2.6355,2.561079
60,2.4058,2.55427
70,2.7464,2.548455
80,2.6414,2.544444
90,2.6639,2.542184
100,2.5768,2.541398


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▂█▃▁▇▂▁▂▁▂
eval/samples_per_second,▇▁▆█▂██▇██
eval/steps_per_second,▆▁▆█▃██▆██
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.5414
eval/runtime,120.1052
eval/samples_per_second,25.003
eval/steps_per_second,3.131
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5768
train/total_flos,1.015262467620864e+16
train/train_loss,2.67059


[34m[1mwandb[0m: Agent Starting Run: j44p4gee with config:
[34m[1mwandb[0m: 	learning_rate: 0.0002745971673760735
[34m[1mwandb[0m: 	lora_alpha: 2
[34m[1mwandb[0m: 	r: 16




trainable params: 5,242,880 || all params: 2,790,282,240 || trainable%: 0.18789783789040637




Step,Training Loss,Validation Loss
10,3.0079,2.98234
20,3.054,2.855689
30,2.8399,2.737135
40,2.7458,2.682171
50,2.7353,2.653193
60,2.4923,2.637036
70,2.8243,2.627062
80,2.7167,2.62122
90,2.7336,2.617736
100,2.6518,2.616482


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▆▃▂▂▁▁▁▁▁
eval/runtime,▃▂▁▄▃▇▃█▇▅
eval/samples_per_second,▇▇█▅▇▁▆▁▂▄
eval/steps_per_second,███▄█▁▄▁▁▄
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,▇█▅▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.61648
eval/runtime,120.1617
eval/samples_per_second,24.991
eval/steps_per_second,3.129
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.6518
train/total_flos,1.01701870239744e+16
train/train_loss,2.78016


[34m[1mwandb[0m: Agent Starting Run: 77yh6xy3 with config:
[34m[1mwandb[0m: 	learning_rate: 0.0013757282456376525
[34m[1mwandb[0m: 	lora_alpha: 4
[34m[1mwandb[0m: 	r: 4




trainable params: 1,310,720 || all params: 2,786,350,080 || trainable%: 0.04704075088798605




Step,Training Loss,Validation Loss
10,2.9515,2.711336
20,2.7558,2.60101
30,2.6232,2.57445
40,2.6206,2.557652
50,2.6158,2.545607
60,2.3926,2.539224
70,2.7321,2.534562
80,2.6286,2.530815
90,2.6529,2.528745
100,2.5632,2.527348


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▄▆▅█▄▄▁▃▂▄
eval/samples_per_second,▅▃▄▁▅▅█▆▇▅
eval/steps_per_second,▄▃▄▁▄▄█▆▆▄
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.52735
eval/runtime,120.004
eval/samples_per_second,25.024
eval/steps_per_second,3.133
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5632
train/total_flos,1.015513358303232e+16
train/train_loss,2.65362


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jaeyf11w with config:
[34m[1mwandb[0m: 	learning_rate: 0.0010734874956814093
[34m[1mwandb[0m: 	lora_alpha: 2
[34m[1mwandb[0m: 	r: 2




trainable params: 655,360 || all params: 2,785,694,720 || trainable%: 0.02352590882607553




Step,Training Loss,Validation Loss
10,2.9841,2.821956
20,2.8128,2.645225
30,2.6499,2.595028
40,2.6323,2.574992
50,2.6358,2.561187
60,2.4056,2.554585
70,2.7465,2.549136
80,2.6423,2.545339
90,2.6629,2.543334
100,2.5775,2.542664


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▂▂▁▁▁▁▁▁
eval/runtime,▆▆▃▄▂█▃▃▁▄
eval/samples_per_second,▃▃▅▅▆▁▆▆█▅
eval/steps_per_second,▁▃▆▆▆▁▆▆█▃
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.54266
eval/runtime,120.1124
eval/samples_per_second,25.002
eval/steps_per_second,3.13
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5775
train/total_flos,1.015262467620864e+16
train/train_loss,2.67496


[34m[1mwandb[0m: Agent Starting Run: ccg48k2c with config:
[34m[1mwandb[0m: 	learning_rate: 0.0018511692943030257
[34m[1mwandb[0m: 	lora_alpha: 4
[34m[1mwandb[0m: 	r: 2




trainable params: 655,360 || all params: 2,785,694,720 || trainable%: 0.02352590882607553




Step,Training Loss,Validation Loss
10,2.9305,2.680303
20,2.7379,2.592973
30,2.6155,2.573115
40,2.6172,2.552472
50,2.6126,2.542547
60,2.3901,2.536153
70,2.73,2.53148
80,2.6255,2.528045
90,2.6498,2.526106
100,2.5625,2.525519


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▅▆▅█▆▄▁▃▅▂
eval/samples_per_second,▄▂▄▁▃▅█▆▄▇
eval/steps_per_second,▅▁▅▁▅▅██▅█
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.52552
eval/runtime,120.0902
eval/samples_per_second,25.006
eval/steps_per_second,3.131
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5625
train/total_flos,1.015262467620864e+16
train/train_loss,2.64716


[34m[1mwandb[0m: Agent Starting Run: rwv8p70z with config:
[34m[1mwandb[0m: 	learning_rate: 0.0006078872243379494
[34m[1mwandb[0m: 	lora_alpha: 16
[34m[1mwandb[0m: 	r: 4




trainable params: 1,310,720 || all params: 2,786,350,080 || trainable%: 0.04704075088798605




Step,Training Loss,Validation Loss
10,2.9409,2.709811
20,2.7609,2.606876
30,2.6247,2.577317
40,2.6231,2.560996
50,2.6207,2.548068
60,2.394,2.541878
70,2.7346,2.536759
80,2.6311,2.533249
90,2.6547,2.530816
100,2.5669,2.530372


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▄█▃▅▃▃▅▃▃▁
eval/samples_per_second,▅▁▆▄▇▆▄▆▆█
eval/steps_per_second,▅▁█▅██▅███
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.53037
eval/runtime,120.0759
eval/samples_per_second,25.009
eval/steps_per_second,3.131
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5669
train/total_flos,1.015513358303232e+16
train/train_loss,2.65516


[34m[1mwandb[0m: Agent Starting Run: a6anmp9q with config:
[34m[1mwandb[0m: 	learning_rate: 0.0013163638178068338
[34m[1mwandb[0m: 	lora_alpha: 4
[34m[1mwandb[0m: 	r: 2




trainable params: 655,360 || all params: 2,785,694,720 || trainable%: 0.02352590882607553




Step,Training Loss,Validation Loss
10,2.9501,2.710745
20,2.7563,2.603949
30,2.6256,2.575514
40,2.6206,2.559472
50,2.6194,2.547351
60,2.3965,2.541637
70,2.7346,2.536533
80,2.6298,2.533366
90,2.6526,2.531205
100,2.565,2.530154


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,▃▄▄▄▆▁▃▂█▂
eval/samples_per_second,▆▅▄▅▃█▆▇▁▇
eval/steps_per_second,▆▄▄▄▄█▆▆▁▆
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.53015
eval/runtime,120.1124
eval/samples_per_second,25.002
eval/steps_per_second,3.13
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.565
train/total_flos,1.015262467620864e+16
train/train_loss,2.65505


[34m[1mwandb[0m: Agent Starting Run: 4m7lh3lk with config:
[34m[1mwandb[0m: 	learning_rate: 0.0007312052867119651
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	r: 8




trainable params: 2,621,440 || all params: 2,787,660,800 || trainable%: 0.09403726593995941




Step,Training Loss,Validation Loss
10,2.9606,2.744995
20,2.7764,2.616622
30,2.6327,2.582542
40,2.6254,2.56516
50,2.6241,2.552204
60,2.3972,2.545774
70,2.7379,2.540874
80,2.6367,2.537441
90,2.6567,2.534897
100,2.5698,2.534647


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,█▃▃▂▄▁▅▃▂▁
eval/samples_per_second,▁▆▇▇▅█▄▆▇█
eval/steps_per_second,▁███▄█▄███
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.53465
eval/runtime,120.1096
eval/samples_per_second,25.002
eval/steps_per_second,3.13
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5698
train/total_flos,1.016015139667968e+16
train/train_loss,2.66175


[34m[1mwandb[0m: Agent Starting Run: wsq243kz with config:
[34m[1mwandb[0m: 	learning_rate: 0.0005294252418942767
[34m[1mwandb[0m: 	lora_alpha: 4
[34m[1mwandb[0m: 	r: 4




trainable params: 1,310,720 || all params: 2,786,350,080 || trainable%: 0.04704075088798605




Step,Training Loss,Validation Loss
10,2.9936,2.886605
20,2.8669,2.660721
30,2.6765,2.617707
40,2.65,2.593578
50,2.6512,2.576303
60,2.4175,2.56832
70,2.7601,2.562615
80,2.6537,2.559029
90,2.6757,2.555943
100,2.5915,2.555135


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▃▂▂▁▁▁▁▁▁
eval/runtime,▁▁▁▃██████
eval/samples_per_second,███▆▁▁▁▁▁▁
eval/steps_per_second,███▆▁▁▁▁▁▁
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.55514
eval/runtime,157.0404
eval/samples_per_second,19.122
eval/steps_per_second,2.394
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5915
train/total_flos,1.015513358303232e+16
train/train_loss,2.69367


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vqem7ulk with config:
[34m[1mwandb[0m: 	learning_rate: 0.0013222286150400846
[34m[1mwandb[0m: 	lora_alpha: 8
[34m[1mwandb[0m: 	r: 8




trainable params: 2,621,440 || all params: 2,787,660,800 || trainable%: 0.09403726593995941




Step,Training Loss,Validation Loss
10,2.9261,2.679382
20,2.7375,2.592502
30,2.6137,2.573694
40,2.6177,2.551992
50,2.611,2.540586
60,2.3865,2.533543
70,2.7289,2.529042
80,2.6228,2.525728
90,2.6483,2.523227
100,2.5609,2.522573


[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-25)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-50)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-75)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./outputs/checkpoint-100)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


0,1
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/runtime,██▆█▇▆▅▆▁█
eval/samples_per_second,▁▁▃▁▂▃▄▃█▁
eval/steps_per_second,▁▁▃▂▂▄▄▃█▂
train/epoch,▁▁▂▂▃▃▃▃▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▄▄▁▅▄▄▃
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.52257
eval/runtime,157.2163
eval/samples_per_second,19.101
eval/steps_per_second,2.392
train/epoch,0.13
train/global_step,100.0
train/learning_rate,0.0
train/loss,2.5609
train/total_flos,1.016015139667968e+16
train/train_loss,2.64533
