# This Notebook is forked from 暗黑AGI's "baseline from 1st place of 2024"
Original Notebook: [baseline-from-1st-place-of-2024](https://www.kaggle.com/code/boristown/baseline-from-1st-place-of-2024)

---

# Modifications

- **Active Layer Control**: The num_active_layers variable has been added to common_stuff.py, allowing for control over how many initial layers of the Mistral-NeMo-Minitron model will be trained, while the rest are frozen.

- **Tuned Hyperparameters**: The LoRA and training settings have been tuned to maximize performance within the 12-hour runtime limit.

This notebook scores about 115 on the public ARC-AGI-2 evaluation set. When run five times on the semi-private dataset, it scores [4.17, 5, 5, 5.42, 5.83]. The reason for this significant discrepancy remains unknown.

In [1]:
# Copyright 2024 Daniel Franzen and Jan Disselhoff
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
# This notebook contains our winning submission to the ARC Prize 2024 Kaggle competition,
# scoring 53.5 points on the private evaluation set.
# the ARChitects (Daniel Franzen and Jan Disselhoff)

In [1]:
%%writefile model_runner.py
import json
import os, sys
import bz2
import pickle
import numpy as np
from tqdm import tqdm


def indices_required_for_merges(keep_indices, vocab, merges):
    """Add to ``keep_indices`` every token id needed to build the kept tokens via BPE merges.

    Args:
        keep_indices: dict used as an ordered set of token ids (values are
            ignored); it is extended in place.
        vocab: mapping from token string to token id.
        merges: iterable of merges, each either an ``'a b'`` string or an
            ``(a, b)`` pair.

    Returns:
        The same ``keep_indices`` object, now closed under "is a part of a
        merge that produces a kept token".
    """
    # merged token id -> ids of the two parts it can be built from
    parts_by_merged_id = {}
    for merge in merges:
        left, right = merge if not isinstance(merge, str) else merge.split(' ')
        parts = parts_by_merged_id.setdefault(vocab[f'{left}{right}'], set())
        parts.add(vocab[left])
        parts.add(vocab[right])

    # Work-list traversal: newly added parts may themselves be merge results.
    pending = list(keep_indices)
    while pending:
        current = pending.pop()
        for part in parts_by_merged_id.get(current, ()):
            if part in keep_indices:
                continue
            keep_indices[part] = None
            pending.append(part)
    return keep_indices

def remove_unused_merges(merges, vocab):
    """Return the merges (as ``'a b'`` strings) whose two parts and result all exist in ``vocab``.

    Each merge may be given as an ``'a b'`` string or as an ``(a, b)`` pair.
    """
    pairs = [merge.split(' ') if isinstance(merge, str) else merge for merge in merges]
    kept = []
    for left, right in pairs:
        merged = left + right
        if left in vocab and right in vocab and merged in vocab:
            kept.append(f'{left} {right}')
    return kept

def map_special_tokens(data, mapping=None):
    """Collect (and optionally remap) the ids listed under any ``'special_tokens'`` entry.

    Walks ``data`` recursively through dicts and lists. For every dict that
    has a ``'special_tokens'`` value, the ``'ids'`` of each of its entries are
    collected. When ``mapping`` is given, those ``'ids'`` lists are rewritten
    in place: ids present in ``mapping`` are replaced by their mapped value,
    ids missing from it are dropped.

    Returns:
        Set of all ids found (as they were before any remapping).
    """
    found = set()
    is_dict = isinstance(data, dict)
    if is_dict:
        special = data.get('special_tokens')
        if special is not None:
            for entry in special.values():
                found.update(entry['ids'])
                if mapping is not None:
                    entry['ids'] = [mapping[i] for i in entry['ids'] if i in mapping]

    if is_dict:
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        children = []
    for child in children:
        found |= map_special_tokens(child, mapping)
    return found

def remove_tokenizer_normalizer(tokenizer):
    """Strip the normalizer from a fast tokenizer, in place.

    The backend tokenizer is serialized to JSON; if it declares a normalizer,
    that entry is set to ``None`` and the backend is rebuilt from the edited
    JSON. Tokenizers without a normalizer are left untouched.
    """
    from tokenizers import Tokenizer
    assert tokenizer.is_fast
    spec = json.loads(tokenizer._tokenizer.to_str())
    if spec.get('normalizer') is None:
        return
    spec['normalizer'] = None
    tokenizer._tokenizer = Tokenizer.from_str(json.dumps(spec))

def shrink_tokenizer_vocab(tokenizer, keep_indices, keep_special_tokens, keep_token_order):
    """Rebuild a fast BPE tokenizer so that it only contains the token ids in ``keep_indices``.

    Args:
        tokenizer: fast tokenizer whose backend is replaced in place.
        keep_indices: dict used as an ordered set of token ids to keep; it is
            extended in place with special-token ids (if requested) and with
            all ids needed by the BPE merges of kept tokens.
        keep_special_tokens: also keep the tokenizer's special ids and the ids
            referenced by the post-processor.
        keep_token_order: if true, new ids follow the ascending order of the
            old ids; otherwise they follow the insertion order of
            ``keep_indices``.

    Returns:
        ``(mapping, keep_indices)`` where ``mapping`` maps old id -> new id and
        ``keep_indices`` is the final collection of kept old ids (a sorted
        list when ``keep_token_order`` is true).
    """
    from tokenizers import Tokenizer
    assert tokenizer.is_fast
    spec = json.loads(tokenizer._tokenizer.to_str())
    bpe = spec['model']
    assert bpe['type'] == "BPE"

    if keep_special_tokens:
        keep_indices.update(dict.fromkeys(tokenizer.all_special_ids))
        keep_indices.update(dict.fromkeys(map_special_tokens(spec.get('post_processor'))))
    keep_indices = indices_required_for_merges(keep_indices, bpe['vocab'], bpe['merges'])
    if keep_token_order:
        keep_indices = sorted(keep_indices)
    mapping = {old: new for new, old in enumerate(keep_indices)}

    # Rewrite vocab first: the merge filter below checks against the new vocab.
    bpe['vocab'] = {token: mapping[idx] for token, idx in bpe['vocab'].items() if idx in mapping}
    bpe['merges'] = remove_unused_merges(bpe['merges'], bpe['vocab'])

    added = spec['added_tokens']
    original_order = [t['id'] for t in added]
    assert original_order==sorted(original_order)
    remapped = [{**t, 'id': mapping[t['id']]} for t in added if t['id'] in mapping]
    remapped.sort(key=lambda t: t['id'])
    spec['added_tokens'] = remapped

    map_special_tokens(spec.get('post_processor'), mapping)
    tokenizer._tokenizer = Tokenizer.from_str(json.dumps(spec))
    return mapping, keep_indices

def shrink_model_embeddings(model, keep_indices, mapping):
    """Reduce the model's input/output embedding matrices to the rows in ``keep_indices``.

    The selected rows are copied out, the embeddings are resized to
    ``len(keep_indices)`` and the copies are written back. Afterwards every
    ``*token_id`` entry of ``model.config`` and ``model.generation_config`` is
    translated through ``mapping`` (ids missing from it become ``None``).
    """
    import torch
    with torch.no_grad():
        rows = torch.tensor(list(keep_indices))
        old_input = model.get_input_embeddings().weight.data
        kept_input = torch.index_select(old_input, 0, rows.to(old_input.device))
        old_output = model.get_output_embeddings().weight.data
        kept_output = torch.index_select(old_output, 0, rows.to(old_output.device))

        model.resize_token_embeddings(len(keep_indices))
        model.get_input_embeddings().weight.data[:] = kept_input
        model.get_output_embeddings().weight.data[:] = kept_output

        for config in (model.config, model.generation_config):
            for name, value in list(config.to_dict().items()):
                if not name.endswith('token_id'):
                    continue
                if isinstance(value, list):
                    remapped = [mapping.get(t) for t in value]
                else:
                    remapped = mapping.get(value)
                setattr(config, name, remapped)

def shrink_embeddings(model, tokenizer, corpus=None, keep_token_ids=[], keep_tokens=[], remove_token_ids=[], keep_model_tokens=True, keep_special_tokens=True, keep_normalizer=False, keep_token_order=True):
    """Shrink tokenizer vocabulary and model embeddings to a chosen subset of tokens.

    The set of ids to keep is assembled from ``keep_token_ids``, the ids of
    ``keep_tokens``, the ids produced by tokenizing ``corpus`` (if given) and,
    when ``keep_model_tokens`` is set, every ``*token_id`` found in the model
    and generation configs. Ids in ``remove_token_ids`` are then dropped
    before the tokenizer and the model are rewritten.

    Returns:
        Mapping from old token id to new token id.
    """
    if not keep_normalizer:
        remove_tokenizer_normalizer(tokenizer)
    from collections import OrderedDict  # use as OrderedSet
    keep_indices = OrderedDict()
    keep_indices.update(dict.fromkeys(keep_token_ids))
    keep_indices.update(dict.fromkeys(tokenizer.vocab[t] for t in keep_tokens))
    if corpus is not None:
        keep_indices.update(dict.fromkeys(tokenizer(corpus)['input_ids']))
    if keep_model_tokens:
        for config in (model.config, model.generation_config):
            for name, value in config.to_dict().items():
                if not name.endswith('token_id'):
                    continue
                keep_indices.update(dict.fromkeys(value if isinstance(value, list) else [value]))
    # Unset config entries contribute ``None``; that is not a real id.
    keep_indices.pop(None, None)
    for idx in remove_token_ids:
        keep_indices.pop(idx, None)
    mapping, keep_indices = shrink_tokenizer_vocab(tokenizer, keep_indices, keep_special_tokens, keep_token_order)
    shrink_model_embeddings(model, keep_indices, mapping=mapping)
    return mapping

def fix_dtypes(model, fix_weights=True, fix_quant_states=True):
    """Align sub-module dtypes with ``model.dtype`` and return the model.

    For every sub-module that has a ``weight``:
      * floating-point weight with a different dtype -> the whole sub-module
        is converted (only if ``fix_weights``);
      * non-floating-point weight carrying a ``quant_state`` whose dtype
        differs -> the quant state's dtype is overwritten (only if
        ``fix_quant_states``).
    """
    import torch
    for module in model.modules():
        weight = getattr(module, 'weight', None)
        if weight is None:
            continue
        if torch.is_floating_point(weight):
            if fix_weights and weight.dtype != model.dtype:
                module.to(model.dtype)
            continue
        quant_state = getattr(weight, 'quant_state', None)
        if quant_state is None:
            continue
        if fix_quant_states and quant_state.dtype != model.dtype:
            quant_state.dtype = model.dtype
    return model

def merge_peft_into_base(model):
    """Merge the adapters of a peft model into its base model and return the result.

    The merged model is passed through ``fix_dtypes`` before being returned.
    """
    print('*** Merge peft model into base model...')
    assert is_peft_model(model)
    merged = model.merge_and_unload()
    return fix_dtypes(merged)

def save_model(store_path, model=None, tokenizer=None, merge=False):
    """Optionally merge a peft model, then save model and/or tokenizer to ``store_path``.

    Nothing is written when ``store_path`` is ``None``. After saving the
    tokenizer, a ``tokenizer.model`` file in the target folder is deleted if
    one exists.

    Returns:
        The (possibly merged) model.
    """
    if merge:
        model = merge_peft_into_base(model)
    if store_path is None:
        return model

    assert model is not None or tokenizer is not None
    print(f"*** Saving{' merged' if merge else ''} model/tokenizer to '{store_path}'...")
    if model is not None:
        model.save_pretrained(store_path)
    if tokenizer is not None:
        tokenizer.save_pretrained(store_path)
        leftover = os.path.join(store_path, 'tokenizer.model')
        if os.path.isfile(leftover):
            os.remove(leftover)
    return model

def is_unsloth_model(model):
    """Return True if ``model.model_tags`` exists, is not None and contains ``'unsloth'``.

    Objects without a ``model_tags`` attribute are treated as non-unsloth
    models instead of raising ``AttributeError``.
    """
    tags = getattr(model, 'model_tags', None)
    return tags is not None and 'unsloth' in tags

def is_peft_model(model):
    """Return True if ``model`` exposes a ``peft_type`` attribute."""
    try:
        model.peft_type
    except AttributeError:
        return False
    return True

def download_model(repo_id, store_path, get_name=lambda n: os.path.join(n.replace('/', '--'), 'transformers', 'default', '1')):
    """Resolve ``repo_id`` to a local model folder, downloading it if necessary.

    If ``repo_id`` is already an existing path it is returned unchanged.
    Otherwise the model is expected at ``store_path/get_name(repo_id)``; when
    that path is missing, a Hugging Face snapshot is downloaded and symlinked
    there.

    Returns:
        Path of the local model folder.
    """
    import os
    if os.path.exists(repo_id):
        return repo_id
    model_path = os.path.join(store_path, get_name(repo_id))
    if os.path.exists(model_path):
        return model_path

    from huggingface_hub import snapshot_download
    snapshot_dir = snapshot_download(repo_id=repo_id)
    parent_dir, _ = os.path.split(model_path)
    os.makedirs(parent_dir, exist_ok=True)
    os.symlink(snapshot_dir, model_path, target_is_directory=True)
    return model_path

def get_and_fix_peft_weights(store):
    """Load a peft state dict from ``store`` and drop its ``modules_to_save`` entries.

    For every key containing ``'modules_to_save'`` the key itself and its
    ``.original_module.`` counterpart (if present) are removed; the plain key
    (with ``.modules_to_save.`` collapsed to ``.``) must remain in the dict.
    """
    print(f"*** Load peft state_dict from '{store}'...")
    from peft import load_peft_weights
    state_dict = load_peft_weights(store)
    saved_module_keys = [key for key in list(state_dict.keys()) if 'modules_to_save' in key]
    for key in saved_module_keys:
        del state_dict[key]
        state_dict.pop(key.replace('.modules_to_save.', '.original_module.'), None)
        assert key.replace('.modules_to_save.', '.') in state_dict
    return state_dict

def set_peft_weights(model, state_dict):
    """Load ``state_dict`` into a peft model and require that no key is unexpected."""
    print("*** Set model state_dict...")
    from peft import set_peft_model_state_dict
    load_result = set_peft_model_state_dict(model, state_dict)
    assert not load_result.unexpected_keys

def load_peft_state(model, store):
    """Read the peft weights stored at ``store``, clean them up and load them into ``model``."""
    cleaned_state = get_and_fix_peft_weights(store)
    set_peft_weights(model, cleaned_state)

def prepare_model(model, mode, tokenizer=None, formatter=None, shrink_embedding=False, dequantize=False, peft=[], local_files_only=False, add_special_tokens={}, set_pad_token=None, keep_tokens=[], keep_normalizer=None, peft_trainable=True, device_map=None, tf_grad_cp=True, tf_use_fa2=True, num_active_layers=None, **kwargs):
    """Load (if needed) and configure a model, its tokenizer and a formatter.

    Steps, in order:
      1. If ``model`` is a string, load model and tokenizer according to
         ``mode`` (``'unsloth_4bit'``, ``'transformers'``,
         ``'transformers_bf16'``, ``'transformers_4bit'``,
         ``'transformers_bf16_4bit'`` or ``'tokenizer_only'``).
      2. Add special tokens / set the pad token on the tokenizer.
      3. Instantiate ``formatter`` with the tokenizer unless it already has a
         ``corpus`` attribute.
      4. Optionally shrink the embeddings to the formatter's corpus.
      5. Optionally dequantize the model.
      6. Apply the entries of ``peft`` one after another: a string loads a
         stored adapter, a dict creates a new LoRA adapter, ``None`` is a
         no-op. An already trained adapter is merged before the next entry.
      7. If ``num_active_layers`` is given and the model is a peft model,
         freeze all parameters of the layers with index >= that value.

    Args:
        model: model object or a path/name to load from.
        mode: loading mode, see step 1 (only used when ``model`` is a string).
        tokenizer: tokenizer matching an already loaded ``model``.
        formatter: formatter instance or factory taking ``tokenizer=...``.
        shrink_embedding: ``True``, or an int threshold compared against the
            vocabulary size, to enable embedding shrinking.
        dequantize: call ``model.dequantize()`` after shrinking.
        peft: list of adapter specs (str / dict / None), see step 6.
        local_files_only: forwarded to the tokenizer / unsloth loader.
        add_special_tokens: passed to ``tokenizer.add_special_tokens``.
        set_pad_token: new pad token, if not ``None``.
        keep_tokens: tokens that must survive embedding shrinking.
        keep_normalizer: forwarded to ``shrink_embeddings``; the value
            ``False`` also forces the shrinking step to run.
        peft_trainable: passed as ``trainable`` to
            ``PeftModel.from_pretrained``.
        device_map: optional device map for the transformers loader.
        tf_grad_cp: enable gradient checkpointing (transformers modes).
        tf_use_fa2: request the ``flash_attention_2`` attention implementation.
        num_active_layers: number of leading layers left trainable.
        **kwargs: forwarded to the tokenizer / unsloth loader.

    Returns:
        ``(model, tokenizer, formatter)``.

    Raises:
        NotImplementedError: if ``mode`` is unknown.
    """
    if isinstance(model, str):
        assert tokenizer is None
        print(f"*** Load base model and tokenizer from '{model}'...")
        if mode=='unsloth_4bit':
            assert device_map is None, 'unsupported'
            from unsloth import FastLanguageModel
            model, tokenizer = FastLanguageModel.from_pretrained(model_name=model, dtype=None, load_in_4bit=True, local_files_only=local_files_only, **kwargs)
        elif mode in ['transformers', 'transformers_bf16', 'transformers_4bit', 'transformers_bf16_4bit', 'tokenizer_only']:
            import torch
            model_load_args = {}
            if device_map is not None: model_load_args['device_map'] = device_map
            if tf_use_fa2: model_load_args['attn_implementation'] = 'flash_attention_2'
            if mode in ['transformers_bf16', 'transformers_bf16_4bit']: model_load_args['torch_dtype'] = torch.bfloat16
            # Independent `if` (not `elif`): 'transformers_bf16_4bit' needs BOTH the
            # bf16 dtype above and the 4-bit quantization config below.
            if mode in ['transformers_4bit', 'transformers_bf16_4bit']:
                from transformers import BitsAndBytesConfig
                nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
                model_load_args['quantization_config'] = nf4_config
            from transformers import AutoTokenizer, AutoModelForCausalLM
            tokenizer = AutoTokenizer.from_pretrained(model, local_files_only=local_files_only, **kwargs)
            model = AutoModelForCausalLM.from_pretrained(model, **model_load_args) if mode!='tokenizer_only' else None
            if tf_grad_cp and model is not None: model.gradient_checkpointing_enable()
        else: raise NotImplementedError('Unknown mode.')
    if add_special_tokens: tokenizer.add_special_tokens(add_special_tokens)
    if set_pad_token is not None: tokenizer.pad_token = set_pad_token
    if formatter is not None and not hasattr(formatter, 'corpus'):
        formatter = formatter(tokenizer=tokenizer)
    # `type(...)==int` (not isinstance) is deliberate: bools must take the else branch.
    if (shrink_embedding<len(tokenizer.vocab) if type(shrink_embedding)==int else shrink_embedding) or keep_normalizer is False:
        print('*** Shrink embedding...')
        embedding_size_before_shrink = len(tokenizer.vocab)
        mapping = shrink_embeddings(model, tokenizer, formatter.get_corpus(), keep_tokens=keep_tokens, keep_normalizer=keep_normalizer)
        print(f'*** -> Reduced embedding size from {embedding_size_before_shrink} to {len(mapping)} words.')
    if dequantize:
        print(f'*** Dequantize model...')
        model = model.dequantize()
    if len(peft):
        # peft_trained: True = trained adapter attached, False = fresh adapter
        # attached, None = no adapter attached.
        peft_trained = True if is_peft_model(model) else None
        for i, m in enumerate(peft):
            if peft_trained is True: model, peft_trained = merge_peft_into_base(model), None
            if isinstance(m, str):
                if peft_trained is False:
                    # A fresh adapter exists already: only load the stored weights into it.
                    _, peft_trained = load_peft_state(model, m), True
                else:
                    print(f"*** Load peft model from '{m}'...")
                    from peft import PeftModel
                    model, peft_trained = PeftModel.from_pretrained(model, m, trainable=peft_trainable), True
            else:
                assert peft_trained is None
                if isinstance(m, dict):
                    print('*** Create new peft model...')
                    if is_unsloth_model(model):
                        from unsloth import FastLanguageModel
                        my_get_peft_model = FastLanguageModel.get_peft_model
                    else:
                        from peft import LoraConfig, get_peft_model
                        my_get_peft_model = lambda model, **kwargs: get_peft_model(model, LoraConfig(**kwargs))
                    model, peft_trained = my_get_peft_model(model, **m), False
                else: assert m is None
    # Locate the list of transformer layers, trying `model.model.model.layers`
    # first and `model.model.layers` second.
    layers_to_freeze = None
    if hasattr(model, 'model'):
        if hasattr(model.model, 'model') and hasattr(model.model.model, 'layers'):
            layers_to_freeze = model.model.model.layers
        elif hasattr(model.model, 'layers'):
            layers_to_freeze = model.model.layers

    if num_active_layers is not None and is_peft_model(model) and layers_to_freeze is not None:
        print(f"*** Activating only the first {num_active_layers} layers and freezing the rest...")
        total_layers = len(layers_to_freeze)
        if num_active_layers > total_layers:
            print(f"*** WARNING: num_active_layers ({num_active_layers}) is greater than total layers ({total_layers}). All layers will be active.")
        else:
            for i, layer in enumerate(layers_to_freeze):
                if i >= num_active_layers:
                    for param in layer.parameters():
                        param.requires_grad = False

            trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
            all_param = sum(p.numel() for p in model.parameters())
            print(f"*** -> Layers from {num_active_layers} to {total_layers - 1} are frozen.")
            print(f"*** -> Trainable params after freezing: {trainable_params:,} ({100 * trainable_params / all_param:.2f}%)")
    return model, tokenizer, formatter

def training_run(model, formatter, dataset, train_args, max_seq_length, merge=False, store=None, packing=False, grad_acc_fix=False, optimizers=None):
    """Fine-tune ``model`` on ``dataset`` and optionally save the result.

    Unsloth models are trained with ``UnslothTrainer``; all other models with
    trl's ``SFTTrainer`` in bf16. The tokenizer's padding side is set to
    ``'right'`` for training.

    Args:
        model: model to train (unsloth or plain transformers/peft).
        formatter: provides ``tokenizer`` and ``get_data_collator()``.
        dataset: object whose ``as_list(formatter)`` yields training records.
        train_args: dict of keyword arguments for the training-arguments class.
        max_seq_length: forwarded to the trainer.
        merge: must be ``False`` (merging after training is not supported).
        store: target folder for ``save_model``; nothing is saved if ``None``.
        packing: forwarded to the trainer.
        grad_acc_fix: use ``unsloth_train`` for unsloth models.
        optimizers: optional value for the trainer's ``optimizers`` argument.

    Returns:
        ``(model, trainer_stats)``.
    """
    assert merge is False, "merge after training does not seen to work (at least with unsloth, saved merged model will cointain the untrained weights!)"
    import torch
    from datasets import Dataset
    add_train_args = {}
    if is_unsloth_model(model):
        from unsloth import FastLanguageModel
        from unsloth import UnslothTrainer as Trainer
        from unsloth import UnslothTrainingArguments as TrainingArguments
        from unsloth import is_bfloat16_supported
        FastLanguageModel.for_training(model)
        add_train_args.update(fp16=not is_bfloat16_supported(), bf16=is_bfloat16_supported())
    else:
        from trl import SFTConfig as TrainingArguments
        from trl import SFTTrainer as Trainer
        model.train()
        add_train_args.update(bf16=True)

    formatter.tokenizer.padding_side = 'right'
    if is_unsloth_model(model):
        # Embedding layers are converted to float32 for unsloth training.
        for convert_to_float in [model.get_input_embeddings(), model.get_output_embeddings()]:
            if convert_to_float.weight.dtype!=torch.float32: convert_to_float.to(torch.float32)

    add_args = {}
    if optimizers is not None: add_args['optimizers'] = optimizers

    trainer = Trainer(
        model=model,
        tokenizer=formatter.tokenizer,
        data_collator=formatter.get_data_collator(),
        train_dataset=Dataset.from_list(dataset.as_list(formatter)),
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=None,
        packing=packing,  # Can make training 5x faster for short sequences.
        **add_args,
        args=TrainingArguments(
            **add_train_args,
            **train_args
        ),
    )

    print('*** Start training run...')
    if grad_acc_fix and is_unsloth_model(model):
        from unsloth import unsloth_train
        trainer_stats = unsloth_train(trainer)
    else:
        # `.get` with default 1: the key is optional in `train_args`, and a
        # missing key must not abort the run with a KeyError.
        if is_unsloth_model(model) and train_args.get('gradient_accumulation_steps', 1)>1: print('*** WARNING: using faulty unsloth gradient accumulation')
        trainer_stats = trainer.train()
    # Reporting the runtime is best-effort; do not let it mask Ctrl-C / SystemExit.
    try: print(f'*** -> Training took {trainer_stats.metrics["train_runtime"]} seconds.')
    except Exception: pass
    if store is not None: save_model(store, model, formatter.tokenizer, merge=merge)
    return model, trainer_stats

def inference_load(store, keys=True, result_dict=None, always_read_from_file=False):
    """Load bz2-compressed pickled results from the folder ``store``.

    Args:
        store: folder containing one file per key; ``None`` disables loading.
        keys: iterable of file names to load, or ``True`` for every file in
            ``store``.
        result_dict: dict to fill; a new dict is created if ``None``.
        always_read_from_file: re-read keys that are already in
            ``result_dict``.

    Returns:
        ``result_dict`` with all successfully loaded entries added. Files that
        cannot be read or unpickled are skipped; a missing ``store`` folder is
        treated as empty when ``keys`` is ``True``.

    NOTE: this unpickles file contents; only point it at folders written by
    trusted code (``pickle`` can execute arbitrary code on load).
    """
    if result_dict is None: result_dict = {}
    if store is None: return result_dict
    if keys is True:
        # Nothing has been saved yet -> nothing to load (avoid listdir crash).
        if not os.path.isdir(store): return result_dict
        keys = os.listdir(store)
    for key in keys:
        if not always_read_from_file and key in result_dict: continue
        try:
            with bz2.BZ2File(os.path.join(store, key)) as f: result_dict[key] = pickle.load(f)
        except Exception:
            # Best-effort: skip unreadable entries, but let KeyboardInterrupt /
            # SystemExit propagate.
            continue
    return result_dict

def inference_save(store, key, outputs):
    """Pickle ``outputs`` into the bz2-compressed file ``store/key``.

    The folder ``store`` is created if needed. Does nothing when ``store`` is
    ``None``.
    """
    if store is None:
        return
    os.makedirs(store, exist_ok=True)
    target = os.path.join(store, key)
    with bz2.BZ2File(target, 'w') as handle:
        pickle.dump(outputs, handle)

class Decoder(object):
    """Collects decoded model outputs per task key, scores them and tracks the best guess.

    Keys have the form ``<base_key>.<suffix>`` where ``base_key`` may itself be
    ``<task>_<sub-index>``. Results are stored in ``decoded_results`` as
    ``{base_key: {key: decoded_dict}}``.
    """

    def __init__(self, formatter, dataset, n_guesses, max_outputs=None, frac_score=False, quiet=False, name='', additional_decoders=None, prob_baseline=None):
        """Set up empty bookkeeping and pre-compute token lengths for log output.

        Args:
            formatter: used to decode outputs and measure token lengths; may be
                ``None`` (outputs are then taken as already decoded).
            dataset: provides ``replies``, ``queries``, ``invert_mod``,
                ``get_lengths`` and ``mod``.
            n_guesses: number of leading outputs per key that count for the
                limited ("2-guess") score.
            max_outputs: limit used by ``process``; ``None`` means no limit.
            frac_score: score each sub-task as ``1 / number of sub-tasks``.
            quiet: suppress the per-output log line.
            name: prefix for the per-output log line.
            additional_decoders: decoders receiving the 2nd, 3rd, ... decoded
                variant of an output.
            prob_baseline: if set, changes how ``score_val`` is accumulated in
                the probability tracker (see ``process_single_output``).
        """
        self.formatter = formatter
        self.dataset = dataset
        self.n_guesses = n_guesses
        self.decoded_results = {}
        self.correct_solutions = {}
        self.keys_lim = set()  # base keys solved within the first n_guesses outputs
        self.keys_all = set()  # base keys solved by any output
        self.mult_cnt = {}  # task -> number of sub-tasks seen (max sub-index + 1)
        self.keys_cnt = {}  # base key -> number of outputs processed
        self.frac_score = frac_score
        self.max_outputs = max_outputs
        self.quiet = quiet
        # Index 0: lengths for the dataset as is, index 1: for the transposed dataset.
        self.input_len = [{} if formatter is not None and formatter.tokenizer is None else ds.get_lengths(formatter, name='input') for ds in [dataset, dataset.mod(np.transpose, keep_key=True)]]
        self.reply_len = [{} if formatter is not None and formatter.tokenizer is None else ds.get_lengths(formatter, name='reply') for ds in [dataset, dataset.mod(np.transpose, keep_key=True)]]
        self.additional_decoders = additional_decoders
        self.name = name
        self.prob_tracker = {}  # base key -> {output grid as tuple of tuples: accumulated value}
        self.prob_tracker_best = {}  # base key -> (best accumulated value, output)
        self.prob_baseline = prob_baseline

    def score(self, *to_score):
        """Return ``(scores, score_cnt)`` for the given collections of solved base keys.

        With ``frac_score`` each key contributes ``1 / mult_cnt[task]`` and the
        count is the number of tasks; otherwise each key counts 1 and the count
        is the number of base keys seen.
        """
        scores = [(sum(1/self.mult_cnt[k.split('_')[0]] for k in s) if self.frac_score else len(s)) for s in to_score]
        score_cnt = len(self.mult_cnt if self.frac_score else self.keys_cnt)
        return scores, score_cnt

    def from_store(self, store, **kwargs):
        """Process every result found in ``store`` (see ``inference_load``) and return ``self``."""
        for key, outputs in inference_load(store).items():
            self.process(key, outputs, **kwargs)
        return self

    def score_fmt(self, v):
        """Format a score for the log line (one decimal when ``frac_score`` is set)."""
        return f'{v:5.1f}' if self.frac_score else f'{v:3}'

    def process_single_output(self, key, output_len, decoded, print_func=print, len_info=None, device_info=None):
        """Register one decoded output, update the running scores and log a status line.

        ``decoded`` is mapped back through ``dataset.invert_mod`` (entries whose
        name ends in ``'val'`` are kept as is) and stored under ``key``. If the
        dataset has replies, the output is compared against the known
        solution, the probability tracker is updated and, unless ``quiet``, a
        summary line is printed via ``print_func``.
        """
        import numpy as np
        inv_mod = {k: v if k.endswith('val') else self.dataset.invert_mod(v, key, inv_perm=(k.startswith('output') or k.startswith('score_all'))) for k, v in decoded.items()}
        base_key = key.split('.')[0]
        self.decoded_results[base_key] = self.decoded_results.get(base_key, {})
        self.decoded_results[base_key][key] = inv_mod
        output = inv_mod.get('output')
        score = inv_mod.get('score')

        # quick scoring
        self.keys_cnt[base_key] = self.keys_cnt.get(base_key, 0) + 1
        mult_key, mult_sub = (base_key.split('_') + ['0'])[:2]
        self.mult_cnt[mult_key] = max(self.mult_cnt.get(mult_key, 0), int(mult_sub) + 1)
        if len(self.dataset.replies):
            correct_solution = self.dataset.replies.get(base_key)
            if correct_solution is not None:
                correct_solution = correct_solution[0]
                self.correct_solutions[base_key] = correct_solution
                is_correct = correct_solution is not None and np.array_equal(correct_solution, output)
                if is_correct:
                    self.keys_all.add(base_key)
                    if self.keys_cnt[base_key] <= self.n_guesses: self.keys_lim.add(base_key)
            # `is_correct` is only read when `correct_solution` is not None, i.e. when it was assigned above.
            corr_str = 'cant_decode' if output is None else 'sol_unknown' if correct_solution is None else 'ALL_CORRECT' if is_correct else 'bad_xy_size' if np.shape(correct_solution)!=np.shape(output) else 'bad_content'
            (score_lim, score_all), score_cnt = self.score(self.keys_lim, self.keys_all)

            # 1 if the key carries an odd number of transpose/rot90 augmentations, else 0.
            tp_arr = (key.count('transpose') + key.count('rot90')) % 2
            msc = None if score is None else np.sum(score)
            fsc = inv_mod.get('score_val')
            if output is not None and fsc is not None:
                pt = self.prob_tracker[base_key] = self.prob_tracker.get(base_key, {})
                grid_key = tuple(map(tuple, output))
                # Accumulate per distinct output grid: exp(score_val) without a
                # baseline, score_val - log(prob_baseline) with one.
                prob = pt[grid_key] = pt.get(grid_key, 0) + (np.exp(fsc) if self.prob_baseline is None else fsc - np.log(self.prob_baseline))
                current_best = self.prob_tracker_best.get(base_key)
                if current_best is None or current_best[0]<prob:
                    self.prob_tracker_best[base_key] = (prob, output)
            fmt_name = f'{self.name}: ' if self.name else ''
            msc_print = f'{min(-msc, 9.99999):7.5f}' if msc is not None else 'unknown'
            fsc_print = f'{min(-fsc, 9.99999):7.5f}' if fsc is not None else 'unknown'
            if not self.quiet: print_func(f" {fmt_name}acc: {self.score_fmt(score_lim)}/{score_cnt:3}={min(score_lim/score_cnt, 0.999):5.1%} (2-guess), {self.score_fmt(score_all)}/{score_cnt:3}={min(score_all/score_cnt, 0.999):5.1%} (any);{f' {device_info}' if device_info else ''} tok:{self.input_len[tp_arr].get(base_key, '?'):>4}+{self.reply_len[tp_arr].get(base_key, '?'):>3}>{'n/a' if output_len is None else output_len:>3} {corr_str}:{msc_print}|{fsc_print} [{key}]")

    def get_current_best(self, base_key):
        """Return the output with the highest tracked value for ``base_key``, or ``None``."""
        current_best = self.prob_tracker_best.get(base_key)
        return None if current_best is None else current_best[1]

    def process_single_decode(self, key, de_tokenized, print_func=print, **kwargs):
        """Decode one raw inference result and dispatch its variants.

        ``de_tokenized`` is ``(output_len, score_val, *data)``; the older
        3-tuple form without ``score_val`` is still accepted. The first decoded
        variant is processed by this decoder, further variants go to
        ``additional_decoders`` if set, otherwise they are processed here under
        the key ``'<key>.fix<i>'``.
        """
        if len(de_tokenized)==3 and not isinstance(de_tokenized[1], float):  # for backwards compatibility
            output_len, *data = de_tokenized
            score_val = None
        else: output_len, score_val, *data = de_tokenized
        if self.formatter is None:
            assert len(data) == 1
            decoded = [data[0]]
        else: decoded = self.formatter.decode_to_array(*data)
        #if len(decoded)==2:
        #    same = np.array_equal(decoded[0].get('output'), decoded[1].get('output'))
        #    print_func(f"is_identical: {same}")
        #    if not same: for i in range(2): print_func(str(decoded[i].get('output')))
        for d in decoded: d['score_val'] = score_val
        for i, dec in enumerate(decoded):
            if i==0: self.process_single_output(key, output_len, dec, print_func=print_func, **kwargs)
            elif self.additional_decoders:
                if i-1<len(self.additional_decoders): self.additional_decoders[i-1].process_single_output(key, output_len, dec, print_func=print_func, **kwargs)
                else: print_func(f'{key} no decoder available for output #{i}')
            else: self.process_single_output(f'{key}.fix{i}', output_len, dec, print_func=print_func, **kwargs)

    def process(self, key, de_tokenized, **kwargs):
        """Process a list of raw inference results for ``key`` as ``'<key>.out<i>'``."""
        for i, d in enumerate(de_tokenized):
            # NOTE(review): `i<=max_outputs` lets max_outputs+1 results through;
            # looks like an off-by-one, kept as is pending confirmation.
            if self.max_outputs is None or i<=self.max_outputs:
                self.process_single_decode(f'{key}.out{i}', d, **kwargs)

    def get_unsolved_keys(self):
        """Return the base keys of the dataset for which no stored output equals the first reply."""
        unsolved = []
        for base_key, reply in self.dataset.replies.items():
            if not any(np.array_equal(reply[0], s.get('output')) for s in self.decoded_results.get(base_key, {}).values()):
                unsolved.append(base_key)
        return unsolved

    def run_selection_algo(self, selection_algorithm):
        """Apply ``selection_algorithm`` to the decodable guesses of every base key.

        Returns:
            ``{base_key: selection}``; base keys without any decodable output
            map to ``[]``.
        """
        return {bk: (selection_algorithm({k: g for k, g in v.items() if g.get('output') is not None}) if any(g.get('output') is not None for g in v.values()) else []) for bk, v in self.decoded_results.items()}

    def benchmark_selection_algos(self, selection_algorithms, skip_failed=True):
        """Score each selection algorithm against the known solutions and print the results.

        Returns:
            ``{algorithm name: score}`` for the algorithms that ran through.
            Failures are reported and skipped unless ``skip_failed`` is false,
            in which case the exception is re-raised.
        """
        import numpy as np
        results = {}
        print('*** Benchmark selection algorithms...')
        for selection_algorithm in selection_algorithms:
            name = selection_algorithm.__name__
            try:
                selected = self.run_selection_algo(selection_algorithm)
                if self.formatter is not None:
                    for sols in selected.values():
                        for s in sols:
                            assert self.formatter.is_valid_solution(s), f'found invalid solutions {s}'
                correct_keys = {k for k, v in selected.items() if self.correct_solutions.get(k) is not None and any(np.array_equal(guess, self.correct_solutions[k]) for guess in v[:self.n_guesses])}
                (score,), score_cnt = self.score(correct_keys)
                results[name] = score
                print(f" acc: {score:5.1f}/{score_cnt:3}={score/score_cnt:6.2%} ('{name}')")
            except Exception:
                # `Exception` (not bare except) so Ctrl-C / SystemExit are never swallowed.
                print(f" {'execution failed':>21} ('{name}')")
                if not skip_failed: raise
        return results

    def calc_augmented_scores(self, model, base_keys=None, store=None, seed=0, max_len=None, make_unique=True, quiet=False, **kwargs):
        """Attach augmentation-based scores to every stored output of the given base keys.

        For each output grid the scores are read from ``'<store>_new'`` if a
        cached file exists; otherwise they are computed by scoring the grid as
        the reply of an augmented single-task dataset and, when a store is
        given, written to the cache. With ``make_unique`` identical grids of
        one base key are scored only once. The resulting score dict is merged
        into the stored output dict.
        """
        if base_keys is None: base_keys = list(self.decoded_results.keys())
        if store is not None: store = f'{store}_new'  # new format is not backwards compatible, so use new folder
        for bk in (base_keys if quiet else tqdm(base_keys, desc='calculate augmented scores', file=sys.stdout)):
            res = self.decoded_results.get(bk, {})
            known_scores = {}
            for k, v in sorted(res.items()):
                if 'output' in v:
                    k_store = None if store is None else os.path.join(store, k)
                    grid_id = tuple(map(tuple, v['output']))
                    if not (make_unique and grid_id in known_scores):
                        try:
                            # Cache lookup; any failure (no store, missing or broken file) falls through to recomputation.
                            assert k_store is not None
                            with bz2.BZ2File(k_store) as f: known_scores[grid_id] = pickle.load(f)
                            if isinstance(known_scores[grid_id], list): known_scores[grid_id] = dict(score_multi=known_scores[grid_id])  # for backwards compatibility
                            k_store = None
                        except Exception:
                            temp_dataset = self.dataset.__class__(
                                keys=[bk],
                                queries={bk: self.dataset.queries.get(bk)},
                                replies={bk: [v['output'].tolist()]},
                            )
                            temp_decoder = self.__class__(self.formatter, temp_dataset, n_guesses=self.n_guesses, quiet=True)
                            # NOTE(review): hash() of a str is salted per interpreter run, so this
                            # seed is not reproducible across runs unless PYTHONHASHSEED is fixed.
                            temp_dataset = temp_dataset.augment(**kwargs, seed=(seed+hash(k)+hash(grid_id)) % 1024**2, quiet=True)
                            if max_len is not None: temp_dataset = temp_dataset.cut_to_len(formatter=self.formatter, name='input', max_len=max_len, quiet=True)
                            for x in temp_dataset.as_list(self.formatter): calc_score(**x, formatter=self.formatter, model=model, decoder=temp_decoder)
                            known_scores[grid_id] = dict(
                                score_multi=[np.sum(x['score']) for x in temp_decoder.decoded_results[bk].values()],
                                score_multi_nl=[x['score_val'] for x in temp_decoder.decoded_results[bk].values()],
                                score_multi_array=np.array([x['score'] for x in temp_decoder.decoded_results[bk].values()]),
                                score_multi_array_cum=np.array([x['score_cum'] for x in temp_decoder.decoded_results[bk].values()]),
                                score_multi_array_all=np.array([x['score_all'] for x in temp_decoder.decoded_results[bk].values()]),
                                score_multi_array_all_cum=np.array([x['score_all_cum'] for x in temp_decoder.decoded_results[bk].values()]),
                            )
                            if k_store is not None:
                                os.makedirs(store, exist_ok=True)
                                with bz2.BZ2File(k_store, 'w') as f: pickle.dump(known_scores[grid_id], f)
                    v.update(known_scores[grid_id])

def turbo_dfs(model, logits, path, eos_token_id, max_new_tokens, max_score, max_score_greedy, temperature, suppress_tokens, torch, score=0.0, pos=0, cache=None):
    """Recursively enumerate continuations whose accumulated negative log-likelihood stays below a bound.

    Args:
        model: called with ``input_ids``, ``position_ids`` and
            ``past_key_values``; the first two entries of its result are used
            as logits and updated cache.
        logits: logits for the current position, optionally followed by
            precomputed logits for later positions.
        path: token ids to explore first at each step (may be empty).
        eos_token_id: token id that terminates a continuation.
        max_new_tokens: remaining number of tokens that may be generated.
        max_score: bound on the accumulated score for non-greedy tokens.
        max_score_greedy: bound used for the greedy (lowest-nll) token.
        temperature: divisor applied to the logits before the softmax.
        suppress_tokens: not used in this function body.
        torch: the torch module (passed in by the caller).
        score: score accumulated so far.
        pos: position id of the token to be fed to the model next.
        cache: one-element list holding the key/value cache; the element is
            replaced in place.

    Returns:
        List of ``(score, tokens, logits_list)`` tuples, one per continuation
        that reached ``eos_token_id``. ``tokens`` and ``logits_list`` are
        built by appending while the recursion unwinds, i.e. last step first.
    """
    logits, next_logits = logits[0], (logits[1:] if len(logits)>1 else None)
    # Per-token negative log-likelihood at this position.
    nll = -(logits / temperature).detach().float().log_softmax(-1).cpu().numpy()
    greedy_index = nll.argmin(-1).item()
    nll = list(enumerate(nll))
    if path: nll[0], nll[path[0]], path = nll[path[0]], nll[0], path[1:]  # follow precomputed path first
    suffixes = []
    for i, s in nll:
        next_score = score + s
        allowed_max_score = max_score_greedy if i==greedy_index else max_score
        if next_score < allowed_max_score:
            if i==eos_token_id: next_suffixes = [(next_score, [], [])]
            elif max_new_tokens>1:
                if next_logits is None:
                    # Cut the cache back to `pos` entries along dim 2 before extending it
                    # (presumably entries left over from a previously explored branch).
                    if pos<cache[0][0][0].shape[2]: cache[0] = tuple(tuple(c[:, :, :pos] for c in l) for l in cache[0])
                    next_logits, cache[0] = model(
                        input_ids= torch.full((1,1), i, device=model.device),
                        position_ids=torch.full((1,1), pos, device=model.device),
                        past_key_values=cache[0],
                    )[:2]
                    next_logits = next_logits[0]  # unbatch
                next_suffixes = turbo_dfs(model, logits=next_logits, path=path, eos_token_id=eos_token_id, max_new_tokens=max_new_tokens-1, max_score=max_score, max_score_greedy=allowed_max_score, temperature=temperature, suppress_tokens=suppress_tokens, torch=torch, score=next_score, pos=pos+1, cache=cache)
            else: next_suffixes = []
            for suffix in next_suffixes:
                suffix[1].append(i)
                suffix[2].append(logits)
            suffixes.extend(next_suffixes)
        # Precomputed next-step logits are only available to the first candidate;
        # every later candidate has to query the model.
        next_logits = None
    return suffixes

def inference_turbo_dfs(model, input_ids, eos_token_id, max_new_tokens, min_prob, min_prob_greedy=1, temperature=1.0, suppress_tokens=[], path=[], attention_mask=None):
    """Run the depth-first candidate search (`turbo_dfs`) for a single prompt.

    Args:
        model: model called once on the prompt (plus `path`) and then token by token by `turbo_dfs`.
        input_ids: prompt token ids, 1-D or 2-D with a leading batch dimension of size one.
        eos_token_id: token id that terminates a candidate.
        max_new_tokens: maximum number of generated tokens per candidate.
        min_prob: probability threshold; converted to the score bound `-log(min_prob)`.
        min_prob_greedy: probability threshold for the greedy continuation; the resulting bound is
            never tighter than the one derived from `min_prob`.
        temperature: divisor applied to the logits before scoring.
        suppress_tokens: forwarded to `turbo_dfs`. The default list is never mutated here.
        path: optional token ids of a known candidate that is explored first; a trailing EOS is
            stripped. `None` is treated as empty. The default list is never mutated here.
        attention_mask: must be `None` or all ones.

    Returns:
        List of `(score, tokens, logits)` tuples sorted by ascending score, with `tokens` a numpy
        array in generation order and `logits` the stacked float32 logits as a numpy array.
    """
    import torch
    # A single no_grad context covers the whole search (the original nested a second, redundant one).
    with torch.no_grad():
        assert attention_mask is None or attention_mask.all(), 'not implemented'
        input_ids = torch.as_tensor(input_ids, device=model.device, dtype=int)
        if input_ids.ndim==2: input_ids = input_ids.squeeze(0)
        assert input_ids.ndim==1, 'batching not supported'
        # Guard both thresholds the same way so that a zero probability never reaches np.log.
        max_score = (-np.log(min_prob)) if min_prob>0 else float('inf')  # avoid throwing numpy error
        max_score_greedy = (-np.log(min_prob_greedy)) if min_prob_greedy>0 else float('inf')  # avoid throwing numpy error
        max_score_greedy = max(max_score, max_score_greedy)
        if path is None: path = []
        if len(path) and path[-1]==eos_token_id: path = path[:-1]
        # One forward pass over prompt + path yields the logits for every position along the path.
        full_path = input_ids
        if len(path): full_path = torch.cat([full_path, torch.as_tensor(path, device=model.device)])
        logits, cache = model(input_ids=full_path[np.newaxis])[:2]
        logits = logits[0, len(input_ids)-1:]
        result = turbo_dfs(model, logits=logits, path=path, eos_token_id=eos_token_id, max_new_tokens=max_new_tokens, max_score=max_score, max_score_greedy=max_score_greedy, temperature=temperature, suppress_tokens=suppress_tokens, torch=torch, score=0.0, pos=len(input_ids), cache=[cache])
        # turbo_dfs collects tokens and logits in reverse order, so flip them back here.
        candidates = [(score_val, np.array(suffix[::-1]), torch.stack(score_arr[::-1]).float().cpu().numpy()) for score_val, suffix, score_arr in result]
        return sorted(candidates, key=lambda x:x[0])

def inference_step(tokenized, model, remove_token_type_ids=True, num_beams=1, formatter=None, min_prob=None, current_best=None, **kwargs):
    """Generate candidates for one tokenized prompt.

    With `min_prob` set, the depth-first search (`inference_turbo_dfs`) is used and all found
    candidates are returned; otherwise `model.generate` is called. `num_beams` is only validated
    here, it is not forwarded to either backend. Remaining keyword arguments go to the backend.

    Returns:
        `(tokens_out, scores_out)`, each indexed as `[batch][candidate]`.
    """
    import torch
    if remove_token_type_ids:
        tokenized.pop('token_type_ids', None)
    if min_prob is not None:
        assert num_beams==1
        candidates = inference_turbo_dfs(model, **tokenized.to(model.device), path=current_best, min_prob=min_prob, eos_token_id=formatter.tokenizer.eos_token_id, **kwargs)
        # Wrap in an outer list to add the batch dimension (batch size is always one here).
        return [[cand[1] for cand in candidates]], [[cand[2] for cand in candidates]]
    if is_unsloth_model(model) and num_beams > 1:
        assert False, 'unsloth does not support beam search'
    else:
        gen = model.generate(**tokenized.to(model.device), return_dict_in_generate=True, output_logits=True, use_cache=True, **kwargs)
        prompt_len = tokenized['input_ids'].shape[-1]
        # Strip the prompt and insert a candidate axis of size one.
        new_tokens = gen['sequences'][:, torch.newaxis, prompt_len:]
        tokens_out = new_tokens.cpu().numpy().copy()
        stacked_logits = torch.stack(gen['logits'], axis=-2)[:, torch.newaxis]
        scores_out = stacked_logits.float().cpu().numpy().copy()
    return tokens_out, scores_out

def process_inference_output(key, outputs, formatter, store=None, decoder=None, decoder_args={}):
    """De-tokenize the candidates of one key, persist them and hand them to the decoder.

    `outputs` is a sequence of parallel per-candidate sequences; each aligned tuple is passed
    to `formatter.de_tokenize`. Returns the list of de-tokenized candidates.
    """
    de_tokenized = []
    for candidate in zip(*outputs):
        de_tokenized.append(formatter.de_tokenize(*candidate))
    inference_save(store, key, de_tokenized)
    if decoder is None:
        return de_tokenized
    decoder.process(key, de_tokenized, **decoder_args)
    return de_tokenized

def inference_run_v2(model, formatter, dataset, decoder=None, max_new_tokens=None, max_batch_size=1, store=None, result_dict=None, rerun_empty=False, retrain=None, use_turbo=False, group_multi_output=True, **kwargs):
    """Run inference over all keys of `dataset`, reusing stored results and optionally retraining per task.

    Args:
        model: the model used for generation (switched to inference mode here).
        formatter: provides the tokenizer, `max_new_tokens()` and `fmt_reply`.
        dataset: dataset whose `keys` are processed; inputs are obtained via `dataset.get(key, formatter)`.
        decoder: optional object receiving every result through `decoder.process`; also queried via
            `get_current_best` when `use_turbo` is set.
        max_new_tokens: generation limit; defaults to `formatter.max_new_tokens()`.
        max_batch_size: must be 1.
        store: storage handle passed to `inference_load` / `inference_save`.
        result_dict: optional dict of already known results, extended and returned.
        rerun_empty: if true, keys whose stored result is empty are run again.
        retrain: optional callable `retrain(model, dataset)` invoked once per base key before inference.
        use_turbo: if true, the decoder's current best guess is tokenized and passed on as `current_best`.
        group_multi_output: if true, keys are additionally grouped by the part before the first '_'.
        **kwargs: forwarded to `inference_step`.

    Returns:
        The result dict mapping key -> de-tokenized candidates. On Ctrl+C the partial dict is returned.
    """
    import torch
    assert max_batch_size==1, 'unsupported'

    with torch.no_grad():
        print('*** Load stored data...')
        if result_dict is None: result_dict = {}
        result_dict = inference_load(store, dataset.keys, result_dict)
        by_base_key = {}
        needs_rerun = {}
        base_key_list = []
        # Group the augmented keys by their base key (text before the first '.', optionally before the first '_').
        for key in dataset.keys:
            base_key = key.split('.')[0]
            if group_multi_output: base_key = base_key.split('_')[0]
            if base_key not in by_base_key: base_key_list.append(base_key)
            bk_list = by_base_key[base_key] = by_base_key.get(base_key, [])
            bk_list.append(key)
        # Feed stored results to the decoder and collect the keys that still have to be computed.
        for base_key, keys in by_base_key.items():
            for key in keys:
                de_tokenized = result_dict.get(key)
                if de_tokenized is None or (rerun_empty and not de_tokenized):
                    bk_list = needs_rerun[base_key] = needs_rerun.get(base_key, [])
                    bk_list.append(key)
                elif decoder is not None: decoder.process(key, de_tokenized)

        formatter.tokenizer.padding_side = 'left'
        if max_new_tokens is None: max_new_tokens = formatter.max_new_tokens()
        if is_unsloth_model(model):
            from unsloth import FastLanguageModel
            FastLanguageModel.for_inference(model)
        else: model.eval()

        print('*** Start inference run...')
    try:
        with tqdm(base_key_list, file=sys.stdout) as pbar:
            for base_key in pbar:
                run_keys = needs_rerun.get(base_key)
                if run_keys:
                    if retrain is not None:
                        # Retraining runs outside of no_grad; afterwards the model is put back into inference mode.
                        retrain_dataset = dataset.keep_key_startswith(base_key)
                        print(f"retraining model for key '{base_key}' (retrain_dataset_size={len(retrain_dataset.keys)})")
                        retrain(model, retrain_dataset)
                        if is_unsloth_model(model): FastLanguageModel.for_inference(model)
                    with torch.no_grad():
                        for key in run_keys:
                            input_text = dataset.get(key, formatter)['input']
                            batch = formatter.tokenizer([input_text], return_tensors='pt')
                            current_best = decoder.get_current_best(key.split('.')[0]) if use_turbo else None
                            if current_best is not None:
                                # Apply this key's augmentation to the current best guess, format it as a
                                # reply and keep only the tokens that follow the prompt.
                                current_best = dataset.forward_mod(current_best, key)
                                current_best = formatter.fmt_reply([current_best])
                                current_best = formatter.tokenizer(input_text+current_best)['input_ids'][batch['input_ids'].shape[-1]:]
                            batch_out = inference_step(batch, model, formatter=formatter, max_new_tokens=max_new_tokens, current_best=current_best, **kwargs)
                            outputs = [x[0] for x in batch_out]  # unbatch (batch size is one)
                            result_dict[key] = process_inference_output(key, outputs, formatter, store=store, decoder=decoder, decoder_args=dict(print_func=pbar.write))
        print('*** Completed inference run.')
    except KeyboardInterrupt: print('*** Ctrl+C pressed, stopping inference run.')
    return result_dict

class Retrainer(object):
    """Callable that fine-tunes a model on `n` augmented samples of a given dataset.

    `aug_opts` are passed to `dataset.augment`, the remaining keyword arguments to `training_run`.
    If `reload_state_dict` is given, the adapter weights are reset to it before every run.
    """

    def __init__(self, n, aug_opts, reload_state_dict=None, **kwargs):
        self.kwargs = kwargs
        self.reload_state_dict = reload_state_dict
        self.aug_opts = aug_opts
        self.n = n

    def preprocess(self, dataset):
        """Return a dataset holding the first `n` keys of as many augmented copies as needed."""
        num_copies = (self.n - 1) // dataset.length() + 1
        copies = []
        for _ in range(num_copies):
            copies.append(dataset.augment(quiet=True, shfl_keys=True, **self.aug_opts))
        first, rest = copies[0], copies[1:]
        merged = first.append(*rest) if rest else first
        head, _tail = merged.split_at_pos(self.n)
        return head

    def __call__(self, model, dataset):
        """Switch `model` to training mode and train it on the preprocessed `dataset`."""
        if self.reload_state_dict is not None:
            set_peft_weights(model, self.reload_state_dict)
        assert is_unsloth_model(model), 'not implemented'
        if not is_unsloth_model(model):
            model.train()
        else:
            from unsloth import FastLanguageModel
            FastLanguageModel.for_training(model)
        train_data = self.preprocess(dataset)
        training_run(model, dataset=train_data, **self.kwargs)

def calc_score(key, input, reply, formatter, model, store=None, decoder=None, **_):
    """Score a given `reply` for `input` with one forward pass and process it like a generated output.

    The reply tokens and the logits predicting them are passed to `process_inference_output`
    as a single candidate. Extra keyword arguments are ignored. Returns None.
    """
    import torch
    with torch.no_grad():
        tokenizer = formatter.tokenizer
        input_len = len(tokenizer(input)['input_ids'])
        tokenized = tokenizer([input+reply], return_tensors='pt')
        reply_tok = tokenized['input_ids'][0][input_len:].cpu().numpy().copy()
        all_logits = model.forward(**tokenized.to(model.device))['logits']
        # Logits at position t predict token t+1, hence the window shifted by one.
        reply_log = all_logits[0, input_len-1: -1].float().cpu().numpy().copy()
        candidate = (reply_tok[torch.newaxis], reply_log[torch.newaxis])
        process_inference_output(key, candidate, formatter, store=store, decoder=decoder)

def mem_info(gpu_id=0):
    """Print the peak reserved and the total memory (in GB) of the CUDA device `gpu_id`.

    Best-effort diagnostic: if the statistics cannot be obtained (e.g. no CUDA device is
    available) a notice is printed instead of raising. Returns None.
    """
    import torch
    try:
        gpu_stats = torch.cuda.get_device_properties(gpu_id)
        # Query the same device as above (the argument-less call reports the *current* device).
        usage = torch.cuda.max_memory_reserved(gpu_id) / 1024**3
        avail = gpu_stats.total_memory / 1024**3
        print(f"*** GPU: {gpu_stats.name}, used {usage:.3} / {avail:.3} GB.")
    # `Exception` instead of a bare except, so KeyboardInterrupt / SystemExit still propagate.
    except Exception: print('*** Exception occured when getting memory stats.')

Writing model_runner.py


In [2]:
%%writefile arc_loader.py
import json
import numpy as np
import hashlib
import os, sys
from tqdm import tqdm
from glob import glob
import itertools
import random

def cut_at_token(output, token_id):
    """Return `output` truncated before the first occurrence of `token_id` (unchanged if absent)."""
    hits = (output==token_id).nonzero()[0]
    if len(hits) == 0:
        return output
    return output[:hits[0]]

def shuffled(data_list):
    """Return a randomly permuted copy of `data_list` as a plain list (uses numpy's global RNG)."""
    permuted = np.random.permutation(data_list)
    return permuted.tolist()

def permute_mod(a, descriptor, invert=False):
    """Apply (or undo) the permutation of the ten digits 0-9 encoded in `descriptor`.

    All digit characters of `descriptor` (e.g. 'permute1234567890') form the permutation.
    A 2-D array has its *values* mapped through the permutation; a 3-D array has its *last
    axis* reordered accordingly. `invert=True` applies the inverse mapping.
    """
    digits = []
    for ch in descriptor:
        if str(ch).isdigit():
            digits.append(int(ch))
    assert sorted(digits)==list(range(10))
    arr = np.asarray(a)
    if arr.ndim==3:
        # Reordering an axis needs the inverse index order compared to mapping values.
        order = digits if invert else np.argsort(digits)
        return arr[..., order]
    assert arr.ndim==2
    lookup = np.argsort(digits) if invert else digits
    return np.asarray(lookup)[arr]

def permute_rnd_col_(query):
    """Build a descriptor for a random permutation of the digits 1-9 that keeps 0 fixed (`query` is unused)."""
    tail = 1 + np.random.permutation(9)
    digits = [0, *tail.tolist()]
    return 'permute' + ''.join(str(d) for d in digits)

def permute_rnd_all_(query):
    """Build a descriptor for a random permutation of all ten digits 0-9 (`query` is unused)."""
    digits = np.random.permutation(10).tolist()
    return 'permute' + ''.join(str(d) for d in digits)

def permute_cnt_col_(query):
    """Build a descriptor that keeps 0 fixed and orders 1-9 by descending frequency in the train inputs.

    Every digit is counted once extra so that all ten appear in the count table; ties are
    broken randomly by sorting (stably) over a random permutation.
    """
    pixels = [list(range(10))]
    for example in query['train']:
        pixels.append(np.array(example['input']).ravel())
    _, frequency = np.unique(np.concatenate(pixels), return_counts=True)
    candidates = np.random.permutation(9)+1  # randomness as tie breaker
    ranked = sorted(candidates, key=lambda c: frequency[c], reverse=True)
    return 'permute' + ''.join(str(c) for c in [0] + ranked)

def permute_cnt_all_(query):
    """Build a descriptor ordering all digits 0-9 by descending frequency in the train inputs.

    Every digit is counted once extra so that all ten appear in the count table; ties are
    broken randomly by sorting (stably) over a random permutation.
    """
    pixels = [list(range(10))]
    for example in query['train']:
        pixels.append(np.array(example['input']).ravel())
    _, frequency = np.unique(np.concatenate(pixels), return_counts=True)
    candidates = np.random.permutation(10)  # randomness as tie breaker
    ranked = sorted(candidates, key=lambda c: frequency[c], reverse=True)
    return 'permute' + ''.join(str(c) for c in ranked)

# Augmentation specs as (mod_func, descriptor) pairs. `ArcDataset.augment` looks them up by
# name via globals()[f"permute_{perm}"] and unpacks them into `ArcDataset.mod`.
permute_rnd_col = (permute_mod, permute_rnd_col_)  # random permutation, 0 stays fixed
permute_rnd_all = (permute_mod, permute_rnd_all_)  # random permutation of all ten digits
permute_cnt_col = (permute_mod, permute_cnt_col_)  # frequency-ordered, 0 stays fixed
permute_cnt_all = (permute_mod, permute_cnt_all_)  # frequency-ordered over all ten digits
permute_None = (np.copy, None)  # no permutation: plain copy without a descriptor function

class ArcDataset(object):
    """Collection of ARC tasks (`queries`) with optional solutions (`replies`) and an ordered key list.

    A query is a dict with the entries 'train' (list of input/output examples) and 'test'
    (list of examples). Keys consist of a base id followed by dot-separated operation names
    (e.g. 'abc_0.rot90.permute0123456789') recording the augmentations applied to that entry;
    an operation prefixed with 'I' has been applied to inputs only. Methods never modify the
    dataset in place (except `load_replies`) but return new datasets.
    """

    @staticmethod
    def forward_mod(a, key, use_perm=True, is_output=True):
        """Apply the operations encoded in `key` to the grid `a`, in order.

        Input-only operations ('I' prefix) are skipped when `is_output` is true; permutations
        are skipped when `use_perm` is false. `None` is passed through.
        """
        if a is None: return a
        for op in key.split('.')[1:]:
            if op.startswith('I'):
                if is_output: continue
                op = op[1:]
            if   op=='rot90':              a = np.rot90(a)
            elif op=='transpose':          a = np.swapaxes(a, 0, 1)
            elif op.startswith('permute'): a = permute_mod(a, op, invert=False) if use_perm else a
            elif op.startswith('copy'):    a = np.copy(a)
            elif op.startswith('out'):     a = a
            elif op.startswith('ex'):      a = a
            elif op.startswith('fix'):     a = a
            elif op.startswith('ice'):     a = a  # for adding icecuber solutions
            # This is the forward direction, so do not report an "inversion" as the original message did.
            else: raise NotImplementedError(f"Application of operation '{op}' unknown.")
        return a

    @staticmethod
    def invert_mod(a, key, inv_perm=True, is_output=True):
        """Undo the operations encoded in `key` on the grid `a`, in reverse order.

        Input-only operations ('I' prefix) are skipped when `is_output` is true; permutations
        are left in place when `inv_perm` is false. `None` is passed through.
        """
        if a is None: return a
        for op in key.split('.')[1:][::-1]:
            if op.startswith('I'):
                if is_output: continue
                op = op[1:]
            if   op=='rot90':              a = np.rot90(np.rot90(np.rot90(a)))
            elif op=='transpose':          a = np.swapaxes(a, 0, 1)
            elif op.startswith('permute'): a = permute_mod(a, op, invert=True) if inv_perm else a
            elif op.startswith('copy'):    a = np.copy(a)
            elif op.startswith('out'):     a = a
            elif op.startswith('ex'):      a = a
            elif op.startswith('fix'):     a = a
            elif op.startswith('ice'):     a = a  # for adding icecuber solutions
            else: raise NotImplementedError(f"Inversion of operation '{op}' unknown.")
        return a

    def __init__(self, queries, replies=None, keys=None, is_orig=False, is_fake=False):
        """Create a dataset; if `keys` is given, queries/replies are restricted to them (None keys are dropped)."""
        # A fresh dict per instance instead of a shared mutable default argument.
        if replies is None: replies = {}
        if keys is not None: keys = [k for k in keys if k is not None]
        self.queries = queries if keys is None else {k: queries[k] for k in keys}
        self.replies = replies if keys is None else {k: replies[k] for k in keys if k in replies}
        self.is_orig = is_orig
        self.is_fake = is_fake
        self.keys = sorted(queries.keys()) if keys is None else keys
        self.faulty = {}
        self.transposed_dataset = None

    @classmethod
    def empty(cls):
        """Return a dataset without any entries."""
        return cls(queries={}, replies={}, keys=[])

    def change_keys(self, keys, keep_flags=False):
        """Return a dataset restricted to / reordered by `keys` (which must exist in this dataset)."""
        flags = dict(is_fake=self.is_fake, is_orig=self.is_orig) if keep_flags else {}
        return self.__class__(queries=self.queries, replies=self.replies, keys=keys, **flags)

    @classmethod
    def from_file(cls, queries_file):
        """Load the challenges from a JSON file; `is_fake` is set unless KAGGLE_IS_COMPETITION_RERUN is set."""
        print(f"*** Load challanges from '{queries_file}'...")
        with open(queries_file) as f: queries = f.read()
        # Real submissions run with this environment variable set; everything else is a fake run.
        is_fake = not os.getenv('KAGGLE_IS_COMPETITION_RERUN')
        if is_fake: print("*** -> Fake test set detected, setting flag 'is_fake' to True.")
        return cls(
            queries=json.loads(queries),
            is_fake=is_fake,
            is_orig=True,
        )

    def load_replies(self, replies_file):
        """Load the solutions for all keys from a JSON file (in place) and return self."""
        print(f"*** Load solutions from '{replies_file}'...")
        with open(replies_file) as f: replies = f.read()
        replies_parsed = json.loads(replies)
        self.replies = {k: replies_parsed[k] for k in self.keys}
        return self

    def split_multi_replies(self):
        """Split every task into one entry per test example, keyed '<key>_<index>'."""
        key_indices = [(k, i) for k in self.keys for i in range(len(self.queries[k]['test']))]
        return self.__class__(
            keys=[f'{k}_{i}' for k, i in key_indices],
            queries={f'{k}_{i}': {'train': self.queries[k]['train'], 'test': [self.queries[k]['test'][i]]} for k, i in key_indices},
            replies={f'{k}_{i}': [self.replies[k][i]] for k, i in key_indices if k in self.replies},
        )

    def move_test_to_train(self):
        """Append the solved test examples to the train examples; the result has no test examples or replies."""
        new_queries = {k: {'train': self.queries[k]['train'] + [{**t, 'output': self.replies[k][i]} for i, t in enumerate(self.queries[k]['test'])], 'test': []} for k in self.keys}
        return self.__class__(queries=new_queries, keys=[k for k in self.keys])

    def last_train_ex_for_test(self):
        """Turn the last train example of every task into its test example / reply (requires no replies)."""
        assert not self.replies
        new_queries = {k: {'train': self.queries[k]['train'][:-1], 'test': [{'input': self.queries[k]['train'][-1]['input']}]} for k in self.keys}
        new_replies = {k: [self.queries[k]['train'][-1]['output']] for k in self.keys}
        return self.__class__(queries=new_queries, replies=new_replies, keys=[k for k in self.keys])

    def length(self):
        """Return the number of keys."""
        return len(self.keys)

    def shuffled(self, seed=None):
        """Return a dataset with randomly ordered keys; a given `seed` reseeds numpy's global RNG."""
        if seed is not None: np.random.seed(seed)
        return self.__class__(queries=self.queries, replies=self.replies, keys=shuffled(self.keys))

    def sorted(self, **kwargs):
        """Return a dataset with sorted keys; `kwargs` are passed to the builtin `sorted`."""
        return self.__class__(queries=self.queries, replies=self.replies, keys=sorted(self.keys, **kwargs))

    def append(*datasets):
        """Concatenate datasets; usable as `a.append(b, c)` as well as `ArcDataset.append(a, b, c)`."""
        return datasets[0].__class__(
            queries={k: v for d in datasets for k, v in d.queries.items()},
            replies={k: v for d in datasets for k, v in d.replies.items()},
            keys   =[k    for d in datasets for k    in d.keys           ],
        )

    def sort_ex_by_input_size(self, seed=42, reverse=False):
        """Sort the train examples of every task by input size (ties in random, seeded order)."""
        np.random.seed(seed)
        sort_key = lambda ex: np.prod(np.shape(ex['input']))
        new_queries = {k2: {k: (sorted(np.random.permutation(np.array(v, dtype=object)), key=sort_key, reverse=reverse) if k=='train' else v) for k, v in v2.items()} for k2, v2 in self.queries.items()}
        return self.__class__(queries=new_queries, replies=self.replies, keys=[k for k in self.keys])

    def interleave(self, block_size, num_gpus=None):
        """Interleave consecutive blocks of `block_size` keys, optionally distributing them over GPUs.

        `num_gpus` is either an int (a list with one dataset per GPU is returned) or a pair
        `(index, num_gpus)` (only the dataset for that index is returned).
        """
        keys = np.reshape(self.keys, (-1, block_size)).T
        if num_gpus is None: return self.change_keys(keys.ravel().tolist())
        ret, num_gpus = (None, num_gpus) if isinstance(num_gpus, int) else num_gpus
        # Pad with None rows to a multiple of num_gpus; None keys are dropped again in __init__.
        keys = np.concatenate([keys, np.full((-keys.shape[0]%num_gpus, keys.shape[1]), None)])
        keys = np.reshape(keys, (keys.shape[0]//num_gpus, num_gpus, -1)).swapaxes(0, 1).reshape(num_gpus, -1)
        new_datasets = [self.change_keys(gpu_keys.tolist()) for gpu_keys in keys]
        return new_datasets if ret is None else new_datasets[ret]

    def remove(self, *datasets):
        """Return a dataset without the keys contained in any of `datasets`."""
        remove_keys = {k for d in datasets for k in d.keys}
        new_keys = [k for k in self.keys if k not in remove_keys]
        return self.change_keys(new_keys)

    def keep_key_startswith(self, key_start):
        """Return a dataset with only the keys starting with `key_start`."""
        new_keys = [k for k in self.keys if k.startswith(key_start)]
        return self.change_keys(new_keys)

    def mod_single(self, mod_func, descriptor, i, keep_key, inputs_only):
        """Apply `mod_func` once to all grids and extend every key by the operation's descriptor.

        `descriptor` is None (the function name or 'copy{i}' is used and `mod_func` takes the
        grid only), a string, or a callable mapping the query to a string; '{i}' is replaced by `i`.
        With `inputs_only`, only 'input' grids are modified and the key part gets an 'I' prefix.
        """
        queries = {}
        replies = {}
        keys    = []
        for k0 in self.keys:
            desc = (('copy{i}' if mod_func is np.copy else mod_func.__name__) if descriptor is None else descriptor if isinstance(descriptor, str) else descriptor(self.queries[k0])).format(i=i)
            func = lambda a, d: np.asarray(mod_func(a) if descriptor is None else mod_func(a, d)).tolist()
            k1 = k0 if keep_key else f"{k0}.{'I' if inputs_only else ''}{desc}"
            keys.append(k1)
            queries[k1] = {m: [{t: (func(a, desc) if t=='input' or not inputs_only else a) for t, a in x.items()} for x in e] for m, e in self.queries[k0].items()}
            if k0 in self.replies:
                replies[k1] = [func(a, desc) for a in self.replies[k0]]
        ret = self.__class__(queries=queries, replies=replies, keys=keys)
        return ret

    def mod(self, mod_func, descriptor=None, n=1, stack=None, keep=False, keep_key=False, shuffle=False, join=True, inputs_only=False):
        """Apply `mod_func` `n` times and return the results joined (or as a list if `join` is false).

        With `stack`, each application builds on the previous result (default for functions whose
        name starts with 'rot'), otherwise on this dataset. `keep` also includes the unmodified data.
        """
        assert not (keep and keep_key)
        cur = self
        ret = [cur.shuffled() if shuffle else cur] if keep else []
        if stack is None: stack = mod_func.__name__.startswith('rot')
        for i in range(n):
            cur = (cur if stack else self).mod_single(mod_func, descriptor, i=i, keep_key=keep_key, inputs_only=inputs_only)
            ret.append(cur.shuffled() if shuffle else cur)
        return self.__class__.append(*ret) if join else ret

    def get(self, key, formatter):
        """Return the formatted text parts (train, query, reply, input, text) of one entry as a dict."""
        assert formatter.out2_token is None or key in self.replies
        train = formatter.fmt_train(self.queries[key]['train'])
        query = formatter.fmt_query(self.queries[key]['test'], i=len(self.queries[key]['train']))
        reply = formatter.fmt_reply(self.replies[key], self.faulty.get(key)) if key in self.replies else ''
        text = train+query+reply if reply else formatter.fmt_train(self.queries[key]['train'], last_is_challenge=True)
        return dict(key=key, train=train, query=query, reply=reply, input=train+query, text=text)

    def as_list(self, formatter):
        """Return `get(key, formatter)` for all keys."""
        return [self.get(key, formatter) for key in self.keys]

    def as_dataset(self):
        """Convert to a `datasets.Dataset` with the columns key, query and reply (all keys need a reply)."""
        from datasets import Dataset
        return Dataset.from_list([{'key': k, 'query': self.queries[k], 'reply': self.replies[k]} for k in self.keys])

    def get_length(self, key, formatter, name, max_of_transposed=False):
        """Return the size of part `name` of one entry.

        Without a formatter the number of grid cells is counted (`name` is 'input' or 'reply');
        with a formatter the number of tokens of the formatted text part `name` is returned,
        optionally as the maximum over the entry and its transposed version.
        """
        if formatter is None:
            if   name=='input': return sum(np.prod(np.shape(v)) for v3 in self.queries[key].values() for v2 in v3 for v in v2.values())
            elif name=='reply': return sum(np.prod(np.shape(v)) for v in self.replies[key])
            else: assert False
        else:
            datasets = [self]
            if max_of_transposed:
                # The transposed copy is computed once and cached on the instance.
                if self.transposed_dataset is None: self.transposed_dataset = self.mod(np.transpose, keep=False, keep_key=True)
                datasets.append(self.transposed_dataset)
            return max(len(formatter.tokenizer(ds.get(key, formatter=formatter)[name])['input_ids']) for ds in datasets)

    def get_lengths(self, formatter, name, max_of_transposed=False):
        """Return `get_length` for all keys as a dict."""
        return {key: self.get_length(key, formatter=formatter, name=name, max_of_transposed=max_of_transposed) for key in self.keys}

    def sorted_by_len(self, reverse=False, **kwargs):
        """Return a dataset with keys sorted by length; `kwargs` are passed to `get_lengths`."""
        new_keys = [key for _, key in sorted([(v, k) for k, v in self.get_lengths(**kwargs).items()], reverse=reverse)]
        return self.change_keys(new_keys)

    def filter_by_len(self, min_len=0, max_len=float('inf'), **kwargs):
        """Return a dataset with only the keys whose length lies in [min_len, max_len]."""
        new_keys = [k for k, v in self.get_lengths(**kwargs).items() if min_len<=v<=max_len]
        return self.change_keys(new_keys)

    def cut_to_query_count(self, max_count, from_end=False):
        """Limit every task to at most `max_count` train examples.

        Examples are dropped from the end if `from_end` is true, otherwise from the start.
        The queries of this dataset are left untouched (the original implementation shortened
        them in place as a side effect).
        """
        keep = max(0, max_count)
        new_queries = {}
        for k in self.keys:
            query = self.queries[k]
            train = query['train']
            if len(train) > keep:
                train = train[:keep] if from_end else train[len(train)-keep:]
            new_queries[k] = {**query, 'train': train}
        return self.__class__(queries=new_queries, replies=self.replies, keys=[k for k in self.keys])

    def cut_to_len(self, formatter, name, max_len, max_new_tokens='auto', from_end=False, quiet=False, **kwargs):
        """Drop train examples from every task until part `name` fits into `max_len` tokens.

        If `max_new_tokens` is set ('auto' uses `formatter.max_new_tokens()`), it is subtracted
        from `max_len` first. The indices of the remaining examples are recorded in an
        '.ex<indices>' key suffix. A task that is still too long without any train example is
        kept as it is. `kwargs` are passed to `get_length`.
        """
        if max_new_tokens:
            if max_new_tokens=='auto': max_new_tokens = formatter.max_new_tokens()
            max_len_old, max_len = max_len, max_len - max_new_tokens
            if not quiet: print(f'*** Reducing task size to max. {max_len_old} tokens ({max_len} input + {max_new_tokens} generated)...')
        elif not quiet: print(f'*** Reducing task size to max. {max_len} tokens...')
        temp_ds = self.change_keys(self.keys)
        new_keys = []
        new_queries = {}
        new_replies = {}
        for key in (self.keys if quiet else tqdm(self.keys, file=sys.stdout)):
            reply = temp_ds.replies.get(key)
            while max_len<temp_ds.get_length(key, formatter=formatter, name=name, **kwargs):
                query = temp_ds.queries[key]
                # Nothing left to drop: stop here instead of looping forever on an unchanged task.
                if not query['train']: break
                if not key.split('.')[-1].startswith('ex'): key = f"{key}.ex{''.join(map(str, range(len(query['train']))))}"
                key_split = key.split('.')
                assert key_split[-1].startswith('ex')
                key = '.'.join(key_split[:-1] + [f'ex{key_split[-1][2:-1] if from_end else key_split[-1][3:]}'])
                temp_ds.queries[key] = {k: ((v[:-1] if from_end else v[1:]) if k=='train' else v) for k, v in query.items()}
                if reply is not None: temp_ds.replies[key] = reply
            new_keys.append(key)
            new_queries[key] = temp_ds.queries[key]
            if reply is not None: new_replies[key] = reply
        return self.__class__(keys=new_keys, queries=new_queries, replies=new_replies)

    def shuffle_ex(self, perm=None, keep_max=None):
        """Reorder the train examples of every task (randomly or by `perm`), keeping at most `keep_max`.

        The applied order is recorded in an '.ex<indices>' key suffix ('-'-separated if an index
        exceeds 9).
        """
        new_keys = []
        new_queries = {}
        new_replies = {}
        for key in self.keys:
            n = len(self.queries[key]['train'])
            # Accept any sequence for `perm`, the code below relies on array methods.
            p = np.random.permutation(n) if perm is None else np.asarray(perm)
            if keep_max is not None: p = p[:keep_max]
            new_key = f'{key}.ex' + ('-' if (p.max()>9) else '').join(map(str, p.tolist()))
            new_keys.append(new_key)
            new_queries[new_key] = {k: (np.array(v, dtype=object)[p].tolist() if k=='train' else v) for k, v in self.queries[key].items()}
            if key in self.replies: new_replies[new_key] = self.replies[key]
        return self.__class__(queries=new_queries, replies=new_replies, keys=new_keys)

    def shuffle_rp(self, keep_max=None):
        """Randomly reorder the test examples (and replies) of every task, keeping at most `keep_max`.

        The applied order is recorded in an '.rp<indices>' key suffix ('-'-separated if an index
        exceeds 9).
        """
        new_keys = []
        new_queries = {}
        new_replies = {}
        for key in self.keys:
            n = len(self.queries[key]['test'])
            p = np.random.permutation(n)
            if keep_max is not None: p = p[:keep_max]
            new_key = f'{key}.rp' + ('-' if (p.max()>9) else '').join(map(str, p.tolist()))
            new_keys.append(new_key)
            new_queries[new_key] = {k: (np.array(v, dtype=object)[p].tolist() if k=='test' else v) for k, v in self.queries[key].items()}
            if key in self.replies: new_replies[new_key] = np.array(self.replies[key], dtype=object)[p].tolist()
        return self.__class__(queries=new_queries, replies=new_replies, keys=new_keys)

    def append_to_keys(self, test):
        """Return a dataset in which the string `test` is appended to every key.

        The parameter name is kept for compatibility; it is the text to append. The original
        implementation referenced an undefined name and looked the renamed keys up in the old
        queries, so it could never succeed; queries and replies are now re-keyed as well.
        """
        new_keys = [f'{k}{test}' for k in self.keys]
        renamed = list(zip(self.keys, new_keys))
        return self.__class__(
            keys=new_keys,
            queries={new: self.queries[old] for old, new in renamed},
            replies={new: self.replies[old] for old, new in renamed if old in self.replies},
        )

    def random_select(self, n):
        """Split the key list into `n` equally sized consecutive blocks and pick, per position, the key of one random block."""
        keys = np.array(self.keys).reshape(n, -1).T
        choice = np.random.randint(0, n, size=[len(keys)])
        return self.change_keys(keys[np.arange(len(keys)), choice])

    def augment(self, tp=False, rot=False, n=1, perm=None, perm_append=False, shfl_keys=False, shfl_ex=False, seed=None, quiet=False, inputs_only=False):
        """Return an augmented dataset.

        Args:
            tp: add transposed versions; 'rand' keeps a random one of the two per entry.
            rot: add the three rotations; 'rand' keeps a random one of the four per entry.
            n: number of copies / permutations to create.
            perm: name suffix of a module-level `permute_<perm>` spec, or None for plain copies.
            perm_append: keep the unpermuted data in addition.
            shfl_keys: shuffle the key order.
            shfl_ex: shuffle the train examples of every task.
            seed: passed to `np.random.seed` (None reseeds non-deterministically).
            quiet: suppress the progress message.
            inputs_only: modify input grids only.
        """
        if not quiet: print(f"*** Augment dataset{' (inputs only)' if inputs_only else ''}...")
        np.random.seed(seed)
        d = self
        if tp: d = d.mod(np.transpose, keep=True, inputs_only=inputs_only)
        if tp=='rand': d = d.random_select(n=2)
        if rot: d = d.mod(np.rot90, n=3, keep=True, inputs_only=inputs_only)
        if rot=='rand': d = d.random_select(n=4)
        if perm is None and n<=1: d = d.shuffled() if shfl_keys else d
        else: d = d.mod(*([np.copy] if perm is None else globals()[f"permute_{perm}"]), n=n, shuffle=shfl_keys, keep=perm_append, inputs_only=inputs_only)
        np.random.seed(seed)
        if shfl_ex: d = d.shuffle_ex()
        return d

    def remove_replies(self):
        """Return a dataset with the same queries and keys but without replies."""
        return self.__class__(queries=self.queries, replies={}, keys=[k for k in self.keys])

    def split_at_pos(self, pos, random_seed=None):
        """Split the key list at `pos` (an index, or a float fraction) into two datasets, keeping the flags.

        With `random_seed`, the keys are randomly permuted (seeded) before splitting.
        """
        keys = self.keys
        if random_seed is not None:
            np.random.seed(random_seed)
            keys = np.random.permutation(keys)
        if isinstance(pos, float): pos = int(pos * len(self.keys) + 0.5)
        keys_split = [keys[:pos], keys[pos:]]
        return tuple(self.change_keys(new_keys, keep_flags=True) for new_keys in keys_split)

    def get_submission(self, results=None):
        """Create a submission dict with two placeholder attempts ([[0]]) per test example, optionally filled with `results`."""
        assert self.is_orig==True, 'Must be run on original dataset.'
        submission = {k: [{f'attempt_{i+1}': [[0]] for i in range(2)} for _ in range(len(self.queries[k]['test']))] for k in self.keys}
        if results is not None: self.fill_submission(results, submission)
        return submission

    @staticmethod
    def fill_submission(results, submission):
        """Write the guesses from `results` (keyed '<id>_<test index>') into `submission` in place."""
        print(f'*** Generating submission for {len(results)} outputs...')
        for k, v in results.items():
            base_id, base_nr = k.split('_')
            target_dict = submission[base_id][int(base_nr)]
            for i, g in enumerate(v[:len(target_dict)]):
                target_dict[f'attempt_{i+1}'] = g.tolist()

    def validate_submission(self, submission):
        """Score `submission` against the replies: each task contributes its fraction of solved test examples."""
        assert self.is_orig==True, 'Must be run on original dataset.'
        score = 0
        for k, v in self.replies.items():
            for i, r in enumerate(v):
                for attempt in ['attempt_1', 'attempt_2']:
                    if np.array_equal(r, submission[k][i][attempt]):
                        score += 1 / len(v)
                        break
        return score
def get_class_MyDataCollator(cache=[]):
    """Return the `MyDataCollator` class, creating it on first use.

    The class is defined lazily so that `trl` is only imported when needed; the mutable
    default argument `cache` deliberately serves as the memo holding the created class.
    """
    if not cache:
        from trl import DataCollatorForCompletionOnlyLM
        class MyDataCollator(DataCollatorForCompletionOnlyLM):
            """Completion-only collator with optional label rewriting (second output, fault injection, masking)."""
            def setup(self, out2_token_id=None, fault_token_id=None, fault_freq=0, sample_tries=8, mask_first_output=False):
                """Store the configuration and return self so the call can be chained."""
                self.out2_token_id = out2_token_id
                self.fault_token_id = fault_token_id
                self.fault_freq = fault_freq  # a float, or a container indexed by a count derived from the labels (see below)
                self.sample_tries = sample_tries
                self.mask_first_output = mask_first_output  # used as a repeat count for the masking pass below
                return self

            def torch_call(self, examples):
                """Collate `examples` with the parent class, then post-process labels (and inputs) per row."""
                batch = super().torch_call(examples)
                if self.out2_token_id is not None:
                    assert not self.fault_freq
                    for i in range(len(batch['input_ids'])):
                        # end_pos: one past the last labelled position; mid_pos: one past the last out2 token.
                        end_pos = ((batch['labels'][i] != -100              ).nonzero().max()).item() + 1
                        mid_pos = ((batch['labels'][i] == self.out2_token_id).nonzero().max()).item() + 1
                        beg_pos = mid_pos - (end_pos - mid_pos)
                        # Copy the labels following the out2 token onto the equally long span preceding it.
                        batch['labels'][i][beg_pos:mid_pos] = batch['labels'][i][mid_pos:end_pos]
                elif self.fault_freq:
                    for i in range(len(batch['input_ids'])):
                        end_pos = ((batch['labels'][i] != -100).nonzero().max()).item() + 1
                        if not isinstance(self.fault_freq, float):
                            # Look the frequency up by how often the final label token occurs in the labels, minus one.
                            eos_token_id = batch['labels'][i][end_pos - 1]
                            num_examples = (batch['labels'][i] == eos_token_id).sum().item() - 1
                            fault_freq = self.fault_freq[num_examples]
                        else: fault_freq = self.fault_freq
                        if random.random() < fault_freq:
                            # beg_pos: start of the last labelled span (one past the last masked position before end_pos).
                            beg_pos = ((batch['labels'][i][:end_pos]==-100).nonzero().max()).item() + 1
                            fault_pos = random.randint(beg_pos, end_pos-2)
                            fault_tok = batch['labels'][i][fault_pos].item()
                            # Try to draw a different token from the same span as replacement.
                            for t in range(self.sample_tries):
                                new_tok = batch['labels'][i][random.randint(beg_pos, end_pos-2)].item()
                                if fault_tok!=new_tok:
                                    # Corrupt the input at fault_pos and label everything after it with the fault token.
                                    batch['input_ids'][i][fault_pos] = new_tok
                                    batch['labels'][i][fault_pos+1:end_pos] = self.fault_token_id
                                    break
                for i in range(len(batch['labels'])):
                    for _ in range(self.mask_first_output):
                        # Mask the first labelled span, but only if another labelled span follows it.
                        beg_pos = ((batch['labels'][i] != -100).nonzero().min()).item()
                        mid_pos = ((batch['labels'][i][beg_pos:] == -100).nonzero().min()).item() + beg_pos
                        end_pos = ((batch['labels'][i] != -100).nonzero().max()).item() + 1
                        if mid_pos<end_pos: batch['labels'][i][beg_pos:mid_pos] = -100
                return batch
        cache.append(MyDataCollator)
    return cache[0]

class ArcFormatter(object):
    """Text (de)serializer for ARC tasks.

    The ``fmt_*`` methods turn input/output grids into the prompt text, and
    ``de_tokenize`` / ``decode_to_array*`` parse generated text back into
    numpy grids together with per-cell scores.
    """

    def __init__(self, inp_prefix, out_prefix, arr_sep, out2_use=False, out2_token=None, arr_beg='', arr_end='', pretext='', pre_out=None, exa_sep='', exa_end='', qry_prefix=None, rpl_prefix=None, rpl_sep=None, dec_sep=None, min_wid=0, min_pad='', pretext_corpus_split='', masking=0, tokenizer=None, collator_kwargs=None, repeat_input_aug=None, repeat_input_pre=None):
        """Store the formatting configuration.

        Optional prefixes fall back to other settings when left as ``None``:
        ``qry_prefix`` -> ``inp_prefix``, ``rpl_prefix`` -> ``out_prefix``,
        ``rpl_sep`` -> ``rpl_prefix`` and ``dec_sep`` -> ``arr_sep``.

        ``collator_kwargs`` defaults to a fresh dict per instance. A dict that
        is passed in explicitly is stored as-is (not copied).
        """
        self.tokenizer = tokenizer
        self.inp_prefix = inp_prefix
        self.out_prefix = out_prefix
        self.out2_token = out2_token
        self.out2_use = out2_use
        assert not out2_use or out2_token is not None
        assert not out2_use or masking in [1, 2]
        assert masking!=2 or out2_use or rpl_prefix is not None
        self.qry_prefix = qry_prefix if qry_prefix is not None else inp_prefix
        self.rpl_prefix = rpl_prefix if rpl_prefix is not None else out_prefix
        self.rpl_sep = rpl_sep if rpl_sep is not None else self.rpl_prefix
        self.arr_sep = arr_sep
        self.arr_beg = arr_beg
        self.arr_end = arr_end
        self.pretext = pretext
        self.pre_out = pre_out
        self.pre_out_empty = ['']*99
        self.pretext_corpus_split = pretext_corpus_split
        self.exa_sep = exa_sep
        self.exa_end = exa_end
        self.dec_sep = arr_sep if dec_sep is None else dec_sep
        self.min_wid = min_wid
        self.min_pad = min_pad
        self.masking = masking
        # A literal `{}` default would be one dict shared by every formatter that
        # does not pass its own, so an in-place update on one instance would leak
        # into all others.
        self.collator_kwargs = {} if collator_kwargs is None else collator_kwargs
        self.repeat_input_aug = repeat_input_aug
        self.repeat_input_pre = repeat_input_pre

    def fmt_array(self, array):
        """Serialize a grid to text.

        Each row is rendered via ``str(row)`` with spaces, commas and brackets
        removed, padded with ``min_pad`` up to ``min_wid`` entries, joined by
        ``arr_sep`` and wrapped in ``arr_beg`` / ``arr_end``.
        """
        return self.arr_beg + self.arr_sep.join(str(row).replace(' ', '').replace(',', '').replace('[', '').replace(']', '')+self.min_pad*max(0, self.min_wid-len(row)) for row in array) + self.arr_end

    def get_pre_out(self, pretext_split):
        """Return the per-example strings inserted before the output prefix.

        Without ``pre_out`` this is a list of 99 empty strings. With
        ``pretext_split`` the characters of every entry are joined (and
        terminated) by ``pretext_corpus_split``.
        """
        if self.pre_out is None: return self.pre_out_empty
        if pretext_split: return [self.pretext_corpus_split.join(list(p) + ['']) for p in self.pre_out]
        return self.pre_out

    def fmt_train(self, train, last_is_challenge=False, pretext_split=False):
        """Format a list of ``{'input', 'output'}`` examples into one text.

        Examples are separated by ``exa_end + eos_token + exa_sep``. When
        ``last_is_challenge`` is set, the final example is rendered as a query
        followed by its reply and no extra terminator is appended.
        """
        po = self.get_pre_out(pretext_split=pretext_split)
        ex = [(f"{self.fmt_query([x], i, pretext_split=pretext_split)}{self.fmt_reply([x['output']])}" if last_is_challenge and i+1==len(train) else
               f"{self.inp_prefix}{self.fmt_array(x['input'])}{self.repeat_input(x, no_aug=pretext_split)}{po[i]}{self.out_prefix}{self.fmt_array(x['output'])}") for i, x in enumerate(train)]
        pre = self.pretext_corpus_split.join(list(self.pretext)+['']) if pretext_split else self.pretext
        end = '' if last_is_challenge else (self.exa_end + self.tokenizer.eos_token)
        return pre + (self.exa_end + self.tokenizer.eos_token + self.exa_sep).join(ex) + end

    def fmt_query(self, query, i, pretext_split=False):
        """Format the first entry of ``query`` as a prompt ending in ``rpl_prefix``.

        ``i`` selects which ``pre_out`` string is inserted.
        """
        po = self.get_pre_out(pretext_split=pretext_split)
        return ''.join(f"{self.qry_prefix}{self.fmt_array(x['input'])}{self.repeat_input(x, no_aug=pretext_split)}{po[i]}{self.rpl_prefix}" for x in query[:1])

    def repeat_input(self, x, no_aug=False):
        """Return ``repeat_input_pre`` plus the (optionally augmented) input grid.

        Returns an empty string when no ``repeat_input_aug`` is configured.
        With ``no_aug`` the input is repeated unchanged.
        """
        if self.repeat_input_aug is None: return ''
        return f"{self.repeat_input_pre}{self.fmt_array(((lambda x: x) if no_aug else self.repeat_input_aug)(x['input']))}"

    def fmt_reply(self, reply, fault=None):
        """Format the expected answer ``reply[0]`` followed by ``exa_end`` and EOS.

        With ``out2_use`` the text is prefixed by a first attempt (``fault[0]``,
        defaulting to the reply itself), ``exa_end`` and ``out2_token``.
        """
        ids = self.fmt_array(reply[0]) + self.exa_end + self.tokenizer.eos_token
        if self.out2_use:
            if fault is None: fault = reply
            ids = self.fmt_array(fault[0]) + self.exa_end + self.out2_token + ids
        return ids

    def quick_test(self, decoded, done):
        """Cheap plausibility check of (partially) generated text.

        The text before the first EOS is split on ``dec_sep``. It passes when
        the first row is non-empty, the previous row (if any) has the same
        length, and the last row consists of digits and is not longer than the
        first. If ``done``, the last row must additionally be empty or complete.
        """
        sp = decoded.split(self.tokenizer.eos_token)[0].split(self.dec_sep)
        sl = len(sp[0])
        is_prefix = sl>0 and len(sp[-1])<=sl and (len(sp)==1 or len(sp[-2])==sl) and all(x.isdigit() for x in sp[-1])
        return is_prefix and (not done or len(sp[-1])==0 or len(sp[-1])==sl)

    @staticmethod
    def is_valid_solution(guess):
        """Return True for a 2-D numpy array whose sides are between 1 and 30."""
        return isinstance(guess, np.ndarray) and guess.ndim == 2 and all(0 < x <= 30 for x in guess.shape)

    def max_new_tokens(self, safety_margin=1):
        """Return the token count of a formatted 30x30 reply plus ``safety_margin``.

        A leading BOS token added by the tokenizer is not counted.
        """
        max_sized_reply = np.zeros([30, 30], dtype=int)
        tokenized = self.tokenizer(self.fmt_reply([max_sized_reply]))['input_ids']
        max_new_tokens = len(tokenized)
        if tokenized[0]==self.tokenizer.bos_token_id: max_new_tokens -= 1
        return max_new_tokens + safety_margin

    def de_tokenize(self, tokens, scores=None):
        """Decode generated ``tokens`` and, optionally, summarize their ``scores``.

        Returns a 4-tuple ``(length, score_val, text, scores)``:
        ``score_val`` is the summed log-softmax score of the emitted tokens up
        to and including the token after the cut (``None`` without scores);
        ``scores`` is reduced to the positions holding digit tokens (or the
        configured fault token) and to log-probabilities over the ten digits.
        """
        import torch
        # cut_at_token is defined elsewhere in this module; presumably it truncates at the first EOS - TODO confirm
        tokens_cut = cut_at_token(tokens, self.tokenizer.eos_token_id)
        de_tokenized = self.tokenizer.batch_decode([tokens_cut])[0]
        score_val = None
        if scores is not None:
            tokens_with_eos = tokens[:len(tokens_cut)+1]
            score_val = torch.nn.functional.log_softmax(torch.tensor(scores), dim=-1).numpy().copy()[np.arange(len(tokens_with_eos)), tokens_with_eos].sum()
            number_token_ids = [self.tokenizer.vocab[k] for k in map(str, range(10))]
            fault_token_id = self.collator_kwargs.get('fault_token_id')
            if fault_token_id is not None: number_token_ids.append(fault_token_id)
            number_token_ids = np.array(number_token_ids)
            number_positions = (tokens_cut[..., np.newaxis] == number_token_ids).any(-1)
            scores = scores[:len(tokens_cut), number_token_ids][number_positions]
            # normalize over digits (+ fault token) only, then keep the ten digit columns
            scores = torch.nn.functional.log_softmax(torch.tensor(scores), dim=-1)[:, :10].numpy().copy()
        return max(len(tokens)+1, len(tokens_cut)), score_val, de_tokenized, scores

    def decode_to_array_single(self, text, score=None, limit_rows=30):
        """Parse one block of text into a grid plus per-cell score arrays.

        Rows are split on ``dec_sep``; non-digit characters and empty rows are
        dropped and at most ``limit_rows`` rows are kept. Returns a dict with
        keys ``output``, ``score``, ``score_cum``, ``score_all`` and
        ``score_all_cum``, or ``{}`` when no valid grid could be built.
        """
        try:
            by_rows = [row for row in [[int(x) for x in line if x.isdigit()] for line in text.split(self.dec_sep)] if len(row)]
            if limit_rows and len(by_rows) > limit_rows:
                by_rows = by_rows[:limit_rows]
                limited = True
            else: limited = False
            decoded = np.array(by_rows, dtype=int)
            if self.is_valid_solution(decoded):
                try:
                    assert score is not None
                    decoded_flat = decoded.ravel()
                    if limited: score = score[:len(decoded_flat)]
                    score_all = score.reshape(decoded.shape + score.shape[1:])
                    score_result = score[range(len(decoded_flat)), decoded_flat]
                    score_reshaped = score_result.reshape(decoded.shape)
                    score_cum_reshaped = score_result.cumsum().reshape(score_reshaped.shape)
                    score_all_cum = score_cum_reshaped[..., np.newaxis] - score_reshaped[..., np.newaxis] + score_all
                except Exception: score_reshaped = score_cum_reshaped = np.full(decoded.shape, -float('inf'))
                # NOTE(review): after the fallback above `score_all_cum` (and usually
                # `score_all`) is unbound, so the return below raises and the outer
                # handler yields {} - i.e. a grid without usable scores is discarded.
                # Kept as-is because callers may rely on it; confirm the intent.
                return {'output': decoded, 'score': score_reshaped, 'score_cum': score_cum_reshaped, 'score_all': score_all, 'score_all_cum': score_all_cum}
        except Exception: pass
        return {}

    def decode_to_array(self, text, score=None, limit_rows=30):
        """Decode ``text`` into a list of result dicts (see ``decode_to_array_single``).

        With ``out2_use`` the text is split on ``out2_token`` and ``score`` is
        split accordingly by counting the digits of every part.
        """
        if not self.out2_use: text, score = [text], [score]
        else:
            text = text.split(self.out2_token)
            if score is None: score = [None]*len(text)
            else:
                lengths = np.cumsum([len(list(filter(str.isdigit, t))) for t in text])
                score = [score[s:e] for s, e in zip([0]+lengths[:-1].tolist(), lengths)]
        return [self.decode_to_array_single(t, s) for t, s in zip(text, score)]

    def get_corpus(self):
        """Return a small sample text built from three 10x1 digit examples.

        ``min_wid`` is temporarily capped at 2 and restored afterwards.
        """
        try:
            old_min_wid, self.min_wid = self.min_wid, min(self.min_wid, 2)
            return self.fmt_train([{'input': [[i] for i in range(10)], 'output': [[i] for i in range(10)]}]*3, last_is_challenge=True, pretext_split=True)
        finally: self.min_wid = old_min_wid

    def get_data_collator(self):
        """Build the masking data collator, or return ``None`` if masking is off."""
        if not self.masking: return None
        from transformers import DataCollatorForLanguageModeling
        collator_params = dict(tokenizer=self.tokenizer, mlm=False)
        pass_out2_token = self.tokenizer.vocab[self.out2_token] if self.out2_use and self.masking==1 else None
        if self.masking:
            assert not self.collator_kwargs.get('mask_first_output') or self.masking==1
            # masking==1 uses the input/output prefixes as templates,
            # masking==2 uses BOS and the out2 token (or rpl_sep)
            data_collator = get_class_MyDataCollator()(
                **collator_params,
                instruction_template=[self.inp_prefix, self.tokenizer.bos_token][self.masking - 1],
                response_template=[self.out_prefix, (self.out2_token if self.out2_use else self.rpl_sep)][self.masking - 1],
            ).setup(out2_token_id=pass_out2_token, **self.collator_kwargs)
        else:
            # unreachable because of the early return above; kept for parity
            assert not self.collator_kwargs, 'only supported with masking on'
            data_collator = DataCollatorForLanguageModeling(**collator_params)
        return data_collator

    def get_output_token_ids(self):
        """Return the token ids of the ten digits followed by the sorted separator/EOS ids.

        The first token of every tokenized separator is dropped (presumably the
        BOS token added by the tokenizer - verify for the tokenizer in use).
        """
        assert not self.out2_use
        num_tokens = [self.tokenizer.vocab[str(i)] for i in range(10)]
        sep_tokens = [tok for txt in [self.arr_beg, self.arr_sep, self.arr_end, self.exa_sep] if txt for tok in self.tokenizer(txt)['input_ids'][1:]]
        sep_tokens.append(self.tokenizer.eos_token_id)
        return num_tokens + sorted(set(sep_tokens))

# Settings shared by every preset: grids use 'I'/'O' prefixes, newline-separated
# rows with a trailing newline, and prefix-based masking (masking=1).
_PRESET_COMMON = dict(masking=1, inp_prefix='I', out_prefix='O', arr_sep='\n', arr_end='\n', pretext_corpus_split='\n')
_PRETEXT_UPPER = 'ABCDEFGHJKLMNPQRSTUVWXYZ'
_PRETEXT_MIXED = _PRETEXT_UPPER + 'abcdefghjklmnpqrstuvwxyz'

# "pretext" presets only prepend a letter string; "premix" presets additionally
# insert '+/-=' before every output. Extra keyword arguments are forwarded to
# ArcFormatter unchanged.
ArcFormatter_pretext2 = lambda **kwargs: ArcFormatter(**_PRESET_COMMON, pretext=_PRETEXT_UPPER, **kwargs)
ArcFormatter_pretext3 = lambda **kwargs: ArcFormatter(**_PRESET_COMMON, pretext=_PRETEXT_MIXED, **kwargs)
ArcFormatter_premix_2 = lambda **kwargs: ArcFormatter(**_PRESET_COMMON, pretext=_PRETEXT_UPPER, pre_out=['+/-=']*99, **kwargs)
ArcFormatter_premix_3 = lambda **kwargs: ArcFormatter(**_PRESET_COMMON, pretext=_PRETEXT_MIXED, pre_out=['+/-=']*99, **kwargs)

# Lookup table: preset name -> factory.
available_formatters = {
    'ArcFormatter_pretext2': ArcFormatter_pretext2,
    'ArcFormatter_pretext3': ArcFormatter_pretext3,
    'ArcFormatter_premix_2': ArcFormatter_premix_2,
    'ArcFormatter_premix_3': ArcFormatter_premix_3,
}

Writing arc_loader.py


In [3]:
%%writefile selection.py
import numpy as np

def hashable(guess):
    """Return `guess` (an iterable of rows) as a tuple of row tuples, usable as a set/dict key."""
    return tuple(tuple(row) for row in guess)

def make_unique(guess_list, indices=None):
    """Drop duplicate grids from `guess_list`, keeping the first occurrence of each.

    Returns the surviving grids as numpy arrays. When `indices` is given, the
    matching entries of `indices` are returned as a second list.
    """
    seen = set()
    unique_guesses, unique_indices = [], []
    want_indices = indices is not None
    for pos, guess in enumerate(guess_list):
        key = hashable(guess)
        if key in seen:
            continue
        seen.add(key)
        unique_guesses.append(np.array(guess))
        if want_indices:
            unique_indices.append(indices[pos])
    if want_indices:
        return unique_guesses, unique_indices
    return unique_guesses

def first_only(guesses):
    """Return a list holding only the first guess's output (empty if there are no guesses)."""
    outputs = [entry['output'] for entry in guesses.values()]
    return outputs[:1]

def keep_order(guesses):
    """Return the outputs of all guesses in the dict's iteration order."""
    outputs = []
    for entry in guesses.values():
        outputs.append(entry['output'])
    return outputs

def keep_order_unique(guesses):
    """Return the guesses' outputs in order, with duplicate grids removed."""
    ordered_outputs = keep_order(guesses)
    return make_unique(ordered_outputs)

def get_best_shape_by_score(guess_list, getter, once_per_result=True):
    """Group guesses by output shape and return the best-scoring group.

    Every shape collects the positions of all guesses with that shape and the
    guesses used for scoring (each distinct output only once when
    `once_per_result` is set). `getter` maps the scoring guesses of a shape to
    a comparable value. Returns `(score, shape, positions)` for the shape with
    the highest score.
    """
    already_counted = set()
    per_shape = {}
    for pos, guess in enumerate(guess_list):
        output = guess['output']
        bucket = per_shape.setdefault(tuple(output.shape), [[], []])
        bucket[1].append(pos)  # positions are recorded even for duplicate outputs
        key = hashable(output)
        if key in already_counted:
            continue
        if once_per_result:
            already_counted.add(key)
        bucket[0].append(guess)
    ranked = sorted(
        [(getter(members), shape, positions) for shape, (members, positions) in per_shape.items()],
        key=lambda entry: entry[0],
        reverse=True,
    )
    return ranked[0]

def score_sum(guesses, getter, shape_getter=None, prefer_common_shape=True):
    """Rank the distinct outputs in `guesses`, best first.

    Guesses with identical outputs are pooled and scored via `getter`. With
    `prefer_common_shape`, outputs belonging to the best-scoring shape (as
    judged by `shape_getter`, default `getter`) are ranked ahead of all others.

    Args:
        guesses: mapping of guess id -> dict containing at least 'output'.
        getter: callable mapping a list of pooled guesses to a comparable score.
        shape_getter: optional scorer used for picking the preferred shape.
        prefer_common_shape: whether to prioritize the best-scoring shape.

    Returns:
        List of outputs ordered by (preferred shape, score), descending;
        an empty list when there are no guesses.
    """
    if shape_getter is None: shape_getter = getter
    guess_list = list(guesses.values())
    # Without guesses there is no best shape to pick; previously this raised an
    # IndexError for prefer_common_shape=True while returning [] otherwise.
    if not guess_list: return []
    common_shape_indices = set(get_best_shape_by_score(guess_list, shape_getter)[2]) if prefer_common_shape else []
    scores = {}
    for i, g in enumerate(guess_list):
        h = hashable(g['output'])
        # the first occurrence of an output decides its "common shape" flag
        x = scores[h] = scores.get(h, [i in common_shape_indices, [], g['output']])
        x[1].append(g)
    scores = [(cs, getter(sc), o) for cs, sc, o in scores.values()]
    scores = sorted(scores, key=(lambda x: x[:2]), reverse=True)
    ordered_outputs = [x[-1] for x in scores]
    return ordered_outputs

def getter_all_probsum(guesses):
    """Return the sum of exp(score_val) over the given guesses."""
    return sum(np.exp(guess['score_val']) for guess in guesses)


def score_all_probsum(guesses):
    """Rank outputs by their summed probability (see `getter_all_probsum`)."""
    return score_sum(guesses, getter_all_probsum)

def getter_full_probmul(p):
    """Build a scorer that combines inference and augmentation scores.

    The returned function adds `baseline` (default `p`) to every score, sums
    the shifted 'score_val' values over all guesses, and adds the mean over
    guesses of the summed, shifted 'score_multi_nl' values.
    """
    def _getter(guesses, baseline=p):
        inference_total = sum([guess['score_val'] + baseline for guess in guesses])
        per_guess_aug = [sum(s + baseline for s in guess['score_multi_nl']) for guess in guesses]
        return inference_total + np.mean(per_guess_aug)
    return _getter

def score_full_probmul_3(guesses):
    """Rank outputs with the combined score (baseline 3), without shape preference."""
    scorer = getter_full_probmul(3)
    return score_sum(guesses, scorer, prefer_common_shape=False)

# Registry of the selection strategies defined in this module. Each entry takes
# the guesses dict of one task and returns a list of candidate outputs.
selection_algorithms = [
    first_only,  # only the first guess
    keep_order,  # all guesses, original order
    keep_order_unique,  # original order, duplicates removed
    score_all_probsum,  # ranked by summed probability
    score_full_probmul_3,  # ranked by inference + augmentation score
]

Writing selection.py


In [4]:
%%writefile async_tools.py
import sys
import asyncio

async def stream_reader(stream, id, to):
    """Drain `stream` until EOF, optionally echoing it line by line.

    Args:
        stream: object with an awaitable `read(n=...)` returning bytes
            (empty bytes signal EOF).
        id: optional label; when not None every echoed line is prefixed
            with "<id>. ".
        to: text file object to print to, or None to discard the data.

    Non-empty lines are printed right-stripped. Bytes that are not valid UTF-8
    are replaced instead of aborting the reader, and a final line without a
    trailing newline is flushed at EOF instead of being dropped.
    """
    prefix = '' if id is None else f'{id}. '

    def emit(raw_line):
        # print one line unless it is blank after stripping
        raw_line = raw_line.rstrip()
        if raw_line: print(f"{prefix}{raw_line.decode('utf-8', errors='replace')}", file=to, end='\n', flush=True)

    pending = b''
    while True:
        chunk = await stream.read(n=4096)
        if not chunk: break
        if to is not None:
            # The sentinel byte keeps a trailing newline from being swallowed by
            # splitlines(); the last piece is the (possibly empty) unfinished line.
            *complete_lines, pending = (pending + chunk + b'X').splitlines()
            pending = pending[:-1]
            for line in complete_lines: emit(line)
    if to is not None and pending: emit(pending)

async def wait_for_subprocess(subprocess, print_output=False, id=None):
    """Drain stdout and stderr of `subprocess` concurrently, then return its exit code.

    With `print_output` the streams are echoed to this process's stdout and
    stderr (labelled with `id` if given); otherwise they are discarded.
    """
    out_target, err_target = (sys.stdout, sys.stderr) if print_output else (None, None)
    readers = [
        stream_reader(subprocess.stdout, id, out_target),
        stream_reader(subprocess.stderr, id, err_target),
    ]
    await asyncio.gather(*readers)
    return await subprocess.wait()

async def wait_for_subprocesses(*processes, print_output=False):
    """Wait for all `processes` concurrently and return their exit codes in order.

    Output lines are labelled with the process position only when more than
    one process is given.
    """
    label_outputs = len(processes) > 1
    waiters = [
        wait_for_subprocess(proc, print_output=print_output, id=(index if label_outputs else None))
        for index, proc in enumerate(processes)
    ]
    return await asyncio.gather(*waiters)

Writing async_tools.py


In [5]:
%%writefile common_stuff.py
# common configuration for training and evaluation
from arc_loader import *
from model_runner import *
from selection import *
from async_tools import *
import time
import random
import numpy as np
import torch


GLOBAL_SEED = 42


def set_all_seeds(seed=GLOBAL_SEED):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Also switches cuDNN to deterministic mode with benchmarking disabled and
    stores the seed in the PYTHONHASHSEED environment variable.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)


# Seed once at import time with the default seed.
set_all_seeds()

# paths
tmp_dir = '/kaggle/temp'
arc_challenge_file = '/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json'
# The solutions file must belong to the same split as the challenge file above;
# it previously pointed at the training solutions, which cannot be expected to
# contain the evaluation task ids.
arc_solutions_file = '/kaggle/input/arc-prize-2025/arc-agi_evaluation_solutions.json'
# Per-run scratch locations below tmp_dir: fine-tuned weights, raw inference
# outputs and augmentation-scoring results.
model_temp_storage = os.path.join(tmp_dir, 'finetuned_model')
infer_temp_storage = os.path.join(tmp_dir, 'inference_outputs')
score_temp_storage = os.path.join(tmp_dir, 'inference_scoring')

# load datasets
# The challenge set is loaded once when this module is imported and is used by
# the dataset/inference helpers further down.
arc_test_set = ArcDataset.from_file(arc_challenge_file)
# if arc_test_set.is_fake: arc_test_set.load_replies(arc_solutions_file)
# Overrides whatever the loader set, so the shortened "fake test set" code paths
# guarded by `is_fake` are never taken.
arc_test_set.is_fake = False  # force full run
# arc_train_set = ArcDataset.from_file('/kaggle/input/arc-prize-2024/arc-agi_training_challenges.json')

# models
MyFormatter = ArcFormatter_premix_3  # formatter preset used for training and inference
perm_aug = 'rnd_all'  # permutation augmentation mode passed to the dataset helpers
max_seq_length_train = 4224
mask_first = 1  # forwarded as `mask_first_output` to the data collator when training

# training & inference
train_epochs = 4
multi_gpu_train = True
multi_gpu_random_split = False
max_seq_length_infer = 8192
prime_on_single_task = True
num_active_layers = 32
infer_params = {'min_prob': 0.5, 'store': infer_temp_storage, 'use_turbo': True}

# scoring
use_aug_score = True
aug_score_params = {
    'tp': True,
    'rot': True,
    'perm': perm_aug,
    'shfl_ex': True,
    'make_unique': True,
    'max_len': max_seq_length_infer,
}
if use_aug_score:
    submission_select_algo = score_full_probmul_3
else:
    submission_select_algo = score_all_probsum


def prepare_run(model_path, load_lora=None, train=False, gpu=None, **kwargs):
    """Seed everything, pin the process to one GPU and load model + formatter.

    Args:
        model_path: path of the base (or previously fine-tuned) model.
        load_lora: optional LoRA weights to load; appended to the PEFT setup.
        train: when True, the LoRA config is created, `num_active_layers` is
            passed to the loader and the formatter's collator is configured
            with `mask_first_output`.
        gpu: GPU index; offsets the seed and is exported as
            CUDA_VISIBLE_DEVICES. None leaves the environment untouched.
        **kwargs: extra arguments for `prepare_model`, overriding the default
            `max_seq_length`.

    Returns:
        Tuple `(model, formatter)`.
    """
    seed = GLOBAL_SEED + (0 if gpu is None else gpu)
    set_all_seeds(seed)

    if gpu is not None:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    model_kwargs = dict(max_seq_length=max_seq_length_train)
    model_kwargs.update(kwargs)

    model, tokenizer, formatter = prepare_model(
        model=model_path,
        local_files_only=True,
        mode='unsloth_4bit',
        formatter=MyFormatter,
        # a LoRA config is only needed when training or when adapter weights are loaded
        peft=([dict(
            r=32,
            target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'embed_tokens', 'lm_head'],
            lora_alpha=128,
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing=True,
            random_state=42,
            use_rslora=True,
            loftq_config=None,
        )] if train or load_lora else []) + ([load_lora] if load_lora else []),
        num_active_layers=(num_active_layers if train else None),
        **model_kwargs
    )

    if train and mask_first:
        # Rebind a copy instead of updating in place: the formatter may still hold
        # the dict it was constructed with (e.g. a default shared between
        # instances), which must not be modified behind its back.
        formatter.collator_kwargs = dict(formatter.collator_kwargs, mask_first_output=mask_first)

    return model, formatter


def prepare_dataset(formatter, train, gpu=None):
    """Build the (augmented) dataset one worker trains on or runs inference on.

    Starts from `arc_test_set`; in multi-GPU mode with a `gpu` index the tasks
    are first split four ways. For training, replies are removed and the data
    is augmented and cut to the training sequence length. For inference, the
    data is augmented, cut to the inference length and reduced to at most five
    randomly chosen keys per base task.
    """
    set_all_seeds(GLOBAL_SEED + (0 if gpu is None else gpu))

    ds = arc_test_set
    if multi_gpu_train and gpu is not None:
        if multi_gpu_random_split:
            # contiguous quarters of a fixed shuffle; the last GPU also takes the remainder
            shuffled_keys = ds.shuffled(seed=123).keys
            quarter = len(shuffled_keys) // 4
            first = gpu * quarter
            last = (gpu + 1) * quarter if gpu < 3 else len(shuffled_keys)
            ds = ds.change_keys(shuffled_keys[first:last], keep_flags=True)
        else:
            ds = ds.sorted_by_len(formatter=formatter, name='input', max_of_transposed=True)
            # 4-GPU rotation pattern instead of 2-GPU: slots 0,1,2,3 repeated over
            # the length-sorted tasks, then reversed
            n_tasks = ds.length()
            assignment = [slot % 4 for slot in range(n_tasks)][::-1]
            own_mask = np.array(assignment) == gpu
            ds = ds.change_keys((np.array(ds.keys)[own_mask]).tolist())

    if arc_test_set.is_fake: ds.keys = ds.keys[:1]
    if train:
        ds = ds.remove_replies()
        ds = ds.augment(tp=True, rot=True, perm=perm_aug, n=(2 if arc_test_set.is_fake else train_epochs), shfl_ex=True, shfl_keys=True)
        ds = ds.cut_to_len(formatter=formatter, name='text', max_len=max_seq_length_train, max_new_tokens=0, quiet=True)
        if arc_test_set.is_fake: ds = ds.sorted_by_len(formatter=formatter, name='text', reverse=True)
        print(len(ds.keys))
    else:
        ds = ds.sorted_by_len(formatter=formatter, name='input', max_of_transposed=True)
        ds = ds.split_multi_replies()
        ds = ds.augment(tp=True, rot=True, n=2, seed=42, perm=perm_aug, shfl_ex=True).interleave(ds.length())
        ds = ds.cut_to_len(formatter=formatter, name='input', max_len=max_seq_length_infer, quiet=True)
        print(len(ds.keys))
        # group keys by the part before the first '.', then keep up to five
        # randomly permuted keys per group, visiting groups in sorted order
        keys_by_task = {}
        for key in ds.keys:
            keys_by_task.setdefault(key.split('.')[0], []).append(key)
        final_keys = []
        for base_key in sorted(keys_by_task):
            shuffled_group = np.random.permutation(keys_by_task[base_key]).tolist()
            final_keys.extend(shuffled_group[:5])
        ds = ds.change_keys(final_keys)
    return ds


def start_training(gpu):
    """Fine-tune the base model on this worker's share of the tasks.

    The result is stored under `<model_temp_storage>_gpu<gpu>`. A sibling
    `..._done` directory is always created at the end - also after a failure -
    which is what `start_inference` polls for.
    """
    set_all_seeds(GLOBAL_SEED + gpu)
    base_model = '/kaggle/input/mistral-hybrid/transformers/default/1/namannn/mistral-hybrid'
    storage_path = f'{model_temp_storage}_gpu{gpu}'

    try:
        # in single-GPU mode only worker 0 trains; the others just signal completion
        if gpu == 0 or multi_gpu_train:
            with RemapCudaOOM():
                model, formatter = prepare_run(base_model, train=True, gpu=gpu)
                dataset = prepare_dataset(formatter, train=True, gpu=gpu if multi_gpu_train else None)
                train_args = {
                    'per_device_train_batch_size': 8,
                    'gradient_accumulation_steps': 1,
                    'warmup_steps': 48,
                    'num_train_epochs': 1,
                    'max_steps': 5 if arc_test_set.is_fake else 240,
                    'learning_rate': 1e-4,
                    'embedding_learning_rate': 1e-5,
                    'logging_steps': 10,
                    'optim': "adamw_8bit",
                    'weight_decay': 0.01,
                    'lr_scheduler_type': 'cosine',  # "linear", "cosine",
                    'seed': 42,
                    'output_dir': os.path.join(tmp_dir, 'checkpoints'),
                    'save_strategy': "no",
                    'report_to': 'none',
                }
                model, trainer_stats = training_run(
                    model, formatter, dataset, store=storage_path,
                    max_seq_length=max_seq_length_train,
                    grad_acc_fix=False,
                    train_args=train_args,
                )
                mem_info()
    finally:
        os.makedirs(f'{storage_path}_done', exist_ok=True)


def start_inference(gpu):
    """Wait for the fine-tuned model, then run inference (and scoring) on one GPU.

    Polls every 15 seconds for the `..._done` marker of the matching training
    worker (worker 0 in single-GPU training mode), loads the stored model,
    optionally sets up per-task retraining and runs the decoder over this
    worker's dataset.
    """
    set_all_seeds(GLOBAL_SEED + gpu + 100)

    storage_path = f'{model_temp_storage}_gpu{gpu if multi_gpu_train else 0}'
    done_marker = f'{storage_path}_done'
    while not os.path.exists(done_marker): time.sleep(15)
    with RemapCudaOOM():
        model, formatter = prepare_run(storage_path, gpu=gpu)
        dataset = prepare_dataset(formatter, train=False, gpu=gpu)
        if prime_on_single_task:
            retrain_args = {
                'per_device_train_batch_size': 8,
                'gradient_accumulation_steps': 1,
                'warmup_steps': 4,
                'num_train_epochs': 1,
                'learning_rate': 5e-5,
                'embedding_learning_rate': 0,
                'logging_steps': 8,
                'optim': "adamw_8bit",
                'weight_decay': 0.01,
                'lr_scheduler_type': 'constant',  # "linear", "cosine",
                'seed': 42,
                'output_dir': 'tmp_output',
                'save_strategy': 'no',
                'report_to': 'none',
            }
            retrainer = Retrainer(
                n=128,
                aug_opts=dict(tp=True, rot=True, perm=perm_aug, shfl_ex=True),
                reload_state_dict=get_and_fix_peft_weights(storage_path),
                formatter=formatter,
                max_seq_length=max_seq_length_infer,
                grad_acc_fix=False,
                train_args=retrain_args,
            )
        else:
            retrainer = None
        decoder = Decoder(formatter, arc_test_set.split_multi_replies(), n_guesses=2, prob_baseline=0.05)
        inference_run_v2(model, formatter, dataset, decoder, retrain=retrainer, **infer_params)
        if use_aug_score or arc_test_set.is_fake:
            decoder.calc_augmented_scores(model=model, store=score_temp_storage, **aug_score_params)
        mem_info()


class RemapCudaOOM:
    """Context manager that flags GPU out-of-memory failures.

    When the guarded block raises an exception whose message contains one of
    the known out-of-memory phrases, 'submission.json' in the current directory
    is overwritten with a non-JSON marker text. The exception itself is never
    suppressed.
    """

    def __enter__(self):
        return None

    def __exit__(self, exc_type, exc_value, traceback):
        if not exc_value:
            return
        message = str(exc_value)
        oom_markers = (
            "CUDA out of memory",
            "Make sure you have enough GPU RAM",
            "does not fit any GPU's remaining memory",
        )
        if any(marker in message for marker in oom_markers):
            with open('submission.json', 'w') as f:
                f.write('cause submission scoring error')

Writing common_stuff.py


In [6]:
from common_stuff import *
import os
os.environ["WANDB_DISABLED"] = "true"

# One-time setup per session: the marker directory created at the end of this
# branch makes re-running the cell skip the install and the source patches.
if not os.path.exists(os.path.join(tmp_dir, 'unsloth_installed')):  # unsloth offline install - https://stackoverflow.com/a/51646354
    # install from the local wheelhouse only (--no-index), without network access
    !pip uninstall --yes torch accelerate
    !pip install --no-index --find-links=/kaggle/input/unsloth-2024-9-post4/wheelhouse unsloth
    #!pip uninstall --yes accelerate fastai torch torchaudio transformers
    #!pip install --no-index --find-links=/kaggle/input/unsloth-2024-10-7/wheelhouse unsloth  # do not use grad_acc_fix - trains very slow
    #!sed -i 's/if ((post_check - pre_check) >= 1).sum() > 1:/if False:/g' /opt/conda/lib/python3.10/site-packages/unsloth/models/llama.py
    # fix delay bug in get_statistics()
    # (the sed inserts `if False:` as the first line of the function body)
    !sed -i 's/^def get_statistics():/def get_statistics():\n if False:/g' /opt/conda/lib/python3.10/site-packages/unsloth/models/_utils.py
    # fix faulty unsloth multi-gpu detection
    # (the sed replaces the RuntimeError with `pass` in the listed files)
    !sed -i "s/raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')/pass/g" /opt/conda/lib/python3.10/site-packages/unsloth/tokenizer_utils.py /opt/conda/lib/python3.10/site-packages/unsloth/models/llama.py /opt/conda/lib/python3.10/site-packages/unsloth/models/vision.py
    os.makedirs(os.path.join(tmp_dir, 'unsloth_installed'), exist_ok=True)
    print('Unsloth installed & patched.')

# Remove stale "training done" markers from a previous run for every worker
# (GPUs 0-3). A leftover marker would let the matching inference worker start
# before its training worker has finished.
for gpu in range(4):
    signal_path = f'{model_temp_storage}_gpu{gpu}_done'
    if os.path.exists(signal_path): os.rmdir(signal_path)

# Placeholder for manual cleanup of the temp directory while debugging; every
# shell command is commented out, so this branch currently does nothing.
if arc_test_set.is_fake:  # cleanup? (for debugging)
    #!rm -R /kaggle/temp/finetuned_model*
    #!rm -R /kaggle/temp/inference_outputs
    #!rm -R /kaggle/temp/inference_scoring
    #!ls /kaggle/temp
    pass


ModuleNotFoundError: No module named 'tqdm'

In [7]:
# Simplified ARC data visualization script (English version)
from arc_loader import *
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
import json
import os

# Create ARC color map
# Ten fixed colors, one per cell value 0-9; `norm` maps the values onto them.
cmap = colors.ListedColormap(
    ['#000000', '#0074D9', '#FF4136', '#2ECC40', '#FFDC00',
     '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25'])
norm = colors.Normalize(vmin=0, vmax=9)

# Load data directly from file
# NOTE: this rebinds `arc_challenge_file` for this cell to the test challenges.
arc_challenge_file = '/kaggle/input/arc-prize-2025/arc-agi_test_challenges.json'

# Load original data
with open(arc_challenge_file, 'r') as f:
    arc_data = json.load(f)

# Set random seeds
# NOTE(review): `random` is not imported in this cell; presumably it comes in
# through the star import from arc_loader - verify.
np.random.seed(42)
random.seed(42)

def visualize_arc_example(train_data, test_data, task_id):
    """Visualize training and test data for an ARC task.

    Draws one column per training example (input on the top row, output on the
    bottom row) and, if present, the first test input in a separate figure.

    Args:
        train_data: list of dicts with 'input' and 'output' grids.
        test_data: list of dicts with at least an 'input' grid.
        task_id: identifier shown in the figure titles.
    """
    n_train = len(train_data)
    n_test = len(test_data)
    # at least one column so that an empty task does not make subplots() fail
    n_cols = max(n_train, n_test, 1)

    # squeeze=False keeps `axes` two-dimensional even for a single column, so
    # that axes[row, col] indexing below always works
    fig, axes = plt.subplots(2, n_cols, figsize=(4*n_cols, 8), squeeze=False)
    fig.suptitle(f"Task ID: {task_id}", fontsize=16)

    # Visualize training data: row 0 holds inputs, row 1 holds outputs
    for i in range(n_train):
        for row, (key, label) in enumerate((('input', 'Input'), ('output', 'Output'))):
            ax = axes[row, i]
            ax.imshow(train_data[i][key], cmap=cmap, norm=norm)
            ax.grid(True, which='both', color='lightgrey', linewidth=0.5)
            ax.set_title(f"Training #{i+1} - {label}")
            ax.set_xticks([])
            ax.set_yticks([])

    # Hide the columns that have no training example
    for i in range(n_train, n_cols):
        axes[0, i].axis('off')
        axes[1, i].axis('off')

    # Lay out the grid figure explicitly; the pyplot-level calls would act on
    # whichever figure is current, which is no longer this one once the test
    # figure exists or the figures have been shown
    fig.tight_layout()
    fig.subplots_adjust(top=0.9)

    # Show first test input in a separate figure
    if n_test > 0:
        plt.figure(figsize=(5, 5))
        plt.imshow(test_data[0]['input'], cmap=cmap, norm=norm)
        plt.grid(True, which='both', color='lightgrey', linewidth=0.5)
        plt.title(f"Test Input - {task_id}")
        plt.xticks([])
        plt.yticks([])

    plt.show()

# Simulate 4 GPU data splitting
task_ids = list(arc_data)
random.shuffle(task_ids)  # Shuffle task order

# Assign tasks to each GPU: simple equal division, each GPU gets a contiguous
# quarter of the shuffled task list
n_tasks = len(task_ids)
gpu_tasks = {
    gpu_id: task_ids[gpu_id * n_tasks // 4:(gpu_id + 1) * n_tasks // 4]
    for gpu_id in range(4)
}

# Display training data samples for each GPU
for gpu_id, assigned_tasks in gpu_tasks.items():
    print(f"\n{'='*40}\nGPU {gpu_id} Training Data Samples\n{'='*40}")
    print(f"GPU {gpu_id} assigned {len(assigned_tasks)} training tasks")

    # Show only first 3 examples
    for task_id in assigned_tasks[:3]:
        print(f"\nTask: {task_id}")

        # Get training and test data for this task
        task = arc_data[task_id]
        train_data, test_data = task['train'], task['test']

        # Visualize
        visualize_arc_example(train_data, test_data, task_id)

        # Print data matrices of the first training example
        first_example = train_data[0]
        print("Training Input (first example):")
        print(np.array(first_example['input']))
        print("\nTraining Output (first example):")
        print(np.array(first_example['output']))
        print("-" * 40)

ModuleNotFoundError: No module named 'tqdm'

In [10]:
%%python --bg --proc train_proc0
# Fine-tuning worker for GPU 0.
from common_stuff import *
start_training(gpu=0)

In [11]:
%%python --bg --proc train_proc1
# Fine-tuning worker for GPU 1.
from common_stuff import *
start_training(gpu=1)

In [12]:
%%python --bg --proc train_proc2
# Fine-tuning worker for GPU 2.
from common_stuff import *
start_training(gpu=2)

In [13]:
%%python --bg --proc train_proc3
# Fine-tuning worker for GPU 3.
from common_stuff import *
start_training(gpu=3)

In [14]:
%%python --bg --proc infer_proc0
# Inference worker for GPU 0; it waits for the training-done marker before loading the model.
from common_stuff import *
start_inference(gpu=0)

In [15]:
%%python --bg --proc infer_proc1
# Inference worker for GPU 1; it waits for the training-done marker before loading the model.
from common_stuff import *
start_inference(gpu=1)

In [16]:
%%python --bg --proc infer_proc2
# Inference worker for GPU 2; it waits for the training-done marker before loading the model.
from common_stuff import *
start_inference(gpu=2)

In [17]:
%%python --bg --proc infer_proc3
# Body of a `%%python --bg --proc infer_proc3` cell: it runs in a separate
# background Python process whose handle is kept in `infer_proc3` (later passed
# to wait_for_subprocesses).  The wildcard import brings start_inference into scope.
# NOTE(review): launched alongside the training workers; how it synchronises
# with training is decided inside start_inference - confirm in common_stuff.py.
from common_stuff import *
start_inference(gpu=3)  # presumably selects GPU 3 / its share of the data - confirm in common_stuff.py

In [18]:
proc_exit_codes = await wait_for_subprocesses(
    train_proc0, train_proc1, train_proc2, train_proc3,
    infer_proc0, infer_proc1, infer_proc2, infer_proc3,
    print_output=True or arc_test_set.is_fake
)
print(f'*** Subprocesses exit codes: {proc_exit_codes}')
assert all(x==0 for x in proc_exit_codes)

3. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
2. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
1. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
0. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
2. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_

2. *** Load challanges from '/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json'...
2. *** -> Fake test set detected, setting flag 'is_fake' to True.
2. *** Load base model and tokenizer from '/kaggle/input/mistral-hybrid/transformers/default/1/namannn/mistral-hybrid'...
2. 🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
2. ==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.44.0.
2.    \\   /|    GPU: NVIDIA L4. Max memory: 22.278 GB. Platform = Linux.
2. O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
2. \        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
2.  "-____-"     Free Apache license: http://github.com/unslothai/unsloth
2. Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
1. *** Load challanges from '/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json'...
1. *** -> Fake test set detected, setting flag 'is_fake' to True.
1. ***

1. Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
2. Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
3. Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
0. Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
1. Loading checkpoint shards:  33%|███▎      | 1/3 [00:09<00:18,  9.15s/it]
2. Loading checkpoint shards:  33%|███▎      | 1/3 [00:09<00:19,  9.93s/it]
0. Loading checkpoint shards:  33%|███▎      | 1/3 [00:09<00:19,  9.74s/it]
3. Loading checkpoint shards:  33%|███▎      | 1/3 [00:09<00:19,  9.70s/it]
1. Loading checkpoint shards:  67%|██████▋   | 2/3 [00:17<00:08,  8.95s/it]
1. Loading checkpoint shards: 100%|██████████| 3/3 [00:26<00:00,  8.70s/it]
1. Loading checkpoint shards: 100%|██████████| 3/3 [00:26<00:00,  8.79s/it]
2. Loading checkpoint shards:  67%|██████▋   | 2/3 [00:18<00:09,  9.28s/it]
2. Loading checkpoint shards: 100%|██████████| 3/3 [00:27<00:00,  8.86s/it]
2. Loading checkpoint shards: 100%|█████████

1. *** Create new peft model...
1. Unsloth: Casting embed_tokens to float32
1. Unsloth: Casting lm_head to float32
1. *** Activating only the first 32 layers and freezing the rest...
1. *** -> Layers from 32 to 39 are frozen.
1. *** -> Trainable params after freezing: 75,907,072 (2.02%)
1. *** Augment dataset...
1. 960
3. *** Create new peft model...
3. Unsloth: Casting embed_tokens to float32
3. Unsloth: Casting lm_head to float32
3. *** Activating only the first 32 layers and freezing the rest...
3. *** -> Layers from 32 to 39 are frozen.
3. *** -> Trainable params after freezing: 75,907,072 (2.02%)
3. *** Augment dataset...
3. 960
2. *** Create new peft model...
2. Unsloth: Casting embed_tokens to float32
2. Unsloth: Casting lm_head to float32
2. *** Activating only the first 32 layers and freezing the rest...
2. *** -> Layers from 32 to 39 are frozen.
2. *** -> Trainable params after freezing: 75,907,072 (2.02%)
2. *** Augment dataset...
2. 960


1. Map:   0%|          | 0/960 [00:00<?, ? examples/s]
1. Map: 100%|██████████| 960/960 [00:01<00:00, 631.94 examples/s]
1. Map: 100%|██████████| 960/960 [00:01<00:00, 626.91 examples/s]
1. max_steps is given, it will override any value given in num_train_epochs
3. Map:   0%|          | 0/960 [00:00<?, ? examples/s]
3. Map: 100%|██████████| 960/960 [00:01<00:00, 833.53 examples/s]
3. Map: 100%|██████████| 960/960 [00:01<00:00, 826.77 examples/s]
2. Map:   0%|          | 0/960 [00:00<?, ? examples/s]
3. max_steps is given, it will override any value given in num_train_epochs
2. Map: 100%|██████████| 960/960 [00:01<00:00, 826.56 examples/s]
2. Map: 100%|██████████| 960/960 [00:01<00:00, 819.92 examples/s]
2. max_steps is given, it will override any value given in num_train_epochs


0. *** Create new peft model...
0. Unsloth: Casting embed_tokens to float32
0. Unsloth: Casting lm_head to float32
0. *** Activating only the first 32 layers and freezing the rest...
0. *** -> Layers from 32 to 39 are frozen.
0. *** -> Trainable params after freezing: 75,907,072 (2.02%)
0. *** Augment dataset...
0. 960


0. Map:   0%|          | 0/960 [00:00<?, ? examples/s]
0. Map: 100%|██████████| 960/960 [00:01<00:00, 860.89 examples/s]
0. Map: 100%|██████████| 960/960 [00:01<00:00, 853.29 examples/s]
0. max_steps is given, it will override any value given in num_train_epochs
1. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
1.    \\   /|    Num examples = 960 | Num Epochs = 2
1. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
1. \        /    Total batch size = 8 | Total steps = 240
1.  "-____-"     Number of trainable parameters = 75,907,072
3. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
3.    \\   /|    Num examples = 960 | Num Epochs = 2
3. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
3. \        /    Total batch size = 8 | Total steps = 240
3.  "-____-"     Number of trainable parameters = 75,907,072
2. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
2.    \\   /|    Num examples = 960

1. *** Start training run...
1. Unsloth: Setting lr = 1.00e-05 instead of 1.00e-04 for embed_tokens.
1. Unsloth: Setting lr = 1.00e-05 instead of 1.00e-04 for lm_head.


0. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
0.    \\   /|    Num examples = 960 | Num Epochs = 2
0. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
0. \        /    Total batch size = 8 | Total steps = 240
0.  "-____-"     Number of trainable parameters = 75,907,072


3. *** Start training run...
3. Unsloth: Setting lr = 1.00e-05 instead of 1.00e-04 for embed_tokens.
3. Unsloth: Setting lr = 1.00e-05 instead of 1.00e-04 for lm_head.
2. *** Start training run...
2. Unsloth: Setting lr = 1.00e-05 instead of 1.00e-04 for embed_tokens.
2. Unsloth: Setting lr = 1.00e-05 instead of 1.00e-04 for lm_head.
0. *** Start training run...
0. Unsloth: Setting lr = 1.00e-05 instead of 1.00e-04 for embed_tokens.
0. Unsloth: Setting lr = 1.00e-05 instead of 1.00e-04 for lm_head.


3.   with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
0.   with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
2.   with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
1.   with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
3.   with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
0.   with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
2.   with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
3.   0%|          | 1/240 [00:37<2:27:43, 37.08s/it]
0.   0%|        

3. {'loss': 0.1501, 'grad_norm': 0.7867273688316345, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.08}
3. {'loss': 0.1056, 'grad_norm': 1.294529914855957, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.17}
3. {'loss': 0.0681, 'grad_norm': 0.8311694860458374, 'learning_rate': 6.25e-05, 'epoch': 0.25}
3. {'loss': 0.0698, 'grad_norm': 0.7124578356742859, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.33}
3. {'loss': 0.0509, 'grad_norm': 0.7246041297912598, 'learning_rate': 9.997322937381829e-05, 'epoch': 0.42}
3. {'loss': 0.0485, 'grad_norm': 0.9762706160545349, 'learning_rate': 9.903926402016153e-05, 'epoch': 0.5}
3. {'loss': 0.0392, 'grad_norm': 0.4533740282058716, 'learning_rate': 9.67952963378663e-05, 'epoch': 0.58}
3. {'loss': 0.0281, 'grad_norm': 0.4865405857563019, 'learning_rate': 9.330127018922194e-05, 'epoch': 0.67}
3. {'loss': 0.0293, 'grad_norm': 0.7114354372024536, 'learning_rate': 8.865052266813685e-05, 'epoch': 0.75}
3. {'loss': 0.023, 'grad_norm': 0.891041159

1.  92%|█████████▏| 220/240 [2:06:02<12:34, 37.74s/it]
2.  95%|█████████▌| 228/240 [2:06:19<06:44, 33.72s/it]
7. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
7. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


7. *** Load challanges from '/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json'...
7. *** -> Fake test set detected, setting flag 'is_fake' to True.
7. *** Load base model and tokenizer from '/kaggle/temp/finetuned_model_gpu3'...
7. 🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
7. ==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.44.0.
7.    \\   /|    GPU: NVIDIA L4. Max memory: 22.278 GB. Platform = Linux.
7. O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
7. \        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
7.  "-____-"     Free Apache license: http://github.com/unslothai/unsloth
7. Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


7. Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
7. Loading checkpoint shards:  33%|███▎      | 1/3 [00:02<00:05,  2.69s/it]
7. Loading checkpoint shards:  67%|██████▋   | 2/3 [00:05<00:02,  2.65s/it]
7. Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.59s/it]
7. Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.61s/it]
7. Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.
0.  99%|█████████▉| 237/240 [2:06:36<01:36, 32.02s/it]


7. *** Augment dataset...
7. 704
7. *** Load peft state_dict from '/kaggle/temp/finetuned_model_gpu3'...
7. *** Load stored data...
7. *** Start inference run...
7.   0%|          | 0/30 [00:00<?, ?it/s]retraining model for key '16b78196' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 483.92 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 477.29 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


1.  92%|█████████▏| 221/240 [2:06:45<12:25, 39.24s/it]
2.  95%|█████████▌| 229/240 [2:06:56<06:23, 34.89s/it]
2.  96%|█████████▌| 230/240 [2:07:34<05:57, 35.72s/it]
0.  99%|█████████▉| 238/240 [2:07:13<01:06, 33.49s/it]
7.   0%|          | 0/16 [00:00<?, ?it/s][A
2.  96%|█████████▌| 230/240 [2:07:34<05:57, 35.72s/it]
1.  92%|█████████▎| 222/240 [2:07:28<12:06, 40.39s/it]
0. 100%|█████████▉| 239/240 [2:07:41<00:31, 31.96s/it]
0. 100%|██████████| 240/240 [2:08:18<00:00, 33.53s/it]
0. 100%|██████████| 240/240 [2:08:18<00:00, 33.53s/it]
0. 100%|██████████| 240/240 [2:08:18<00:00, 33.53s/it]
0. 100%|██████████| 240/240 [2:08:18<00:00, 32.08s/it]


0. {'loss': 0.1471, 'grad_norm': 1.2236523628234863, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.08}
0. {'loss': 0.0945, 'grad_norm': 0.9049831032752991, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.17}
0. {'loss': 0.0848, 'grad_norm': 0.8789643049240112, 'learning_rate': 6.25e-05, 'epoch': 0.25}
0. {'loss': 0.0763, 'grad_norm': 0.7441339492797852, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.33}
0. {'loss': 0.0504, 'grad_norm': 0.7478144764900208, 'learning_rate': 9.997322937381829e-05, 'epoch': 0.42}
0. {'loss': 0.0459, 'grad_norm': 0.8582068681716919, 'learning_rate': 9.903926402016153e-05, 'epoch': 0.5}
0. {'loss': 0.0501, 'grad_norm': 0.6952024698257446, 'learning_rate': 9.67952963378663e-05, 'epoch': 0.58}
0. {'loss': 0.0349, 'grad_norm': 1.5011838674545288, 'learning_rate': 9.330127018922194e-05, 'epoch': 0.67}
0. {'loss': 0.027, 'grad_norm': 0.5692598223686218, 'learning_rate': 8.865052266813685e-05, 'epoch': 0.75}
0. {'loss': 0.0313, 'grad_norm': 0.42170265

4. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
4. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


4. *** Load challanges from '/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json'...
4. *** -> Fake test set detected, setting flag 'is_fake' to True.
4. *** Load base model and tokenizer from '/kaggle/temp/finetuned_model_gpu0'...
4. 🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
4. ==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.44.0.
4.    \\   /|    GPU: NVIDIA L4. Max memory: 22.278 GB. Platform = Linux.
4. O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
4. \        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
4.  "-____-"     Free Apache license: http://github.com/unslothai/unsloth
4. Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


4. Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
4. Loading checkpoint shards:  33%|███▎      | 1/3 [00:02<00:05,  2.65s/it]
2.  96%|█████████▋| 231/240 [2:08:06<05:11, 34.61s/it]
4. Loading checkpoint shards:  67%|██████▋   | 2/3 [00:05<00:02,  2.64s/it]
4. Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.58s/it]
4. Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.60s/it]
1.  93%|█████████▎| 223/240 [2:08:08<11:22, 40.15s/it]
4. Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.
7.   6%|▋         | 1/16 [00:40<10:04, 40.31s/it][A


4. *** Augment dataset...
4. 688
4. *** Load peft state_dict from '/kaggle/temp/finetuned_model_gpu0'...
4. *** Load stored data...
4. *** Start inference run...
4.   0%|          | 0/30 [00:00<?, ?it/s]retraining model for key '1ae2feb7' (retrain_dataset_size=15)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1288.69 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
2.  97%|█████████▋| 232/240 [2:08:37<04:27, 33.40s/it]
1.  93%|█████████▎| 224/240 [2:08:40<10:06, 37.89s/it]
4.   6%|▋         | 1/16 [00:13<03:27, 13.84s/it][A
7.  12%|█▎        | 2/16 [01:19<09:12, 39.47s/it][A
4.  12%|█▎        | 2/16 [00:26<03:00, 12.89s/it][A
4.  19%|█▉        | 3/16 [00:38<02:43, 12.59s/it][A
2.  97%|█████████▋| 233/240 [2:09:07<03:48, 32.57s/it]
4.  25%|██▌       | 4/16 [00:50<02:29, 12.46s/it][A
1.  94%|█████████▍| 225/240 [2:09:15<09:15, 37.02s/it]
7.  19%|█▉        | 3/16 [01:58<08:30, 39.30s/it][A
4.  31%|███▏      | 5/16 [01:02<02:16, 12.39s/it][A
2.  98%|█████████▊| 234/240 [2:09:45<03:24, 34.17s/it]
4.  38%|███▊      | 6/16 [01:15<02:03, 12.37s/it][A
1.  94%|█████████▍| 226/240 [2:09:58<09:00, 38.64s/it]
4.  44%|████▍     | 7/16 [01:27<01:51, 12.35s/it][A


4. {'loss': 0.0097, 'grad_norm': 0.22342471778392792, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:39<01:38, 12.34s/it][A
4. [A
7.  25%|██▌       | 4/16 [02:37<07:51, 39.27s/it][A
4.  50%|█████     | 8/16 [01:39<01:38, 12.34s/it][A
2.  98%|█████████▊| 235/240 [2:10:16<02:45, 33.02s/it]
4.  56%|█████▋    | 9/16 [01:52<01:26, 12.34s/it][A
2.  98%|█████████▊| 236/240 [2:10:46<02:08, 32.24s/it]
1.  95%|█████████▍| 227/240 [2:10:26<07:44, 35.72s/it]
4.  62%|██████▎   | 10/16 [02:04<01:14, 12.34s/it][A
7.  31%|███▏      | 5/16 [03:16<07:11, 39.27s/it][A
4.  69%|██████▉   | 11/16 [02:16<01:01, 12.35s/it][A
4.  75%|███████▌  | 12/16 [02:29<00:49, 12.36s/it][A
2.  99%|█████████▉| 237/240 [2:11:07<01:26, 28.88s/it]
4.  81%|████████▏ | 13/16 [02:41<00:37, 12.35s/it][A
1.  95%|█████████▌| 228/240 [2:11:09<07:32, 37.70s/it]
4.  88%|████████▊ | 14/16 [02:53<00:24, 12.34s/it][A
7.  38%|███▊      | 6/16 [03:56<06:33, 39.31s/it][A
1.  95%|█████████▌| 229/240 [2:11:48<07:00, 38.26s/it]
1.  96%|█████████▌| 230/240 [2:12:06<05:19, 31.94s/it]
4.  94%|█████████▍| 

4.   0%|          | 0/30 [01:43<?, ?it/s]
4. {'loss': 0.001, 'grad_norm': 0.5332469940185547, 'learning_rate': 5e-05, 'epoch': 1.0}
4.   0%|          | 0/30 [03:21<?, ?it/s]
4. {'train_runtime': 198.5782, 'train_samples_per_second': 0.645, 'train_steps_per_second': 0.081, 'train_loss': 0.0053152337204664946, 'epoch': 1.0}


4. 100%|██████████| 16/16 [03:18<00:00, 12.34s/it][A
4. [A
4. 100%|██████████| 16/16 [03:18<00:00, 12.34s/it][A
4. [A
4. 100%|██████████| 16/16 [03:18<00:00, 12.34s/it][A
4. 100%|██████████| 16/16 [03:18<00:00, 12.41s/it]
2.  99%|█████████▉| 238/240 [2:11:45<01:03, 31.53s/it]
1.  96%|█████████▌| 230/240 [2:12:06<05:19, 31.94s/it]
7.  44%|████▍     | 7/16 [04:35<05:54, 39.35s/it][A


7. {'loss': 0.0026, 'grad_norm': 0.17885564267635345, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [05:15<05:15, 39.38s/it][A
7. [A
2. 100%|█████████▉| 239/240 [2:12:15<00:31, 31.24s/it]
2. 100%|██████████| 240/240 [2:12:53<00:00, 33.16s/it]
2. 100%|██████████| 240/240 [2:12:53<00:00, 33.16s/it]
2. 100%|██████████| 240/240 [2:12:53<00:00, 33.16s/it]
2. 100%|██████████| 240/240 [2:12:53<00:00, 33.22s/it]


2. {'loss': 0.0907, 'grad_norm': 1.0151928663253784, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.08}
2. {'loss': 0.0721, 'grad_norm': 0.8373411297798157, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.17}
2. {'loss': 0.0502, 'grad_norm': 0.46275344491004944, 'learning_rate': 6.25e-05, 'epoch': 0.25}
2. {'loss': 0.0587, 'grad_norm': 1.2480559349060059, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.33}
2. {'loss': 0.0378, 'grad_norm': 0.8364644050598145, 'learning_rate': 9.997322937381829e-05, 'epoch': 0.42}
2. {'loss': 0.0347, 'grad_norm': 0.7663957476615906, 'learning_rate': 9.903926402016153e-05, 'epoch': 0.5}
2. {'loss': 0.0348, 'grad_norm': 0.8123700618743896, 'learning_rate': 9.67952963378663e-05, 'epoch': 0.58}
2. {'loss': 0.0246, 'grad_norm': 0.4959016740322113, 'learning_rate': 9.330127018922194e-05, 'epoch': 0.67}
2. {'loss': 0.0259, 'grad_norm': 0.7990118265151978, 'learning_rate': 8.865052266813685e-05, 'epoch': 0.75}
2. {'loss': 0.0196, 'grad_norm': 0.707128

6. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
6. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


6. *** Load challanges from '/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json'...
6. *** -> Fake test set detected, setting flag 'is_fake' to True.
6. *** Load base model and tokenizer from '/kaggle/temp/finetuned_model_gpu2'...
6. 🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
6. ==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.44.0.
6.    \\   /|    GPU: NVIDIA L4. Max memory: 22.278 GB. Platform = Linux.
6. O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
6. \        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
6.  "-____-"     Free Apache license: http://github.com/unslothai/unsloth
6. Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


6. Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
6. Loading checkpoint shards:  33%|███▎      | 1/3 [00:02<00:05,  2.65s/it]
6. Loading checkpoint shards:  67%|██████▋   | 2/3 [00:05<00:02,  2.63s/it]
6. Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.57s/it]
6. Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.59s/it]
6. Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.
1.  96%|█████████▋| 231/240 [2:12:38<04:49, 32.15s/it]
7.  50%|█████     | 8/16 [05:15<05:15, 39.38s/it][A


6. *** Augment dataset...
6. 672
6. *** Load peft state_dict from '/kaggle/temp/finetuned_model_gpu2'...
6. *** Load stored data...
6. *** Start inference run...
6.   0%|          | 0/30 [00:00<?, ?it/s]retraining model for key '135a2760' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1363.25 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


6.   0%|          | 0/16 [00:00<?, ?it/s][A
1.  97%|█████████▋| 232/240 [2:13:13<04:23, 32.97s/it]
6.   6%|▋         | 1/16 [00:13<03:22, 13.52s/it][A
7.  56%|█████▋    | 9/16 [05:54<04:36, 39.43s/it][A
6.  12%|█▎        | 2/16 [00:25<02:55, 12.56s/it][A
6.  19%|█▉        | 3/16 [00:37<02:39, 12.29s/it][A
1.  97%|█████████▋| 233/240 [2:13:42<03:41, 31.62s/it]


4.   0%|          | 0/30 [03:21<?, ?it/s]*** -> Training took 198.5782 seconds.
4.   3%|▎         | 1/30 [05:28<2:38:35, 328.11s/it]retraining model for key '269e22fb' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 598.18 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 589.77 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


6.  25%|██▌       | 4/16 [00:49<02:25, 12.15s/it][A
6.  31%|███▏      | 5/16 [01:01<02:12, 12.08s/it][A
7.  62%|██████▎   | 10/16 [06:34<03:57, 39.66s/it][A
1.  98%|█████████▊| 234/240 [2:14:17<03:15, 32.64s/it]
6.  38%|███▊      | 6/16 [01:13<02:00, 12.04s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  44%|████▍     | 7/16 [01:25<01:48, 12.04s/it][A


6. {'loss': 0.0012, 'grad_norm': 0.05094897001981735, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:37<01:36, 12.05s/it][A
6. [A
6.  50%|█████     | 8/16 [01:37<01:36, 12.05s/it][A
1.  98%|█████████▊| 235/240 [2:14:42<02:32, 30.58s/it]
4.   6%|▋         | 1/16 [00:28<07:02, 28.16s/it][A
7.  69%|██████▉   | 11/16 [07:14<03:18, 39.73s/it][A
6.  56%|█████▋    | 9/16 [01:49<01:24, 12.07s/it][A
6.  62%|██████▎   | 10/16 [02:01<01:12, 12.08s/it][A
4.  12%|█▎        | 2/16 [00:56<06:33, 28.10s/it][A
6.  69%|██████▉   | 11/16 [02:13<01:00, 12.10s/it][A
1.  98%|█████████▊| 236/240 [2:15:17<02:07, 31.95s/it]
7.  75%|███████▌  | 12/16 [07:54<02:38, 39.74s/it][A
6.  75%|███████▌  | 12/16 [02:25<00:48, 12.10s/it][A
6.  81%|████████▏ | 13/16 [02:37<00:36, 12.12s/it][A
4.  19%|█▉        | 3/16 [01:24<06:05, 28.11s/it][A
1.  99%|█████████▉| 237/240 [2:15:53<01:38, 32.92s/it]
6.  88%|████████▊ | 14/16 [02:49<00:24, 12.08s/it][A
7.  81%|████████▏ | 13/16 [08:34<01:59, 39.74s/it][A
6.  94%|█████████▍| 15/16 [03:02<00:12, 12.09s/it][A


6.   0%|          | 0/30 [01:40<?, ?it/s]
6. {'loss': 0.0012, 'grad_norm': 0.822670578956604, 'learning_rate': 5e-05, 'epoch': 1.0}
6.   0%|          | 0/30 [03:17<?, ?it/s]
6. {'train_runtime': 194.1493, 'train_samples_per_second': 0.659, 'train_steps_per_second': 0.082, 'train_loss': 0.001207019027788192, 'epoch': 1.0}


6. 100%|██████████| 16/16 [03:14<00:00, 12.08s/it][A
6. [A
6. 100%|██████████| 16/16 [03:14<00:00, 12.08s/it][A
6. [A
6. 100%|██████████| 16/16 [03:14<00:00, 12.08s/it][A
6. 100%|██████████| 16/16 [03:14<00:00, 12.13s/it]
4.  25%|██▌       | 4/16 [01:52<05:37, 28.13s/it][A
1.  99%|█████████▉| 238/240 [2:16:17<01:00, 30.30s/it]
4.  31%|███▏      | 5/16 [02:20<05:09, 28.18s/it][A
7.  88%|████████▊ | 14/16 [09:13<01:19, 39.72s/it][A
4.  38%|███▊      | 6/16 [02:49<04:42, 28.20s/it][A
1. 100%|█████████▉| 239/240 [2:17:00<00:34, 34.07s/it]
1. 100%|██████████| 240/240 [2:17:42<00:00, 36.50s/it]
1. 100%|██████████| 240/240 [2:17:42<00:00, 36.50s/it]
1. 100%|██████████| 240/240 [2:17:42<00:00, 36.50s/it]
1. 100%|██████████| 240/240 [2:17:42<00:00, 34.43s/it]


1. {'loss': 0.1341, 'grad_norm': 2.360454797744751, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.08}
1. {'loss': 0.0941, 'grad_norm': 0.7104193568229675, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.17}
1. {'loss': 0.0759, 'grad_norm': 0.6203359365463257, 'learning_rate': 6.25e-05, 'epoch': 0.25}
1. {'loss': 0.0601, 'grad_norm': 0.7786904573440552, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.33}
1. {'loss': 0.0543, 'grad_norm': 1.0464279651641846, 'learning_rate': 9.997322937381829e-05, 'epoch': 0.42}
1. {'loss': 0.0387, 'grad_norm': 0.722430944442749, 'learning_rate': 9.903926402016153e-05, 'epoch': 0.5}
1. {'loss': 0.0411, 'grad_norm': 0.8128759264945984, 'learning_rate': 9.67952963378663e-05, 'epoch': 0.58}
1. {'loss': 0.0309, 'grad_norm': 0.5200320482254028, 'learning_rate': 9.330127018922194e-05, 'epoch': 0.67}
1. {'loss': 0.0255, 'grad_norm': 0.5879812240600586, 'learning_rate': 8.865052266813685e-05, 'epoch': 0.75}
1. {'loss': 0.025, 'grad_norm': 0.5690993666

7.  94%|█████████▍| 15/16 [09:53<00:39, 39.71s/it][A


7.   0%|          | 0/30 [05:19<?, ?it/s]
7. {'loss': 0.0006, 'grad_norm': 0.020181803032755852, 'learning_rate': 5e-05, 'epoch': 1.0}
7.   0%|          | 0/30 [10:37<?, ?it/s]
7. {'train_runtime': 633.425, 'train_samples_per_second': 0.202, 'train_steps_per_second': 0.025, 'train_loss': 0.0015664689417462796, 'epoch': 1.0}


7. 100%|██████████| 16/16 [10:33<00:00, 39.77s/it][A
7. [A
7. 100%|██████████| 16/16 [10:33<00:00, 39.77s/it][A
7. [A
7. 100%|██████████| 16/16 [10:33<00:00, 39.77s/it][A
7. 100%|██████████| 16/16 [10:33<00:00, 39.59s/it]
5. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
5. Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


5. *** Load challanges from '/kaggle/input/arc-prize-2025/arc-agi_evaluation_challenges.json'...
5. *** -> Fake test set detected, setting flag 'is_fake' to True.
5. *** Load base model and tokenizer from '/kaggle/temp/finetuned_model_gpu1'...
5. 🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
5. ==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.44.0.
5.    \\   /|    GPU: NVIDIA L4. Max memory: 22.278 GB. Platform = Linux.
5. O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
5. \        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
5.  "-____-"     Free Apache license: http://github.com/unslothai/unsloth
5. Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


5. Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
4.  44%|████▍     | 7/16 [03:17<04:14, 28.23s/it][A


4. {'loss': 0.0055, 'grad_norm': 0.223825603723526, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [03:45<03:46, 28.27s/it][A
4. [A
5. Loading checkpoint shards:  33%|███▎      | 1/3 [00:02<00:05,  2.66s/it]
5. Loading checkpoint shards:  67%|██████▋   | 2/3 [00:05<00:02,  2.65s/it]
5. Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.59s/it]
5. Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.61s/it]
5. Unsloth 2024.9.post4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


5. *** Augment dataset...
5. 688
5. *** Load peft state_dict from '/kaggle/temp/finetuned_model_gpu1'...
5. *** Load stored data...
5. *** Start inference run...
5.   0%|          | 0/30 [00:00<?, ?it/s]retraining model for key '0934a4d8' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 423.92 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 419.70 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  50%|█████     | 8/16 [03:45<03:46, 28.27s/it][A
4.  56%|█████▋    | 9/16 [04:14<03:18, 28.40s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  62%|██████▎   | 10/16 [04:42<02:50, 28.46s/it][A
5.   6%|▋         | 1/16 [00:42<10:41, 42.75s/it][A
4.  69%|██████▉   | 11/16 [05:11<02:22, 28.46s/it][A
4.  75%|███████▌  | 12/16 [05:39<01:53, 28.46s/it][A
5.  12%|█▎        | 2/16 [01:24<09:47, 41.98s/it][A
4.  81%|████████▏ | 13/16 [06:08<01:25, 28.48s/it][A
5.  19%|█▉        | 3/16 [02:05<09:04, 41.88s/it][A
4.  88%|████████▊ | 14/16 [06:36<00:56, 28.49s/it][A


7.   0%|          | 0/30 [10:37<?, ?it/s]*** -> Training took 633.425 seconds.
7.   3%|▎         | 1/30 [14:29<7:00:16, 869.54s/it]retraining model for key '16de56c4' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1872.27 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  94%|█████████▍| 15/16 [07:05<00:28, 28.48s/it][A


4.   3%|▎         | 1/30 [09:17<2:38:35, 328.11s/it]


4. 100%|██████████| 16/16 [07:33<00:00, 28.45s/it][A
4. [A
4. 100%|██████████| 16/16 [07:33<00:00, 28.45s/it][A
4. [A


4. {'loss': 0.0009, 'grad_norm': 0.3851648271083832, 'learning_rate': 5e-05, 'epoch': 1.0}
4.   3%|▎         | 1/30 [13:05<2:38:35, 328.11s/it]
4. {'train_runtime': 453.7301, 'train_samples_per_second': 0.282, 'train_steps_per_second': 0.035, 'train_loss': 0.00318871031049639, 'epoch': 1.0}


4. 100%|██████████| 16/16 [07:33<00:00, 28.45s/it][A
4. 100%|██████████| 16/16 [07:33<00:00, 28.36s/it]
5.  25%|██▌       | 4/16 [02:48<08:23, 41.98s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A


6.   0%|          | 0/30 [03:17<?, ?it/s]*** -> Training took 194.1493 seconds.
6.   3%|▎         | 1/30 [08:50<4:16:14, 530.17s/it]retraining model for key '136b0064' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1838.66 examples/s]
7.   6%|▋         | 1/16 [00:09<02:23,  9.56s/it][A
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


7.  12%|█▎        | 2/16 [00:19<02:13,  9.54s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  19%|█▉        | 3/16 [00:28<02:04,  9.54s/it][A
6.   6%|▋         | 1/16 [00:08<02:11,  8.77s/it][A
5.  31%|███▏      | 5/16 [03:30<07:42, 42.04s/it][A
6.  12%|█▎        | 2/16 [00:17<02:02,  8.74s/it][A
7.  25%|██▌       | 4/16 [00:38<01:54,  9.54s/it][A
6.  19%|█▉        | 3/16 [00:26<01:53,  8.74s/it][A
7.  31%|███▏      | 5/16 [00:47<01:44,  9.53s/it][A
6.  25%|██▌       | 4/16 [00:35<01:45,  8.76s/it][A
7.  38%|███▊      | 6/16 [00:57<01:35,  9.52s/it][A
6.  31%|███▏      | 5/16 [00:43<01:36,  8.75s/it][A
7.  44%|████▍     | 7/16 [01:06<01:25,  9.51s/it][A


7. {'loss': 0.0028, 'grad_norm': 0.4009556174278259, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:16<01:15,  9.49s/it][A
7. [A
6.  38%|███▊      | 6/16 [00:52<01:27,  8.75s/it][A
7.  50%|█████     | 8/16 [01:16<01:15,  9.49s/it][A
5.  38%|███▊      | 6/16 [04:12<07:01, 42.11s/it][A
6.  44%|████▍     | 7/16 [01:01<01:18,  8.74s/it][A


6. {'loss': 0.0355, 'grad_norm': 1.471221685409546, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:09<01:09,  8.74s/it][A
6. [A
7.  56%|█████▋    | 9/16 [01:25<01:06,  9.49s/it][A
6.  50%|█████     | 8/16 [01:09<01:09,  8.74s/it][A
7.  62%|██████▎   | 10/16 [01:35<00:56,  9.48s/it][A
6.  56%|█████▋    | 9/16 [01:18<01:01,  8.75s/it][A
7.  69%|██████▉   | 11/16 [01:44<00:47,  9.47s/it][A
6.  62%|██████▎   | 10/16 [01:27<00:52,  8.75s/it][A
7.  75%|███████▌  | 12/16 [01:53<00:37,  9.46s/it][A
6.  69%|██████▉   | 11/16 [01:36<00:43,  8.74s/it][A
5.  44%|████▍     | 7/16 [04:54<06:18, 42.10s/it][A


5. {'loss': 0.0064, 'grad_norm': 1.3462674617767334, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [05:36<05:36, 42.12s/it][A
5. [A
7.  81%|████████▏ | 13/16 [02:03<00:28,  9.45s/it][A
6.  75%|███████▌  | 12/16 [01:44<00:34,  8.74s/it][A
7.  88%|████████▊ | 14/16 [02:12<00:18,  9.44s/it][A
6.  81%|████████▏ | 13/16 [01:53<00:26,  8.74s/it][A


4.   3%|▎         | 1/30 [13:05<2:38:35, 328.11s/it]*** -> Training took 453.7301 seconds.


7.  94%|█████████▍| 15/16 [02:22<00:09,  9.44s/it][A


7.   3%|▎         | 1/30 [15:48<7:00:16, 869.54s/it]
7. {'loss': 0.0013, 'grad_norm': 0.23983830213546753, 'learning_rate': 5e-05, 'epoch': 1.0}
7.   3%|▎         | 1/30 [17:04<7:00:16, 869.54s/it]
7. {'train_runtime': 151.6363, 'train_samples_per_second': 0.844, 'train_steps_per_second': 0.106, 'train_loss': 0.0020786550594493747, 'epoch': 1.0}


7. 100%|██████████| 16/16 [02:31<00:00,  9.43s/it][A
7. [A
7. 100%|██████████| 16/16 [02:31<00:00,  9.43s/it][A
7. [A
7. 100%|██████████| 16/16 [02:31<00:00,  9.43s/it][A
7. 100%|██████████| 16/16 [02:31<00:00,  9.48s/it]


4.   7%|▋         | 2/30 [15:34<3:49:23, 491.56s/it]retraining model for key '271d71e2' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1432.52 examples/s]
6.  88%|████████▊ | 14/16 [02:02<00:17,  8.74s/it][A
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


6.  94%|█████████▍| 15/16 [02:11<00:08,  8.74s/it][A


6.   3%|▎         | 1/30 [10:03<4:16:14, 530.17s/it]
6. {'loss': 0.0052, 'grad_norm': 1.783237338066101, 'learning_rate': 5e-05, 'epoch': 1.0}
6.   3%|▎         | 1/30 [11:13<4:16:14, 530.17s/it]
6. {'train_runtime': 139.8643, 'train_samples_per_second': 0.915, 'train_steps_per_second': 0.114, 'train_loss': 0.020358080742880702, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:19<00:00,  8.73s/it][A
6. [A
6. 100%|██████████| 16/16 [02:19<00:00,  8.73s/it][A
6. [A
6. 100%|██████████| 16/16 [02:19<00:00,  8.73s/it][A
6. 100%|██████████| 16/16 [02:19<00:00,  8.74s/it]
4.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  50%|█████     | 8/16 [05:36<05:36, 42.12s/it][A
4.   6%|▋         | 1/16 [00:11<02:47, 11.17s/it][A
4.  12%|█▎        | 2/16 [00:22<02:35, 11.14s/it][A


6.   3%|▎         | 1/30 [11:13<4:16:14, 530.17s/it]*** -> Training took 139.8643 seconds.
6.   7%|▋         | 2/30 [11:47<2:30:35, 322.68s/it]retraining model for key '2b83f449' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1633.79 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  19%|█▉        | 3/16 [00:33<02:24, 11.13s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  25%|██▌       | 4/16 [00:44<02:13, 11.13s/it][A
5.  56%|█████▋    | 9/16 [06:18<04:55, 42.16s/it][A
6.   6%|▋         | 1/16 [00:10<02:33, 10.26s/it][A
4.  31%|███▏      | 5/16 [00:55<02:02, 11.13s/it][A
6.  12%|█▎        | 2/16 [00:20<02:24, 10.33s/it][A
4.  38%|███▊      | 6/16 [01:06<01:51, 11.13s/it][A
6.  19%|█▉        | 3/16 [00:30<02:14, 10.32s/it][A
4.  44%|████▍     | 7/16 [01:17<01:40, 11.12s/it][A


4. {'loss': 0.0114, 'grad_norm': 0.4247930943965912, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:28<01:28, 11.11s/it][A
4. [A
6.  25%|██▌       | 4/16 [00:41<02:03, 10.27s/it][A


7.   3%|▎         | 1/30 [17:04<7:00:16, 869.54s/it]*** -> Training took 151.6363 seconds.
7.   7%|▋         | 2/30 [18:46<3:57:35, 509.13s/it]retraining model for key '1818057f' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1500.34 examples/s]
4.  50%|█████     | 8/16 [01:28<01:28, 11.11s/it][A
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160
5.  62%|██████▎   | 10/16 [07:01<04:13, 42.21s/it][A


7. *** Start training run...


6.  31%|███▏      | 5/16 [00:51<01:52, 10.20s/it][A
4.  56%|█████▋    | 9/16 [01:40<01:17, 11.11s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  38%|███▊      | 6/16 [01:01<01:41, 10.14s/it][A
4.  62%|██████▎   | 10/16 [01:51<01:06, 11.11s/it][A
6.  44%|████▍     | 7/16 [01:11<01:31, 10.12s/it][A


6. {'loss': 0.0353, 'grad_norm': 0.3578176200389862, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:21<01:20, 10.09s/it][A
6. [A
7.   6%|▋         | 1/16 [00:11<02:56, 11.79s/it][A
4.  69%|██████▉   | 11/16 [02:02<00:55, 11.12s/it][A
6.  50%|█████     | 8/16 [01:21<01:20, 10.09s/it][A
7.  12%|█▎        | 2/16 [00:23<02:44, 11.76s/it][A
5.  69%|██████▉   | 11/16 [07:43<03:30, 42.14s/it][A
4.  75%|███████▌  | 12/16 [02:13<00:44, 11.13s/it][A
6.  56%|█████▋    | 9/16 [01:31<01:10, 10.09s/it][A
7.  19%|█▉        | 3/16 [00:35<02:33, 11.78s/it][A
6.  62%|██████▎   | 10/16 [01:41<01:00, 10.09s/it][A
4.  81%|████████▏ | 13/16 [02:24<00:33, 11.14s/it][A
7.  25%|██▌       | 4/16 [00:47<02:21, 11.79s/it][A
6.  69%|██████▉   | 11/16 [01:51<00:50, 10.08s/it][A
4.  88%|████████▊ | 14/16 [02:35<00:22, 11.14s/it][A
7.  31%|███▏      | 5/16 [00:58<02:09, 11.79s/it][A
6.  75%|███████▌  | 12/16 [02:01<00:40, 10.09s/it][A
4.  94%|█████████▍| 15/16 [02:46<00:11, 11.14s/it][A


4.   7%|▋         | 2/30 [17:06<3:49:23, 491.56s/it]
4. {'loss': 0.0015, 'grad_norm': 0.8052920699119568, 'learning_rate': 5e-05, 'epoch': 1.0}
4.   7%|▋         | 2/30 [18:35<3:49:23, 491.56s/it]
4. {'train_runtime': 178.0875, 'train_samples_per_second': 0.719, 'train_steps_per_second': 0.09, 'train_loss': 0.006414992094505578, 'epoch': 1.0}


4. 100%|██████████| 16/16 [02:58<00:00, 11.14s/it][A
4. [A
4. 100%|██████████| 16/16 [02:58<00:00, 11.14s/it][A
4. [A
4. 100%|██████████| 16/16 [02:58<00:00, 11.14s/it][A
4. 100%|██████████| 16/16 [02:58<00:00, 11.13s/it]
7.  38%|███▊      | 6/16 [01:10<01:58, 11.81s/it][A
6.  81%|████████▏ | 13/16 [02:11<00:30, 10.10s/it][A
5.  75%|███████▌  | 12/16 [08:25<02:48, 42.16s/it][A
6.  88%|████████▊ | 14/16 [02:21<00:20, 10.12s/it][A
7.  44%|████▍     | 7/16 [01:22<01:46, 11.81s/it][A


7. {'loss': 0.0086, 'grad_norm': 0.07340596616268158, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:34<01:34, 11.81s/it][A
7. [A
6.  94%|█████████▍| 15/16 [02:32<00:10, 10.13s/it][A


6.   7%|▋         | 2/30 [13:12<2:30:35, 322.68s/it]
6. {'loss': 0.0071, 'grad_norm': 0.3827928304672241, 'learning_rate': 5e-05, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:42<00:00, 10.12s/it][A
6. [A
6. 100%|██████████| 16/16 [02:42<00:00, 10.12s/it][A
6. [A


6.   7%|▋         | 2/30 [14:32<2:30:35, 322.68s/it]
6. {'train_runtime': 162.2313, 'train_samples_per_second': 0.789, 'train_steps_per_second': 0.099, 'train_loss': 0.021195611683651805, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:42<00:00, 10.12s/it][A
6. 100%|██████████| 16/16 [02:42<00:00, 10.14s/it]
7.  50%|█████     | 8/16 [01:34<01:34, 11.81s/it][A
7.  56%|█████▋    | 9/16 [01:46<01:22, 11.82s/it][A
5.  81%|████████▏ | 13/16 [09:07<02:06, 42.19s/it][A
7.  62%|██████▎   | 10/16 [01:58<01:10, 11.83s/it][A
7.  69%|██████▉   | 11/16 [02:09<00:59, 11.83s/it][A
7.  75%|███████▌  | 12/16 [02:21<00:47, 11.86s/it][A
7.  81%|████████▏ | 13/16 [02:33<00:35, 11.90s/it][A
5.  88%|████████▊ | 14/16 [09:49<01:24, 42.18s/it][A
7.  88%|████████▊ | 14/16 [02:45<00:23, 11.94s/it][A


6.   7%|▋         | 2/30 [14:32<2:30:35, 322.68s/it]*** -> Training took 162.2313 seconds.
6.  10%|█         | 3/30 [15:54<2:09:42, 288.23s/it]retraining model for key '2d0172a1' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 762.91 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 751.68 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


7.  94%|█████████▍| 15/16 [02:57<00:11, 11.96s/it][A


7.   7%|▋         | 2/30 [20:23<3:57:35, 509.13s/it]
7. {'loss': 0.0046, 'grad_norm': 0.07071002572774887, 'learning_rate': 5e-05, 'epoch': 1.0}
7.   7%|▋         | 2/30 [21:59<3:57:35, 509.13s/it]
7. {'train_runtime': 189.8117, 'train_samples_per_second': 0.674, 'train_steps_per_second': 0.084, 'train_loss': 0.006597050232812762, 'epoch': 1.0}


7. 100%|██████████| 16/16 [03:09<00:00, 11.95s/it][A
7. [A
7. 100%|██████████| 16/16 [03:09<00:00, 11.95s/it][A
7. [A
7. 100%|██████████| 16/16 [03:09<00:00, 11.95s/it][A
7. 100%|██████████| 16/16 [03:09<00:00, 11.86s/it]
5.  94%|█████████▍| 15/16 [10:32<00:42, 42.19s/it][A


5.   0%|          | 0/30 [05:41<?, ?it/s]
5. {'loss': 0.004, 'grad_norm': 0.058422211557626724, 'learning_rate': 5e-05, 'epoch': 1.0}
5.   0%|          | 0/30 [11:18<?, ?it/s]
5. {'train_runtime': 674.0827, 'train_samples_per_second': 0.19, 'train_steps_per_second': 0.024, 'train_loss': 0.00519835390150547, 'epoch': 1.0}


5. 100%|██████████| 16/16 [11:14<00:00, 42.12s/it][A
5. [A
5. 100%|██████████| 16/16 [11:14<00:00, 42.12s/it][A
5. [A
5. 100%|██████████| 16/16 [11:14<00:00, 42.12s/it][A
5. 100%|██████████| 16/16 [11:14<00:00, 42.13s/it]
6.   0%|          | 0/16 [00:00<?, ?it/s][A


5.   0%|          | 0/30 [11:18<?, ?it/s]*** -> Training took 674.0827 seconds.
5.   3%|▎         | 1/30 [11:30<5:33:48, 690.65s/it]retraining model for key '13e47133' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 835.01 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 822.61 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


4.   7%|▋         | 2/30 [18:35<3:49:23, 491.56s/it]*** -> Training took 178.0875 seconds.
5. *** Start training run...
4.  10%|█         | 3/30 [21:05<3:08:21, 418.59s/it]retraining model for key '291dc1e1' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1448.75 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


6.   6%|▋         | 1/16 [00:22<05:30, 22.04s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
4.   6%|▋         | 1/16 [00:11<02:46, 11.08s/it][A
6.  12%|█▎        | 2/16 [00:44<05:08, 22.01s/it][A
4.  12%|█▎        | 2/16 [00:22<02:34, 11.04s/it][A
5.   6%|▋         | 1/16 [00:21<05:19, 21.28s/it][A
4.  19%|█▉        | 3/16 [00:33<02:23, 11.03s/it][A
6.  19%|█▉        | 3/16 [01:06<04:46, 22.04s/it][A
4.  25%|██▌       | 4/16 [00:44<02:12, 11.01s/it][A
5.  12%|█▎        | 2/16 [00:42<04:58, 21.35s/it][A
4.  31%|███▏      | 5/16 [00:55<02:01, 11.00s/it][A
6.  25%|██▌       | 4/16 [01:28<04:23, 22.00s/it][A
4.  38%|███▊      | 6/16 [01:06<01:49, 10.99s/it][A
5.  19%|█▉        | 3/16 [01:03<04:36, 21.27s/it][A
4.  44%|████▍     | 7/16 [01:17<01:38, 11.00s/it][A


4. {'loss': 0.0305, 'grad_norm': 0.614399254322052, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:28<01:27, 10.99s/it][A
4. [A
6.  31%|███▏      | 5/16 [01:49<04:01, 21.97s/it][A
4.  50%|█████     | 8/16 [01:28<01:27, 10.99s/it][A
5.  25%|██▌       | 4/16 [01:24<04:14, 21.20s/it][A
4.  56%|█████▋    | 9/16 [01:39<01:16, 11.00s/it][A
6.  38%|███▊      | 6/16 [02:11<03:39, 21.97s/it][A
4.  62%|██████▎   | 10/16 [01:50<01:06, 11.01s/it][A
5.  31%|███▏      | 5/16 [01:46<03:52, 21.18s/it][A
4.  69%|██████▉   | 11/16 [02:01<00:55, 11.02s/it][A
6.  44%|████▍     | 7/16 [02:33<03:17, 21.97s/it][A


6. {'loss': 0.0183, 'grad_norm': 0.8062732815742493, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:55<02:55, 21.98s/it][A
6. [A
4.  75%|███████▌  | 12/16 [02:12<00:44, 11.03s/it][A
5.  38%|███▊      | 6/16 [02:07<03:31, 21.18s/it][A
4.  81%|████████▏ | 13/16 [02:23<00:33, 11.04s/it][A
6.  50%|█████     | 8/16 [02:55<02:55, 21.98s/it][A
4.  88%|████████▊ | 14/16 [02:34<00:22, 11.04s/it][A
5.  44%|████▍     | 7/16 [02:28<03:10, 21.18s/it][A


5. {'loss': 0.0038, 'grad_norm': 0.1412174105644226, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:49<02:49, 21.16s/it][A
5. [A
4.  94%|█████████▍| 15/16 [02:45<00:11, 11.04s/it][A


4.  10%|█         | 3/30 [22:37<3:08:21, 418.59s/it]
4. {'loss': 0.0058, 'grad_norm': 0.9204539060592651, 'learning_rate': 5e-05, 'epoch': 1.0}


4. 100%|██████████| 16/16 [02:56<00:00, 11.03s/it][A
4. [A


4.  10%|█         | 3/30 [24:05<3:08:21, 418.59s/it]
4. {'train_runtime': 176.3531, 'train_samples_per_second': 0.726, 'train_steps_per_second': 0.091, 'train_loss': 0.01817387924529612, 'epoch': 1.0}


4. 100%|██████████| 16/16 [02:56<00:00, 11.03s/it][A
4. [A
4. 100%|██████████| 16/16 [02:56<00:00, 11.03s/it][A
4. 100%|██████████| 16/16 [02:56<00:00, 11.02s/it]
6.  56%|█████▋    | 9/16 [03:17<02:33, 21.98s/it][A


7.   7%|▋         | 2/30 [21:59<3:57:35, 509.13s/it]*** -> Training took 189.8117 seconds.
7.  10%|█         | 3/30 [25:42<3:30:04, 466.83s/it]retraining model for key '20270e3b' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 2849.33 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...
4.  10%|█         | 3/30 [24:05<3:08:21, 418.59s/it]*** -> Training took 176.3531 seconds.


5.  50%|█████     | 8/16 [02:49<02:49, 21.16s/it][A


4.  13%|█▎        | 4/30 [24:16<2:22:19, 328.43s/it]retraining model for key '2c181942' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 575.57 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 568.02 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


7.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   6%|▋         | 1/16 [00:05<01:29,  5.96s/it][A
6.  62%|██████▎   | 10/16 [03:39<02:11, 21.98s/it][A
7.  12%|█▎        | 2/16 [00:11<01:22,  5.92s/it][A
5.  56%|█████▋    | 9/16 [03:10<02:28, 21.16s/it][A
7.  19%|█▉        | 3/16 [00:17<01:16,  5.91s/it][A
7.  25%|██▌       | 4/16 [00:23<01:10,  5.91s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  31%|███▏      | 5/16 [00:29<01:04,  5.90s/it][A
6.  69%|██████▉   | 11/16 [04:01<01:49, 21.97s/it][A
7.  38%|███▊      | 6/16 [00:35<00:59,  5.90s/it][A
5.  62%|██████▎   | 10/16 [03:31<02:06, 21.15s/it][A
7.  44%|████▍     | 7/16 [00:41<00:53,  5.90s/it][A


7. {'loss': 0.0106, 'grad_norm': 1.7349493503570557, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [00:47<00:47,  5.90s/it][A
7. [A
7.  50%|█████     | 8/16 [00:47<00:47,  5.90s/it][A
6.  75%|███████▌  | 12/16 [04:23<01:27, 21.96s/it][A
7.  56%|█████▋    | 9/16 [00:53<00:41,  5.90s/it][A
4.   6%|▋         | 1/16 [00:29<07:18, 29.26s/it][A
5.  69%|██████▉   | 11/16 [03:53<01:45, 21.15s/it][A
7.  62%|██████▎   | 10/16 [00:59<00:35,  5.91s/it][A
7.  69%|██████▉   | 11/16 [01:04<00:29,  5.91s/it][A
7.  75%|███████▌  | 12/16 [01:10<00:23,  5.92s/it][A
6.  81%|████████▏ | 13/16 [04:45<01:05, 21.96s/it][A
7.  81%|████████▏ | 13/16 [01:16<00:17,  5.91s/it][A
5.  75%|███████▌  | 12/16 [04:14<01:24, 21.16s/it][A
7.  88%|████████▊ | 14/16 [01:22<00:11,  5.91s/it][A
4.  12%|█▎        | 2/16 [00:58<06:49, 29.27s/it][A
7.  94%|█████████▍| 15/16 [01:28<00:05,  5.91s/it][A


7.  10%|█         | 3/30 [26:33<3:30:04, 466.83s/it]
7. {'loss': 0.0014, 'grad_norm': 0.04534924030303955, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  10%|█         | 3/30 [27:20<3:30:04, 466.83s/it]
7. {'train_runtime': 94.5337, 'train_samples_per_second': 1.354, 'train_steps_per_second': 0.169, 'train_loss': 0.005970537837129086, 'epoch': 1.0}


7. 100%|██████████| 16/16 [01:34<00:00,  5.91s/it][A
7. [A
7. 100%|██████████| 16/16 [01:34<00:00,  5.91s/it][A
7. [A
7. 100%|██████████| 16/16 [01:34<00:00,  5.91s/it][A
7. 100%|██████████| 16/16 [01:34<00:00,  5.91s/it]
6.  88%|████████▊ | 14/16 [05:07<00:43, 21.97s/it][A
5.  81%|████████▏ | 13/16 [04:35<01:03, 21.15s/it][A
4.  19%|█▉        | 3/16 [01:27<06:20, 29.29s/it][A
6.  94%|█████████▍| 15/16 [05:29<00:21, 21.95s/it][A


6.  10%|█         | 3/30 [18:54<2:09:42, 288.23s/it]
6. {'loss': 0.005, 'grad_norm': 0.6942006349563599, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  10%|█         | 3/30 [21:50<2:09:42, 288.23s/it]


6. 100%|██████████| 16/16 [05:51<00:00, 21.94s/it][A
6. [A
6. 100%|██████████| 16/16 [05:51<00:00, 21.94s/it][A
6. [A


6. {'train_runtime': 351.5196, 'train_samples_per_second': 0.364, 'train_steps_per_second': 0.046, 'train_loss': 0.011648109648376703, 'epoch': 1.0}


6. 100%|██████████| 16/16 [05:51<00:00, 21.94s/it][A
6. 100%|██████████| 16/16 [05:51<00:00, 21.97s/it]
5.  88%|████████▊ | 14/16 [04:56<00:42, 21.16s/it][A


7.  10%|█         | 3/30 [27:20<3:30:04, 466.83s/it]*** -> Training took 94.5337 seconds.
7.  13%|█▎        | 4/30 [28:03<2:26:27, 337.96s/it]retraining model for key '20a9e565' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 632.36 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 623.64 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


5.  94%|█████████▍| 15/16 [05:17<00:21, 21.16s/it][A


5.   3%|▎         | 1/30 [14:24<5:33:48, 690.65s/it]
5. {'loss': 0.0012, 'grad_norm': 0.4062530994415283, 'learning_rate': 5e-05, 'epoch': 1.0}
5.   3%|▎         | 1/30 [17:13<5:33:48, 690.65s/it]
5. {'train_runtime': 338.863, 'train_samples_per_second': 0.378, 'train_steps_per_second': 0.047, 'train_loss': 0.0024959638831205666, 'epoch': 1.0}


5. 100%|██████████| 16/16 [05:38<00:00, 21.17s/it][A
5. [A
5. 100%|██████████| 16/16 [05:38<00:00, 21.17s/it][A
5. [A
5. 100%|██████████| 16/16 [05:38<00:00, 21.17s/it][A
5. 100%|██████████| 16/16 [05:38<00:00, 21.18s/it]
4.  25%|██▌       | 4/16 [01:57<05:51, 29.32s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  31%|███▏      | 5/16 [02:26<05:22, 29.34s/it][A
7.   6%|▋         | 1/16 [00:29<07:25, 29.70s/it][A


6.  10%|█         | 3/30 [21:50<2:09:42, 288.23s/it]*** -> Training took 351.5196 seconds.


4.  38%|███▊      | 6/16 [02:55<04:53, 29.35s/it][A


6.  13%|█▎        | 4/30 [23:14<2:30:47, 348.00s/it]retraining model for key '36a08778' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 784.05 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 772.85 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


7.  12%|█▎        | 2/16 [00:59<06:56, 29.76s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  44%|████▍     | 7/16 [03:25<04:24, 29.36s/it][A


4. {'loss': 0.0048, 'grad_norm': 0.23753371834754944, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [03:54<03:55, 29.38s/it][A
4. [A
6.   6%|▋         | 1/16 [00:22<05:33, 22.23s/it][A
7.  19%|█▉        | 3/16 [01:29<06:28, 29.86s/it][A
4.  50%|█████     | 8/16 [03:54<03:55, 29.38s/it][A


5.   3%|▎         | 1/30 [17:13<5:33:48, 690.65s/it]*** -> Training took 338.863 seconds.
5.   7%|▋         | 2/30 [19:21<4:21:55, 561.27s/it]retraining model for key '142ca369' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 693.16 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 683.30 examples/s]
6.  12%|█▎        | 2/16 [00:44<05:11, 22.22s/it][A
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


7.  25%|██▌       | 4/16 [01:59<05:59, 29.93s/it][A
4.  56%|█████▋    | 9/16 [04:24<03:25, 29.40s/it][A
6.  19%|█▉        | 3/16 [01:06<04:48, 22.16s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  31%|███▏      | 5/16 [02:29<05:29, 29.94s/it][A
6.  25%|██▌       | 4/16 [01:28<04:25, 22.12s/it][A
4.  62%|██████▎   | 10/16 [04:53<02:56, 29.42s/it][A
5.   6%|▋         | 1/16 [00:25<06:22, 25.50s/it][A
6.  31%|███▏      | 5/16 [01:50<04:03, 22.11s/it][A
7.  38%|███▊      | 6/16 [02:59<04:59, 29.95s/it][A
4.  69%|██████▉   | 11/16 [05:23<02:27, 29.45s/it][A
5.  12%|█▎        | 2/16 [00:51<05:57, 25.56s/it][A
6.  38%|███▊      | 6/16 [02:12<03:40, 22.10s/it][A
7.  44%|████▍     | 7/16 [03:29<04:29, 29.97s/it][A
7.  50%|█████     | 8/16 [03:59<03:59, 29.98s/it][A
7. [A


7. {'loss': 0.0259, 'grad_norm': 2.005958318710327, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  19%|█▉        | 3/16 [01:16<05:32, 25.54s/it][A
4.  75%|███████▌  | 12/16 [05:52<01:57, 29.46s/it][A
6.  44%|████▍     | 7/16 [02:34<03:19, 22.11s/it][A


6. {'loss': 0.0037, 'grad_norm': 0.21396669745445251, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:57<02:57, 22.13s/it][A
6. [A
5.  25%|██▌       | 4/16 [01:42<05:06, 25.51s/it][A
7.  50%|█████     | 8/16 [03:59<03:59, 29.98s/it][A
6.  50%|█████     | 8/16 [02:57<02:57, 22.13s/it][A
4.  81%|████████▏ | 13/16 [06:22<01:28, 29.46s/it][A
5.  31%|███▏      | 5/16 [02:07<04:40, 25.50s/it][A
6.  56%|█████▋    | 9/16 [03:19<02:34, 22.13s/it][A
7.  56%|█████▋    | 9/16 [04:29<03:30, 30.00s/it][A
4.  88%|████████▊ | 14/16 [06:51<00:58, 29.44s/it][A
6.  62%|██████▎   | 10/16 [03:41<02:12, 22.13s/it][A
5.  38%|███▊      | 6/16 [02:33<04:14, 25.50s/it][A
7.  62%|██████▎   | 10/16 [04:59<02:59, 30.00s/it][A
4.  94%|█████████▍| 15/16 [07:20<00:29, 29.43s/it][A


4.  13%|█▎        | 4/30 [28:14<2:22:19, 328.43s/it]
4. {'loss': 0.0007, 'grad_norm': 0.39564278721809387, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  13%|█▎        | 4/30 [32:10<2:22:19, 328.43s/it]
4. {'train_runtime': 470.3338, 'train_samples_per_second': 0.272, 'train_steps_per_second': 0.034, 'train_loss': 0.002748749335296452, 'epoch': 1.0}


4. 100%|██████████| 16/16 [07:50<00:00, 29.42s/it][A
4. [A
4. 100%|██████████| 16/16 [07:50<00:00, 29.42s/it][A
4. [A
4. 100%|██████████| 16/16 [07:50<00:00, 29.42s/it][A
4. 100%|██████████| 16/16 [07:50<00:00, 29.40s/it]
6.  69%|██████▉   | 11/16 [04:03<01:50, 22.12s/it][A
5.  44%|████▍     | 7/16 [02:58<03:49, 25.49s/it][A
5.  50%|█████     | 8/16 [03:24<03:23, 25.50s/it][A
5. [A


5. {'loss': 0.0032, 'grad_norm': 0.21135690808296204, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  75%|███████▌  | 12/16 [04:25<01:28, 22.12s/it][A
7.  69%|██████▉   | 11/16 [05:29<02:29, 29.99s/it][A
5.  50%|█████     | 8/16 [03:24<03:23, 25.50s/it][A
6.  81%|████████▏ | 13/16 [04:47<01:06, 22.10s/it][A
7.  75%|███████▌  | 12/16 [05:59<01:59, 29.99s/it][A
5.  56%|█████▋    | 9/16 [03:49<02:58, 25.51s/it][A
6.  88%|████████▊ | 14/16 [05:09<00:44, 22.11s/it][A
7.  81%|████████▏ | 13/16 [06:29<01:29, 29.99s/it][A
5.  62%|██████▎   | 10/16 [04:15<02:33, 25.51s/it][A
6.  94%|█████████▍| 15/16 [05:31<00:22, 22.11s/it][A


6.  13%|█▎        | 4/30 [26:15<2:30:47, 348.00s/it]
6. {'loss': 0.0008, 'grad_norm': 0.28369802236557007, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  13%|█▎        | 4/30 [29:12<2:30:47, 348.00s/it]
6. {'train_runtime': 353.9219, 'train_samples_per_second': 0.362, 'train_steps_per_second': 0.045, 'train_loss': 0.0022598178475163877, 'epoch': 1.0}


6. 100%|██████████| 16/16 [05:53<00:00, 22.10s/it][A
6. [A
6. 100%|██████████| 16/16 [05:53<00:00, 22.10s/it][A
6. [A
6. 100%|██████████| 16/16 [05:53<00:00, 22.10s/it][A
6. 100%|██████████| 16/16 [05:53<00:00, 22.12s/it]
5.  69%|██████▉   | 11/16 [04:40<02:07, 25.50s/it][A
7.  88%|████████▊ | 14/16 [06:59<00:59, 29.98s/it][A
5.  75%|███████▌  | 12/16 [05:06<01:42, 25.50s/it][A
7.  94%|█████████▍| 15/16 [07:29<00:29, 29.99s/it][A


7.  13%|█▎        | 4/30 [32:06<2:26:27, 337.96s/it]
7. {'loss': 0.0046, 'grad_norm': 1.8752752542495728, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  13%|█▎        | 4/30 [36:06<2:26:27, 337.96s/it]
7. {'train_runtime': 479.5393, 'train_samples_per_second': 0.267, 'train_steps_per_second': 0.033, 'train_loss': 0.015251360600814223, 'epoch': 1.0}


7. 100%|██████████| 16/16 [07:59<00:00, 30.03s/it][A
7. [A
7. 100%|██████████| 16/16 [07:59<00:00, 30.03s/it][A
7. [A
7. 100%|██████████| 16/16 [07:59<00:00, 30.03s/it][A
7. 100%|██████████| 16/16 [07:59<00:00, 29.97s/it]
5.  81%|████████▏ | 13/16 [05:31<01:16, 25.51s/it][A


7.  13%|█▎        | 4/30 [36:06<2:26:27, 337.96s/it]*** -> Training took 479.5393 seconds.


5.  88%|████████▊ | 14/16 [05:57<00:51, 25.51s/it][A


7.  17%|█▋        | 5/30 [36:48<2:48:55, 405.42s/it]retraining model for key '21897d95' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1451.56 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...
4.  13%|█▎        | 4/30 [32:10<2:22:19, 328.43s/it]*** -> Training took 470.3338 seconds.


7.   0%|          | 0/16 [00:00<?, ?it/s][A


4.  17%|█▋        | 5/30 [35:33<3:09:20, 454.40s/it]retraining model for key '3dc255db' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1745.62 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


5.  94%|█████████▍| 15/16 [06:22<00:25, 25.51s/it][A


5.   7%|▋         | 2/30 [22:49<4:21:55, 561.27s/it]
5. {'loss': 0.0004, 'grad_norm': 0.12700526416301727, 'learning_rate': 5e-05, 'epoch': 1.0}
5.   7%|▋         | 2/30 [26:13<4:21:55, 561.27s/it]
5. {'train_runtime': 408.1125, 'train_samples_per_second': 0.314, 'train_steps_per_second': 0.039, 'train_loss': 0.001786214779713191, 'epoch': 1.0}


5. 100%|██████████| 16/16 [06:48<00:00, 25.50s/it][A
5. [A
5. 100%|██████████| 16/16 [06:48<00:00, 25.50s/it][A
5. [A
5. 100%|██████████| 16/16 [06:48<00:00, 25.50s/it][A
5. 100%|██████████| 16/16 [06:48<00:00, 25.51s/it]
4.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   6%|▋         | 1/16 [00:12<03:01, 12.09s/it][A
4.   6%|▋         | 1/16 [00:09<02:16,  9.08s/it][A
7.  12%|█▎        | 2/16 [00:24<02:48, 12.06s/it][A
4.  12%|█▎        | 2/16 [00:18<02:06,  9.05s/it][A
7.  19%|█▉        | 3/16 [00:36<02:36, 12.05s/it][A
4.  19%|█▉        | 3/16 [00:27<01:57,  9.04s/it][A
4.  25%|██▌       | 4/16 [00:36<01:48,  9.04s/it][A
7.  25%|██▌       | 4/16 [00:48<02:24, 12.05s/it][A
4.  31%|███▏      | 5/16 [00:45<01:39,  9.05s/it][A
7.  31%|███▏      | 5/16 [01:00<02:12, 12.05s/it][A
4.  38%|███▊      | 6/16 [00:54<01:30,  9.05s/it][A
7.  38%|███▊      | 6/16 [01:12<02:00, 12.05s/it][A
4.  44%|████▍     | 7/16 [01:03<01:21,  9.06s/it][A


4. {'loss': 0.0061, 'grad_norm': 1.0123660564422607, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:12<01:12,  9.07s/it][A
4. [A
7.  44%|████▍     | 7/16 [01:24<01:48, 12.04s/it][A


7. {'loss': 0.0172, 'grad_norm': 0.4784879684448242, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:36<01:36, 12.03s/it][A
7. [A
4.  50%|█████     | 8/16 [01:12<01:12,  9.07s/it][A
4.  56%|█████▋    | 9/16 [01:21<01:03,  9.07s/it][A
7.  50%|█████     | 8/16 [01:36<01:36, 12.03s/it][A
4.  62%|██████▎   | 10/16 [01:30<00:54,  9.06s/it][A
7.  56%|█████▋    | 9/16 [01:48<01:24, 12.01s/it][A
4.  69%|██████▉   | 11/16 [01:39<00:45,  9.06s/it][A
7.  62%|██████▎   | 10/16 [02:00<01:12, 12.00s/it][A
4.  75%|███████▌  | 12/16 [01:48<00:36,  9.07s/it][A
4.  81%|████████▏ | 13/16 [01:57<00:27,  9.06s/it][A
7.  69%|██████▉   | 11/16 [02:12<01:00, 12.00s/it][A
4.  88%|████████▊ | 14/16 [02:06<00:18,  9.07s/it][A
7.  75%|███████▌  | 12/16 [02:24<00:48, 12.00s/it][A
4.  94%|█████████▍| 15/16 [02:15<00:09,  9.07s/it][A
4. 100%|██████████| 16/16 [02:24<00:00,  9.07s/it][A
4. [A
4. 100%|██████████| 16/16 [02:24<00:00,  9.07s/it][A
4. [A


4.  17%|█▋        | 5/30 [36:49<3:09:20, 454.40s/it]
4. {'loss': 0.0006, 'grad_norm': 0.15371665358543396, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  17%|█▋        | 5/30 [38:01<3:09:20, 454.40s/it]
4. {'train_runtime': 144.9918, 'train_samples_per_second': 0.883, 'train_steps_per_second': 0.11, 'train_loss': 0.003327933431137353, 'epoch': 1.0}


4. 100%|██████████| 16/16 [02:24<00:00,  9.07s/it][A
4. 100%|██████████| 16/16 [02:24<00:00,  9.06s/it]
7.  81%|████████▏ | 13/16 [02:36<00:36, 12.01s/it][A
7.  88%|████████▊ | 14/16 [02:48<00:24, 12.01s/it][A
7.  94%|█████████▍| 15/16 [03:00<00:12, 12.01s/it][A


7.  17%|█▋        | 5/30 [38:28<2:48:55, 405.42s/it]
7. {'loss': 0.0027, 'grad_norm': 1.3539406061172485, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  17%|█▋        | 5/30 [40:04<2:48:55, 405.42s/it]
7. {'train_runtime': 192.2878, 'train_samples_per_second': 0.666, 'train_steps_per_second': 0.083, 'train_loss': 0.009992693667300045, 'epoch': 1.0}


7. 100%|██████████| 16/16 [03:12<00:00, 11.99s/it][A
7. [A
7. 100%|██████████| 16/16 [03:12<00:00, 11.99s/it][A
7. [A
7. 100%|██████████| 16/16 [03:12<00:00, 11.99s/it][A
7. 100%|██████████| 16/16 [03:12<00:00, 12.02s/it]


4.  17%|█▋        | 5/30 [38:01<3:09:20, 454.40s/it]*** -> Training took 144.9918 seconds.
4.  20%|██        | 6/30 [38:55<2:27:24, 368.51s/it]retraining model for key '3e6067c3' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 570.13 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 563.00 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A


5.   7%|▋         | 2/30 [26:13<4:21:55, 561.27s/it]*** -> Training took 408.1125 seconds.
5.  10%|█         | 3/30 [30:01<4:28:42, 597.13s/it]retraining model for key '195c6913' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 481.66 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 476.09 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...
7.  17%|█▋        | 5/30 [40:04<2:48:55, 405.42s/it]*** -> Training took 192.2878 seconds.
7.  20%|██        | 6/30 [41:22<2:24:21, 360.90s/it]retraining model for key '221dfab4' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 667.20 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 658.25 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.   6%|▋         | 1/16 [00:30<07:35, 30.37s/it][A


6.  13%|█▎        | 4/30 [29:12<2:30:47, 348.00s/it]*** -> Training took 353.9219 seconds.
6.  17%|█▋        | 5/30 [35:33<3:23:47, 489.11s/it]retraining model for key '409aa875' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 972.95 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 956.54 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  12%|█▎        | 2/16 [01:00<07:05, 30.41s/it][A
6.   6%|▋         | 1/16 [00:17<04:22, 17.50s/it][A
5.   6%|▋         | 1/16 [00:37<09:26, 37.74s/it][A
7.   6%|▋         | 1/16 [00:27<06:56, 27.77s/it][A
6.  12%|█▎        | 2/16 [00:35<04:05, 17.51s/it][A
4.  19%|█▉        | 3/16 [01:31<06:36, 30.48s/it][A
6.  19%|█▉        | 3/16 [00:52<03:47, 17.49s/it][A
7.  12%|█▎        | 2/16 [00:55<06:29, 27.83s/it][A
5.  12%|█▎        | 2/16 [01:15<08:47, 37.64s/it][A
4.  25%|██▌       | 4/16 [02:01<06:05, 30.46s/it][A
6.  25%|██▌       | 4/16 [01:10<03:30, 17.51s/it][A
7.  19%|█▉        | 3/16 [01:23<06:02, 27.86s/it][A
6.  31%|███▏      | 5/16 [01:27<03:12, 17.50s/it][A
4.  31%|███▏      | 5/16 [02:32<05:34, 30.45s/it][A
5.  19%|█▉        | 3/16 [01:52<08:09, 37.62s/it][A
6.  38%|███▊      | 6/16 [01:44<02:54, 17.48s/it][A
7.  25%|██▌      

6. {'loss': 0.0087, 'grad_norm': 0.1414438635110855, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:19<02:19, 17.47s/it][A
6. [A
4.  38%|███▊      | 6/16 [03:02<05:04, 30.49s/it][A
7.  31%|███▏      | 5/16 [02:19<05:06, 27.87s/it][A
5.  25%|██▌       | 4/16 [02:30<07:32, 37.67s/it][A
6.  50%|█████     | 8/16 [02:19<02:19, 17.47s/it][A
6.  56%|█████▋    | 9/16 [02:37<02:02, 17.51s/it][A
4.  44%|████▍     | 7/16 [03:33<04:34, 30.48s/it][A


4. {'loss': 0.0037, 'grad_norm': 0.08021893352270126, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [04:03<04:03, 30.48s/it][A
4. [A
7.  38%|███▊      | 6/16 [02:47<04:39, 27.90s/it][A
6.  62%|██████▎   | 10/16 [02:55<01:45, 17.53s/it][A
5.  31%|███▏      | 5/16 [03:08<06:53, 37.62s/it][A
4.  50%|█████     | 8/16 [04:03<04:03, 30.48s/it][A
6.  69%|██████▉   | 11/16 [03:12<01:27, 17.52s/it][A
7.  44%|████▍     | 7/16 [03:15<04:11, 27.91s/it][A


7. {'loss': 0.0111, 'grad_norm': 0.07515857368707657, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [03:43<03:43, 27.89s/it][A
7. [A
6.  75%|███████▌  | 12/16 [03:30<01:10, 17.51s/it][A
5.  38%|███▊      | 6/16 [03:45<06:16, 37.62s/it][A
4.  56%|█████▋    | 9/16 [04:34<03:33, 30.50s/it][A
7.  50%|█████     | 8/16 [03:43<03:43, 27.89s/it][A
6.  81%|████████▏ | 13/16 [03:47<00:52, 17.49s/it][A
6.  88%|████████▊ | 14/16 [04:04<00:35, 17.50s/it][A
4.  62%|██████▎   | 10/16 [05:04<03:02, 30.48s/it][A
7.  56%|█████▋    | 9/16 [04:10<03:15, 27.89s/it][A
5.  44%|████▍     | 7/16 [04:23<05:38, 37.61s/it][A


5. {'loss': 0.0096, 'grad_norm': 0.14613480865955353, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [05:01<05:01, 37.63s/it][A
5. [A
6.  94%|█████████▍| 15/16 [04:22<00:17, 17.53s/it][A


6.  17%|█▋        | 5/30 [37:57<3:23:47, 489.11s/it]
6. {'loss': 0.0063, 'grad_norm': 0.14759868383407593, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  17%|█▋        | 5/30 [40:17<3:23:47, 489.11s/it]
6. {'train_runtime': 280.0812, 'train_samples_per_second': 0.457, 'train_steps_per_second': 0.057, 'train_loss': 0.007496953010559082, 'epoch': 1.0}


6. 100%|██████████| 16/16 [04:40<00:00, 17.52s/it][A
6. [A
6. 100%|██████████| 16/16 [04:40<00:00, 17.52s/it][A
6. [A
6. 100%|██████████| 16/16 [04:40<00:00, 17.52s/it][A
6. 100%|██████████| 16/16 [04:40<00:00, 17.50s/it]
7.  62%|██████▎   | 10/16 [04:38<02:47, 27.91s/it][A
4.  69%|██████▉   | 11/16 [05:35<02:32, 30.50s/it][A
5.  50%|█████     | 8/16 [05:01<05:01, 37.63s/it][A
7.  69%|██████▉   | 11/16 [05:06<02:19, 27.91s/it][A
4.  75%|███████▌  | 12/16 [06:05<02:01, 30.50s/it][A
5.  56%|█████▋    | 9/16 [05:38<04:23, 37.63s/it][A
7.  75%|███████▌  | 12/16 [05:34<01:51, 27.93s/it][A
4.  81%|████████▏ | 13/16 [06:36<01:31, 30.50s/it][A
7.  81%|████████▏ | 13/16 [06:03<01:24, 28.03s/it][A
5.  62%|██████▎   | 10/16 [06:16<03:45, 37.65s/it][A
4.  88%|████████▊ | 14/16 [07:06<01:01, 30.50s/it][A


6.  17%|█▋        | 5/30 [40:17<3:23:47, 489.11s/it]*** -> Training took 280.0812 seconds.
6.  20%|██        | 6/30 [42:12<3:03:19, 458.31s/it]retraining model for key '446ef5d2' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 2008.57 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


6.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  88%|████████▊ | 14/16 [06:31<00:56, 28.08s/it][A
6.   6%|▋         | 1/16 [00:08<02:02,  8.19s/it][A
4.  94%|█████████▍| 15/16 [07:37<00:30, 30.48s/it][A


4.  20%|██        | 6/30 [43:03<2:27:24, 368.51s/it]
4. {'loss': 0.0013, 'grad_norm': 0.21752473711967468, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  20%|██        | 6/30 [47:07<2:27:24, 368.51s/it]
4. {'train_runtime': 487.6793, 'train_samples_per_second': 0.262, 'train_steps_per_second': 0.033, 'train_loss': 0.0024729594006203115, 'epoch': 1.0}


4. 100%|██████████| 16/16 [08:07<00:00, 30.48s/it][A
4. [A
4. 100%|██████████| 16/16 [08:07<00:00, 30.48s/it][A
4. [A
4. 100%|██████████| 16/16 [08:07<00:00, 30.48s/it][A
4. 100%|██████████| 16/16 [08:07<00:00, 30.48s/it]
5.  69%|██████▉   | 11/16 [06:53<03:08, 37.62s/it][A
6.  12%|█▎        | 2/16 [00:16<01:54,  8.17s/it][A
6.  19%|█▉        | 3/16 [00:24<01:45,  8.14s/it][A
7.  94%|█████████▍| 15/16 [06:59<00:28, 28.10s/it][A


7.  20%|██        | 6/30 [45:09<2:24:21, 360.90s/it]
7. {'loss': 0.0019, 'grad_norm': 0.04953395575284958, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  20%|██        | 6/30 [48:54<2:24:21, 360.90s/it]
7. {'train_runtime': 447.5307, 'train_samples_per_second': 0.286, 'train_steps_per_second': 0.036, 'train_loss': 0.006508138554636389, 'epoch': 1.0}


7. 100%|██████████| 16/16 [07:27<00:00, 28.12s/it][A
7. [A
7. 100%|██████████| 16/16 [07:27<00:00, 28.12s/it][A
7. [A
7. 100%|██████████| 16/16 [07:27<00:00, 28.12s/it][A
7. 100%|██████████| 16/16 [07:27<00:00, 27.97s/it]
6.  25%|██▌       | 4/16 [00:32<01:37,  8.14s/it][A
6.  31%|███▏      | 5/16 [00:40<01:29,  8.12s/it][A
6.  38%|███▊      | 6/16 [00:48<01:21,  8.12s/it][A
5.  75%|███████▌  | 12/16 [07:31<02:30, 37.62s/it][A
6.  44%|████▍     | 7/16 [00:56<01:13,  8.12s/it][A


6. {'loss': 0.0125, 'grad_norm': 0.641453206539154, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:05<01:04,  8.12s/it][A
6. [A
6.  50%|█████     | 8/16 [01:05<01:04,  8.12s/it][A
6.  56%|█████▋    | 9/16 [01:13<00:56,  8.11s/it][A
6.  62%|██████▎   | 10/16 [01:21<00:48,  8.12s/it][A
6.  69%|██████▉   | 11/16 [01:29<00:40,  8.11s/it][A
5.  81%|████████▏ | 13/16 [08:09<01:52, 37.61s/it][A
6.  75%|███████▌  | 12/16 [01:37<00:32,  8.11s/it][A
6.  81%|████████▏ | 13/16 [01:45<00:24,  8.12s/it][A
6.  88%|████████▊ | 14/16 [01:53<00:16,  8.12s/it][A
6.  94%|█████████▍| 15/16 [02:01<00:08,  8.12s/it][A


6.  20%|██        | 6/30 [43:20<3:03:19, 458.31s/it]
6. {'loss': 0.0021, 'grad_norm': 0.6489315032958984, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  20%|██        | 6/30 [44:25<3:03:19, 458.31s/it]
6. {'train_runtime': 129.9315, 'train_samples_per_second': 0.985, 'train_steps_per_second': 0.123, 'train_loss': 0.007295522023923695, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:09<00:00,  8.11s/it][A
6. [A
6. 100%|██████████| 16/16 [02:09<00:00,  8.11s/it][A
6. [A
6. 100%|██████████| 16/16 [02:09<00:00,  8.11s/it][A
6. 100%|██████████| 16/16 [02:09<00:00,  8.12s/it]
5.  88%|████████▊ | 14/16 [08:46<01:15, 37.60s/it][A
5.  94%|█████████▍| 15/16 [09:24<00:37, 37.60s/it][A


5.  10%|█         | 3/30 [35:06<4:28:42, 597.13s/it]
5. {'loss': 0.0008, 'grad_norm': 0.21408672630786896, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  10%|█         | 3/30 [40:07<4:28:42, 597.13s/it]
5. {'train_runtime': 601.8986, 'train_samples_per_second': 0.213, 'train_steps_per_second': 0.027, 'train_loss': 0.005195580772124231, 'epoch': 1.0}


5. 100%|██████████| 16/16 [10:01<00:00, 37.59s/it][A
5. [A
5. 100%|██████████| 16/16 [10:01<00:00, 37.59s/it][A
5. [A
5. 100%|██████████| 16/16 [10:01<00:00, 37.59s/it][A
5. 100%|██████████| 16/16 [10:01<00:00, 37.62s/it]


4.  20%|██        | 6/30 [47:07<2:27:24, 368.51s/it]*** -> Training took 487.6793 seconds.
4.  23%|██▎       | 7/30 [54:22<3:31:15, 551.13s/it]retraining model for key '45a5af55' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 942.40 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 927.08 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
4.   6%|▋         | 1/16 [00:17<04:19, 17.31s/it][A
4.  12%|█▎        | 2/16 [00:34<04:02, 17.32s/it][A


6.  20%|██        | 6/30 [44:25<3:03:19, 458.31s/it]*** -> Training took 129.9315 seconds.
6.  23%|██▎       | 7/30 [50:47<3:02:50, 476.96s/it]retraining model for key '4c416de3' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 558.72 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 550.86 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  19%|█▉        | 3/16 [00:52<03:45, 17.35s/it][A
4.  25%|██▌       | 4/16 [01:09<03:28, 17.36s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A


5.  10%|█         | 3/30 [40:07<4:28:42, 597.13s/it]*** -> Training took 601.8986 seconds.
5.  13%|█▎        | 4/30 [46:31<5:26:00, 752.31s/it]retraining model for key '28a6681f' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 2186.39 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  31%|███▏      | 5/16 [01:26<03:10, 17.36s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
5.   6%|▋         | 1/16 [00:07<01:54,  7.61s/it][A
6.   6%|▋         | 1/16 [00:31<07:46, 31.11s/it][A
4.  38%|███▊      | 6/16 [01:44<02:53, 17.37s/it][A
5.  12%|█▎        | 2/16 [00:15<01:46,  7.61s/it][A


7.  20%|██        | 6/30 [48:54<2:24:21, 360.90s/it]*** -> Training took 447.5307 seconds.
7.  23%|██▎       | 7/30 [58:04<3:38:35, 570.24s/it]retraining model for key '247ef758' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1523.94 examples/s]
5.  19%|█▉        | 3/16 [00:22<01:39,  7.62s/it][A
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


5.  25%|██▌       | 4/16 [00:30<01:31,  7.63s/it][A
4.  44%|████▍     | 7/16 [02:01<02:36, 17.39s/it][A


4. {'loss': 0.0033, 'grad_norm': 0.37372922897338867, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:19<02:19, 17.41s/it][A
4. [A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  31%|███▏      | 5/16 [00:38<01:23,  7.64s/it][A
6.  12%|█▎        | 2/16 [01:02<07:17, 31.25s/it][A
5.  38%|███▊      | 6/16 [00:45<01:16,  7.64s/it][A
7.   6%|▋         | 1/16 [00:11<02:58, 11.87s/it][A
4.  50%|█████     | 8/16 [02:19<02:19, 17.41s/it][A
5.  44%|████▍     | 7/16 [00:53<01:08,  7.63s/it][A
5.  50%|█████     | 8/16 [01:00<01:00,  7.62s/it][A
5. [A


5. {'loss': 0.0097, 'grad_norm': 0.6163346171379089, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  12%|█▎        | 2/16 [00:23<02:45, 11.84s/it][A
5.  50%|█████     | 8/16 [01:00<01:00,  7.62s/it][A
4.  56%|█████▋    | 9/16 [02:36<02:01, 17.41s/it][A
5.  56%|█████▋    | 9/16 [01:08<00:53,  7.60s/it][A
7.  19%|█▉        | 3/16 [00:35<02:33, 11.83s/it][A
6.  19%|█▉        | 3/16 [01:34<06:48, 31.43s/it][A
5.  62%|██████▎   | 10/16 [01:16<00:45,  7.59s/it][A
7.  25%|██▌       | 4/16 [00:47<02:21, 11.80s/it][A
5.  69%|██████▉   | 11/16 [01:23<00:37,  7.58s/it][A
4.  62%|██████▎   | 10/16 [02:53<01:44, 17.40s/it][A
5.  75%|███████▌  | 12/16 [01:31<00:30,  7.57s/it][A
7.  31%|███▏      | 5/16 [00:59<02:09, 11.78s/it][A
5.  81%|████████▏ | 13/16 [01:38<00:22,  7.57s/it][A
4.  69%|██████▉   | 11/16 [03:11<01:26, 17.39s/it][A
6.  25%|██▌       | 4/16 [02:05<06:18, 31.53s/it][A
5.  88%|████████▊ | 14/16 [01:46<00:15,  7.57s/it][A
7.  38%|███▊      | 6/16 [01:10<01:57, 11.77s/it][A
5.  94%|█████████▍| 15/16 [01:53<00:07,  7.58s/it][A


5.  13%|█▎        | 4/30 [47:35<5:26:00, 752.31s/it]
5. {'loss': 0.0014, 'grad_norm': 0.10246770083904266, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  13%|█▎        | 4/30 [48:35<5:26:00, 752.31s/it]
5. {'train_runtime': 121.5859, 'train_samples_per_second': 1.053, 'train_steps_per_second': 0.132, 'train_loss': 0.005566170555539429, 'epoch': 1.0}


5. 100%|██████████| 16/16 [02:01<00:00,  7.59s/it][A
5. [A
5. 100%|██████████| 16/16 [02:01<00:00,  7.59s/it][A
5. [A
5. 100%|██████████| 16/16 [02:01<00:00,  7.59s/it][A
5. 100%|██████████| 16/16 [02:01<00:00,  7.60s/it]
7.  44%|████▍     | 7/16 [01:22<01:45, 11.76s/it][A


7. {'loss': 0.0065, 'grad_norm': 0.5761913061141968, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:34<01:34, 11.76s/it][A
7. [A
4.  75%|███████▌  | 12/16 [03:28<01:09, 17.39s/it][A
7.  50%|█████     | 8/16 [01:34<01:34, 11.76s/it][A
4.  81%|████████▏ | 13/16 [03:45<00:52, 17.39s/it][A
6.  31%|███▏      | 5/16 [02:37<05:47, 31.56s/it][A
7.  56%|█████▋    | 9/16 [01:45<01:22, 11.75s/it][A


5.  13%|█▎        | 4/30 [48:35<5:26:00, 752.31s/it]*** -> Training took 121.5859 seconds.
5.  17%|█▋        | 5/30 [49:12<3:44:37, 539.10s/it]retraining model for key '31f7f899' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1594.51 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  88%|████████▊ | 14/16 [04:03<00:34, 17.41s/it][A
7.  62%|██████▎   | 10/16 [01:57<01:10, 11.75s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  69%|██████▉   | 11/16 [02:09<00:58, 11.75s/it][A
6.  38%|███▊      | 6/16 [03:09<05:15, 31.60s/it][A
4.  94%|█████████▍| 15/16 [04:20<00:17, 17.44s/it][A


4.  23%|██▎       | 7/30 [56:45<3:31:15, 551.13s/it]
4. {'loss': 0.0001, 'grad_norm': 0.07672607898712158, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  23%|██▎       | 7/30 [59:04<3:31:15, 551.13s/it]
4. {'train_runtime': 278.4982, 'train_samples_per_second': 0.46, 'train_steps_per_second': 0.057, 'train_loss': 0.0017289094175794162, 'epoch': 1.0}


4. 100%|██████████| 16/16 [04:38<00:00, 17.48s/it][A
4. [A
4. 100%|██████████| 16/16 [04:38<00:00, 17.48s/it][A
4. [A
4. 100%|██████████| 16/16 [04:38<00:00, 17.48s/it][A
4. 100%|██████████| 16/16 [04:38<00:00, 17.41s/it]
5.   6%|▋         | 1/16 [00:10<02:38, 10.57s/it][A
7.  75%|███████▌  | 12/16 [02:21<00:46, 11.75s/it][A
5.  12%|█▎        | 2/16 [00:21<02:27, 10.53s/it][A
7.  81%|████████▏ | 13/16 [02:32<00:35, 11.74s/it][A
5.  19%|█▉        | 3/16 [00:31<02:16, 10.52s/it][A
7.  88%|████████▊ | 14/16 [02:44<00:23, 11.74s/it][A
6.  44%|████▍     | 7/16 [03:40<04:44, 31.62s/it][A


6. {'loss': 0.0015, 'grad_norm': 0.11966341733932495, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [04:12<04:12, 31.62s/it][A
6. [A
5.  25%|██▌       | 4/16 [00:42<02:06, 10.54s/it][A
7.  94%|█████████▍| 15/16 [02:56<00:11, 11.73s/it][A


7.  23%|██▎       | 7/30 [59:41<3:38:35, 570.24s/it]
7. {'loss': 0.0022, 'grad_norm': 0.12008596211671829, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  23%|██▎       | 7/30 [1:01:15<3:38:35, 570.24s/it]
7. {'train_runtime': 188.0736, 'train_samples_per_second': 0.681, 'train_steps_per_second': 0.085, 'train_loss': 0.004364358610473573, 'epoch': 1.0}


7. 100%|██████████| 16/16 [03:08<00:00, 11.72s/it][A
7. [A
7. 100%|██████████| 16/16 [03:08<00:00, 11.72s/it][A
7. [A
7. 100%|██████████| 16/16 [03:08<00:00, 11.72s/it][A
7. 100%|██████████| 16/16 [03:08<00:00, 11.75s/it]
5.  31%|███▏      | 5/16 [00:52<01:55, 10.54s/it][A
5.  38%|███▊      | 6/16 [01:03<01:45, 10.53s/it][A
6.  50%|█████     | 8/16 [04:12<04:12, 31.62s/it][A
5.  44%|████▍     | 7/16 [01:13<01:34, 10.53s/it][A


5. {'loss': 0.006, 'grad_norm': 0.21452713012695312, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:24<01:24, 10.53s/it][A
5. [A
5.  50%|█████     | 8/16 [01:24<01:24, 10.53s/it][A
5.  56%|█████▋    | 9/16 [01:34<01:13, 10.53s/it][A
6.  56%|█████▋    | 9/16 [04:43<03:41, 31.61s/it][A
5.  62%|██████▎   | 10/16 [01:45<01:03, 10.53s/it][A
5.  69%|██████▉   | 11/16 [01:55<00:52, 10.53s/it][A
5.  75%|███████▌  | 12/16 [02:06<00:42, 10.53s/it][A
6.  62%|██████▎   | 10/16 [05:15<03:09, 31.60s/it][A
5.  81%|████████▏ | 13/16 [02:16<00:31, 10.53s/it][A
5.  88%|████████▊ | 14/16 [02:27<00:21, 10.53s/it][A
5.  94%|█████████▍| 15/16 [02:37<00:10, 10.53s/it][A


5.  17%|█▋        | 5/30 [50:39<3:44:37, 539.10s/it]
5. {'loss': 0.0012, 'grad_norm': 0.18923121690750122, 'learning_rate': 5e-05, 'epoch': 1.0}


5. 100%|██████████| 16/16 [02:48<00:00, 10.52s/it][A
5. [A
5. 100%|██████████| 16/16 [02:48<00:00, 10.52s/it][A
5. [A
5. 100%|██████████| 16/16 [02:48<00:00, 10.52s/it][A
5. 100%|██████████| 16/16 [02:48<00:00, 10.53s/it]


5.  17%|█▋        | 5/30 [52:03<3:44:37, 539.10s/it]
5. {'train_runtime': 168.4683, 'train_samples_per_second': 0.76, 'train_steps_per_second': 0.095, 'train_loss': 0.0035798161989077926, 'epoch': 1.0}


6.  69%|██████▉   | 11/16 [05:47<02:38, 31.60s/it][A


7.  23%|██▎       | 7/30 [1:01:15<3:38:35, 570.24s/it]*** -> Training took 188.0736 seconds.
7.  27%|██▋       | 8/30 [1:03:37<3:01:24, 494.73s/it]retraining model for key '2ba387bc' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 720.58 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 709.94 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


6.  75%|███████▌  | 12/16 [06:18<02:06, 31.59s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  81%|████████▏ | 13/16 [06:50<01:34, 31.60s/it][A
7.   6%|▋         | 1/16 [00:25<06:17, 25.17s/it][A


4.  23%|██▎       | 7/30 [59:04<3:31:15, 551.13s/it]*** -> Training took 278.4982 seconds.
4.  27%|██▋       | 8/30 [1:03:14<3:19:47, 544.91s/it]retraining model for key '4a21e3da' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 778.51 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 767.13 examples/s]
6.  88%|████████▊ | 14/16 [07:21<01:03, 31.60s/it][A
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...
5.  17%|█▋        | 5/30 [52:03<3:44:37, 539.10s/it]*** -> Training took 168.4683 seconds.
5.  20%|██        | 6/30 [53:52<3:00:28, 451.20s/it]retraining model for key '35ab12c3' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 819.00 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 806.86 examples/s]
7.  12%|█▎        | 2/16 [00:50<05:52, 25.14s/it][A
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  94%|█████████▍| 15/16 [07:53<00:31, 31.60s/it][A


6.  23%|██▎       | 7/30 [55:03<3:02:50, 476.96s/it]
6. {'loss': 0.0007, 'grad_norm': 0.08046546578407288, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  23%|██▎       | 7/30 [59:16<3:02:50, 476.96s/it]
6. {'train_runtime': 505.1297, 'train_samples_per_second': 0.253, 'train_steps_per_second': 0.032, 'train_loss': 0.0011045857390854508, 'epoch': 1.0}


6. 100%|██████████| 16/16 [08:25<00:00, 31.60s/it][A
6. [A
6. 100%|██████████| 16/16 [08:25<00:00, 31.60s/it][A
6. [A
6. 100%|██████████| 16/16 [08:25<00:00, 31.60s/it][A
6. 100%|██████████| 16/16 [08:25<00:00, 31.57s/it]
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  19%|█▉        | 3/16 [01:15<05:26, 25.15s/it][A
4.   6%|▋         | 1/16 [00:21<05:23, 21.56s/it][A
5.   6%|▋         | 1/16 [00:21<05:29, 21.97s/it][A
7.  25%|██▌       | 4/16 [01:40<05:02, 25.21s/it][A
4.  12%|█▎        | 2/16 [00:43<05:02, 21.61s/it][A
5.  12%|█▎        | 2/16 [00:43<05:06, 21.89s/it][A
7.  31%|███▏      | 5/16 [02:06<04:37, 25.23s/it][A
4.  19%|█▉        | 3/16 [01:04<04:40, 21.58s/it][A
5.  19%|█▉        | 3/16 [01:05<04:43, 21.79s/it][A
4.  25%|██▌       | 4/16 [01:26<04:18, 21.54s/it][A
7.  38%|███▊      | 6/16 [02:31<04:12, 25.27s/it][A
5.  25%|██▌       | 4/16 [01:27<04:21, 21.76s/it][A
4.  31%|███▏      | 5/16 [01:47<03:56, 21.52s/it][A
7.  44%|████▍     | 7/16 [02:56<03:47, 

7. {'loss': 0.0234, 'grad_norm': 0.7927687168121338, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [03:22<03:22, 25.37s/it][A
7. [A
5.  31%|███▏      | 5/16 [01:48<03:59, 21.75s/it][A


6.  23%|██▎       | 7/30 [59:16<3:02:50, 476.96s/it]*** -> Training took 505.1297 seconds.
6.  27%|██▋       | 8/30 [1:01:10<3:11:55, 523.43s/it]retraining model for key '53fb4810' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1186.95 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1160.82 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  38%|███▊      | 6/16 [02:09<03:35, 21.52s/it][A
7.  50%|█████     | 8/16 [03:22<03:22, 25.37s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  38%|███▊      | 6/16 [02:10<03:37, 21.76s/it][A
4.  44%|████▍     | 7/16 [02:30<03:13, 21.52s/it][A


4. {'loss': 0.0061, 'grad_norm': 0.45660945773124695, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:52<02:52, 21.52s/it][A
4. [A
6.   6%|▋         | 1/16 [00:14<03:39, 14.62s/it][A
5.  44%|████▍     | 7/16 [02:32<03:15, 21.74s/it][A


5. {'loss': 0.0039, 'grad_norm': 0.35104602575302124, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:54<02:53, 21.72s/it][A
5. [A
7.  56%|█████▋    | 9/16 [03:47<02:57, 25.40s/it][A
6.  12%|█▎        | 2/16 [00:29<03:24, 14.63s/it][A
4.  50%|█████     | 8/16 [02:52<02:52, 21.52s/it][A
6.  19%|█▉        | 3/16 [00:43<03:10, 14.67s/it][A
5.  50%|█████     | 8/16 [02:54<02:53, 21.72s/it][A
7.  62%|██████▎   | 10/16 [04:13<02:32, 25.46s/it][A
4.  56%|█████▋    | 9/16 [03:13<02:30, 21.51s/it][A
6.  25%|██▌       | 4/16 [00:58<02:56, 14.68s/it][A
5.  56%|█████▋    | 9/16 [03:15<02:32, 21.71s/it][A
6.  31%|███▏      | 5/16 [01:13<02:41, 14.66s/it][A
4.  62%|██████▎   | 10/16 [03:35<02:09, 21.50s/it][A
7.  69%|██████▉   | 11/16 [04:38<02:07, 25.49s/it][A
6.  38%|███▊      | 6/16 [01:27<02:26, 14.65s/it][A
5.  62%|██████▎   | 10/16 [03:37<02:10, 21.71s/it][A
4.  69%|██████▉   | 11/16 [03:56<01:47, 21.50s/it][A
7.  75%|███████▌  | 12/16 [05:04<01:41, 25.48s/it][A
6.  44%|████▍     | 7/16 [01:42<02:11, 14.61s/it][A


6. {'loss': 0.0026, 'grad_norm': 0.052767444401979446, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:57<01:56, 14.60s/it][A
6. [A
5.  69%|██████▉   | 11/16 [03:59<01:48, 21.72s/it][A
6.  50%|█████     | 8/16 [01:57<01:56, 14.60s/it][A
4.  75%|███████▌  | 12/16 [04:18<01:26, 21.51s/it][A
7.  81%|████████▏ | 13/16 [05:29<01:16, 25.46s/it][A
6.  56%|█████▋    | 9/16 [02:11<01:42, 14.60s/it][A
5.  75%|███████▌  | 12/16 [04:20<01:26, 21.71s/it][A
4.  81%|████████▏ | 13/16 [04:39<01:04, 21.51s/it][A
6.  62%|██████▎   | 10/16 [02:26<01:27, 14.60s/it][A
7.  88%|████████▊ | 14/16 [05:55<00:50, 25.47s/it][A
5.  81%|████████▏ | 13/16 [04:42<01:05, 21.71s/it][A
6.  69%|██████▉   | 11/16 [02:40<01:13, 14.62s/it][A
4.  88%|████████▊ | 14/16 [05:01<00:43, 21.51s/it][A
5.  88%|████████▊ | 14/16 [05:04<00:43, 21.71s/it][A
6.  75%|███████▌  | 12/16 [02:55<00:58, 14.62s/it][A
7.  94%|█████████▍| 15/16 [06:20<00:25, 25.48s/it][A


7.  27%|██▋       | 8/30 [1:07:03<3:01:24, 494.73s/it]
7. {'loss': 0.01, 'grad_norm': 0.4063149094581604, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  27%|██▋       | 8/30 [1:10:27<3:01:24, 494.73s/it]
7. {'train_runtime': 406.1544, 'train_samples_per_second': 0.315, 'train_steps_per_second': 0.039, 'train_loss': 0.016728307586163282, 'epoch': 1.0}


7. 100%|██████████| 16/16 [06:46<00:00, 25.45s/it][A
7. [A
7. 100%|██████████| 16/16 [06:46<00:00, 25.45s/it][A
7. [A
7. 100%|██████████| 16/16 [06:46<00:00, 25.45s/it][A
7. 100%|██████████| 16/16 [06:46<00:00, 25.38s/it]
4.  94%|█████████▍| 15/16 [05:22<00:21, 21.51s/it][A


4.  27%|██▋       | 8/30 [1:06:10<3:19:47, 544.91s/it]
4. {'loss': 0.0013, 'grad_norm': 0.47741594910621643, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  27%|██▋       | 8/30 [1:09:02<3:19:47, 544.91s/it]
4. {'train_runtime': 344.3067, 'train_samples_per_second': 0.372, 'train_steps_per_second': 0.046, 'train_loss': 0.003687505377456546, 'epoch': 1.0}


4. 100%|██████████| 16/16 [05:44<00:00, 21.50s/it][A
4. [A
4. 100%|██████████| 16/16 [05:44<00:00, 21.50s/it][A
4. [A
4. 100%|██████████| 16/16 [05:44<00:00, 21.50s/it][A
4. 100%|██████████| 16/16 [05:44<00:00, 21.52s/it]
6.  81%|████████▏ | 13/16 [03:10<00:43, 14.60s/it][A
5.  94%|█████████▍| 15/16 [05:26<00:21, 21.72s/it][A


5.  20%|██        | 6/30 [56:50<3:00:28, 451.20s/it]
5. {'loss': 0.0011, 'grad_norm': 0.05104278028011322, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  20%|██        | 6/30 [59:44<3:00:28, 451.20s/it]
5. {'train_runtime': 347.7408, 'train_samples_per_second': 0.368, 'train_steps_per_second': 0.046, 'train_loss': 0.0025256011285819113, 'epoch': 1.0}


5. 100%|██████████| 16/16 [05:47<00:00, 21.70s/it][A
5. [A
5. 100%|██████████| 16/16 [05:47<00:00, 21.70s/it][A
5. [A
5. 100%|██████████| 16/16 [05:47<00:00, 21.70s/it][A
5. 100%|██████████| 16/16 [05:47<00:00, 21.73s/it]
6.  88%|████████▊ | 14/16 [03:24<00:29, 14.61s/it][A
6.  94%|█████████▍| 15/16 [03:39<00:14, 14.60s/it][A


6.  27%|██▋       | 8/30 [1:03:10<3:11:55, 523.43s/it]
6. {'loss': 0.0003, 'grad_norm': 0.0065006716176867485, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  27%|██▋       | 8/30 [1:05:07<3:11:55, 523.43s/it]
6. {'train_runtime': 233.925, 'train_samples_per_second': 0.547, 'train_steps_per_second': 0.068, 'train_loss': 0.0014484730490949005, 'epoch': 1.0}


6. 100%|██████████| 16/16 [03:53<00:00, 14.61s/it][A
6. [A
6. 100%|██████████| 16/16 [03:53<00:00, 14.61s/it][A
6. [A
6. 100%|██████████| 16/16 [03:53<00:00, 14.61s/it][A
6. 100%|██████████| 16/16 [03:53<00:00, 14.62s/it]


7.  27%|██▋       | 8/30 [1:10:27<3:01:24, 494.73s/it]*** -> Training took 406.1544 seconds.
7.  30%|███       | 9/30 [1:11:10<2:48:41, 481.96s/it]retraining model for key '332f06d7' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1156.24 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1134.18 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


7.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   6%|▋         | 1/16 [00:15<03:56, 15.76s/it][A
7.  12%|█▎        | 2/16 [00:31<03:40, 15.76s/it][A
7.  19%|█▉        | 3/16 [00:47<03:25, 15.79s/it][A
7.  25%|██▌       | 4/16 [01:03<03:09, 15.80s/it][A
7.  31%|███▏      | 5/16 [01:19<02:54, 15.83s/it][A


5.  20%|██        | 6/30 [59:44<3:00:28, 451.20s/it]*** -> Training took 347.7408 seconds.
5.  23%|██▎       | 7/30 [1:01:58<2:57:18, 462.53s/it]retraining model for key '38007db0' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1323.96 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


7.  38%|███▊      | 6/16 [01:34<02:38, 15.85s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A


4.  27%|██▋       | 8/30 [1:09:02<3:19:47, 544.91s/it]*** -> Training took 344.3067 seconds.
4.  30%|███       | 9/30 [1:11:50<3:07:35, 535.98s/it]retraining model for key '4c3d4a41' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 2100.10 examples/s]
7.  44%|████▍     | 7/16 [01:50<02:22, 15.86s/it][A


7. {'loss': 0.0026, 'grad_norm': 0.31086745858192444, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:06<02:07, 15.88s/it][A
7. [A
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


5.   6%|▋         | 1/16 [00:12<03:10, 12.72s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  50%|█████     | 8/16 [02:06<02:07, 15.88s/it][A
4.   6%|▋         | 1/16 [00:07<01:57,  7.85s/it][A
5.  12%|█▎        | 2/16 [00:25<02:59, 12.80s/it][A
4.  12%|█▎        | 2/16 [00:15<01:49,  7.80s/it][A
7.  56%|█████▋    | 9/16 [02:22<01:51, 15.89s/it][A
5.  19%|█▉        | 3/16 [00:38<02:46, 12.80s/it][A
4.  19%|█▉        | 3/16 [00:23<01:41,  7.79s/it][A
4.  25%|██▌       | 4/16 [00:31<01:33,  7.77s/it][A
5.  25%|██▌       | 4/16 [00:51<02:33, 12.76s/it][A
7.  62%|██████▎   | 10/16 [02:38<01:35, 15.88s/it][A
4.  31%|███▏      | 5/16 [00:38<01:25,  7.76s/it][A
4.  38%|███▊      | 6/16 [00:46<01:17,  7.76s/it][A
5.  31%|███▏      | 5/16 [01:03<02:19, 12.72s/it][A
7.  69%|██████▉   | 11/16 [02:54<01:19, 15.88s/it][A
4.  44%|████▍     | 7/16 [00:54<01:09,  7.74s/it][A


4. {'loss': 0.0262, 'grad_norm': 0.36836034059524536, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:02<01:01,  7.74s/it][A
4. [A
5.  38%|███▊      | 6/16 [01:16<02:06, 12.69s/it][A
4.  50%|█████     | 8/16 [01:02<01:01,  7.74s/it][A
7.  75%|███████▌  | 12/16 [03:10<01:03, 15.88s/it][A
4.  56%|█████▋    | 9/16 [01:09<00:54,  7.74s/it][A
5.  44%|████▍     | 7/16 [01:29<01:54, 12.68s/it][A


5. {'loss': 0.0205, 'grad_norm': 0.22525005042552948, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:41<01:41, 12.68s/it][A
5. [A
4.  62%|██████▎   | 10/16 [01:17<00:46,  7.74s/it][A


6.  27%|██▋       | 8/30 [1:05:07<3:11:55, 523.43s/it]*** -> Training took 233.925 seconds.
6.  30%|███       | 9/30 [1:08:49<2:56:09, 503.31s/it]retraining model for key '6e453dd6' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1181.97 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1160.04 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  69%|██████▉   | 11/16 [01:25<00:38,  7.74s/it][A
7.  81%|████████▏ | 13/16 [03:26<00:47, 15.88s/it][A
5.  50%|█████     | 8/16 [01:41<01:41, 12.68s/it][A
4.  75%|███████▌  | 12/16 [01:33<00:30,  7.73s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  56%|█████▋    | 9/16 [01:54<01:28, 12.67s/it][A
4.  81%|████████▏ | 13/16 [01:40<00:23,  7.74s/it][A
7.  88%|████████▊ | 14/16 [03:42<00:31, 15.89s/it][A
4.  88%|████████▊ | 14/16 [01:48<00:15,  7.74s/it][A
6.   6%|▋         | 1/16 [00:14<03:33, 14.23s/it][A
5.  62%|██████▎   | 10/16 [02:07<01:16, 12.67s/it][A
4.  94%|█████████▍| 15/16 [01:56<00:07,  7.73s/it][A


4.  30%|███       | 9/30 [1:12:55<3:07:35, 535.98s/it]
4. {'loss': 0.0013, 'grad_norm': 1.6327157020568848, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  30%|███       | 9/30 [1:13:57<3:07:35, 535.98s/it]
4. {'train_runtime': 123.9769, 'train_samples_per_second': 1.032, 'train_steps_per_second': 0.129, 'train_loss': 0.013764757371973246, 'epoch': 1.0}


4. 100%|██████████| 16/16 [02:03<00:00,  7.74s/it][A
4. [A
4. 100%|██████████| 16/16 [02:03<00:00,  7.74s/it][A
4. [A
4. 100%|██████████| 16/16 [02:03<00:00,  7.74s/it][A
4. 100%|██████████| 16/16 [02:03<00:00,  7.75s/it]
7.  94%|█████████▍| 15/16 [03:57<00:15, 15.88s/it][A


7.  30%|███       | 9/30 [1:13:21<2:48:41, 481.96s/it]
7. {'loss': 0.0005, 'grad_norm': 0.32568860054016113, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  30%|███       | 9/30 [1:15:28<2:48:41, 481.96s/it]
7. {'train_runtime': 253.8248, 'train_samples_per_second': 0.504, 'train_steps_per_second': 0.063, 'train_loss': 0.001590103463968262, 'epoch': 1.0}


7. 100%|██████████| 16/16 [04:13<00:00, 15.88s/it][A
7. [A
7. 100%|██████████| 16/16 [04:13<00:00, 15.88s/it][A
7. [A
7. 100%|██████████| 16/16 [04:13<00:00, 15.88s/it][A
7. 100%|██████████| 16/16 [04:13<00:00, 15.86s/it]
5.  69%|██████▉   | 11/16 [02:19<01:03, 12.67s/it][A
6.  12%|█▎        | 2/16 [00:28<03:19, 14.25s/it][A
5.  75%|███████▌  | 12/16 [02:32<00:50, 12.68s/it][A
6.  19%|█▉        | 3/16 [00:42<03:05, 14.28s/it][A
5.  81%|████████▏ | 13/16 [02:45<00:38, 12.69s/it][A
6.  25%|██▌       | 4/16 [00:57<02:51, 14.30s/it][A
5.  88%|████████▊ | 14/16 [02:57<00:25, 12.68s/it][A
6.  31%|███▏      | 5/16 [01:11<02:37, 14.30s/it][A
5.  94%|█████████▍| 15/16 [03:10<00:12, 12.67s/it][A


5.  23%|██▎       | 7/30 [1:03:43<2:57:18, 462.53s/it]
5. {'loss': 0.003, 'grad_norm': 0.31435272097587585, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  23%|██▎       | 7/30 [1:05:25<2:57:18, 462.53s/it]
5. {'train_runtime': 203.0803, 'train_samples_per_second': 0.63, 'train_steps_per_second': 0.079, 'train_loss': 0.011754624196328223, 'epoch': 1.0}


5. 100%|██████████| 16/16 [03:23<00:00, 12.66s/it][A
5. [A
5. 100%|██████████| 16/16 [03:23<00:00, 12.66s/it][A
5. [A
5. 100%|██████████| 16/16 [03:23<00:00, 12.66s/it][A
5. 100%|██████████| 16/16 [03:23<00:00, 12.69s/it]
6.  38%|███▊      | 6/16 [01:25<02:23, 14.31s/it][A
6.  44%|████▍     | 7/16 [01:40<02:08, 14.32s/it][A


6. {'loss': 0.0146, 'grad_norm': 0.20179688930511475, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:54<01:54, 14.32s/it][A
6. [A
6.  50%|█████     | 8/16 [01:54<01:54, 14.32s/it][A
6.  56%|█████▋    | 9/16 [02:08<01:40, 14.32s/it][A
6.  62%|██████▎   | 10/16 [02:23<01:25, 14.31s/it][A
6.  69%|██████▉   | 11/16 [02:37<01:11, 14.32s/it][A
6.  75%|███████▌  | 12/16 [02:51<00:57, 14.31s/it][A
6.  81%|████████▏ | 13/16 [03:05<00:42, 14.31s/it][A


4.  30%|███       | 9/30 [1:13:57<3:07:35, 535.98s/it]*** -> Training took 123.9769 seconds.
4.  33%|███▎      | 10/30 [1:16:44<2:33:45, 461.25s/it]retraining model for key '58f5dbd5' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 941.03 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 925.43 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...
7.  30%|███       | 9/30 [1:15:28<2:48:41, 481.96s/it]*** -> Training took 253.8248 seconds.
7.  33%|███▎      | 10/30 [1:18:25<2:35:45, 467.25s/it]retraining model for key '3a25b0d8' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1355.97 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160
6.  88%|████████▊ | 14/16 [03:20<00:28, 14.32s/it][A


7. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  94%|█████████▍| 15/16 [03:34<00:14, 14.32s/it][A


6.  30%|███       | 9/30 [1:10:47<2:56:09, 503.31s/it]
6. {'loss': 0.0039, 'grad_norm': 0.059993449598550797, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  30%|███       | 9/30 [1:12:42<2:56:09, 503.31s/it]
6. {'train_runtime': 228.959, 'train_samples_per_second': 0.559, 'train_steps_per_second': 0.07, 'train_loss': 0.009262607200071216, 'epoch': 1.0}


6. 100%|██████████| 16/16 [03:48<00:00, 14.32s/it][A
6. [A
6. 100%|██████████| 16/16 [03:48<00:00, 14.32s/it][A
6. [A
6. 100%|██████████| 16/16 [03:48<00:00, 14.32s/it][A
6. 100%|██████████| 16/16 [03:48<00:00, 14.31s/it]
4.   6%|▋         | 1/16 [00:17<04:22, 17.51s/it][A
7.   6%|▋         | 1/16 [00:12<03:14, 12.96s/it][A


5.  23%|██▎       | 7/30 [1:05:25<2:57:18, 462.53s/it]*** -> Training took 203.0803 seconds.
5.  27%|██▋       | 8/30 [1:07:54<2:37:05, 428.44s/it]retraining model for key '5545f144' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1364.11 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


7.  12%|█▎        | 2/16 [00:25<03:00, 12.92s/it][A
4.  12%|█▎        | 2/16 [00:35<04:05, 17.54s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  19%|█▉        | 3/16 [00:38<02:48, 12.92s/it][A
5.   6%|▋         | 1/16 [00:12<03:09, 12.66s/it][A
4.  19%|█▉        | 3/16 [00:52<03:48, 17.58s/it][A
7.  25%|██▌       | 4/16 [00:51<02:35, 12.96s/it][A
5.  12%|█▎        | 2/16 [00:25<02:57, 12.65s/it][A
4.  25%|██▌       | 4/16 [01:10<03:31, 17.63s/it][A
7.  31%|███▏      | 5/16 [01:04<02:23, 13.01s/it][A
5.  19%|█▉        | 3/16 [00:37<02:44, 12.65s/it][A
7.  38%|███▊      | 6/16 [01:18<02:10, 13.06s/it][A
5.  25%|██▌       | 4/16 [00:50<02:31, 12.63s/it][A
4.  31%|███▏      | 5/16 [01:28<03:14, 17.64s/it][A
7.  44%|████▍     | 7/16 [01:31<01:57, 13.09s/it][A


7. {'loss': 0.0126, 'grad_norm': 0.6833515167236328, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:44<01:44, 13.12s/it][A
7. [A
5.  31%|███▏      | 5/16 [01:03<02:18, 12.62s/it][A
4.  38%|███▊      | 6/16 [01:45<02:56, 17.63s/it][A
7.  50%|█████     | 8/16 [01:44<01:44, 13.12s/it][A
5.  38%|███▊      | 6/16 [01:15<02:05, 12.59s/it][A
4.  44%|████▍     | 7/16 [02:03<02:38, 17.60s/it][A


4. {'loss': 0.0304, 'grad_norm': 1.027778148651123, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:20<02:20, 17.60s/it][A
4. [A
7.  56%|█████▋    | 9/16 [01:57<01:31, 13.13s/it][A
5.  44%|████▍     | 7/16 [01:28<01:53, 12.59s/it][A


5. {'loss': 0.0076, 'grad_norm': 0.06842312216758728, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:40<01:40, 12.59s/it][A
5. [A


6.  30%|███       | 9/30 [1:12:42<2:56:09, 503.31s/it]*** -> Training took 228.959 seconds.
6.  33%|███▎      | 10/30 [1:14:47<2:32:50, 458.55s/it]retraining model for key '7b0280bc' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 799.11 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 787.75 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


5.  50%|█████     | 8/16 [01:40<01:40, 12.59s/it][A
7.  62%|██████▎   | 10/16 [02:10<01:18, 13.15s/it][A
4.  50%|█████     | 8/16 [02:20<02:20, 17.60s/it][A
5.  56%|█████▋    | 9/16 [01:53<01:28, 12.59s/it][A
7.  69%|██████▉   | 11/16 [02:23<01:05, 13.17s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  56%|█████▋    | 9/16 [02:38<02:03, 17.60s/it][A
5.  62%|██████▎   | 10/16 [02:06<01:15, 12.59s/it][A
7.  75%|███████▌  | 12/16 [02:37<00:52, 13.19s/it][A
5.  69%|██████▉   | 11/16 [02:18<01:02, 12.59s/it][A
4.  62%|██████▎   | 10/16 [02:56<01:45, 17.62s/it][A
7.  81%|████████▏ | 13/16 [02:50<00:39, 13.20s/it][A
6.   6%|▋         | 1/16 [00:21<05:23, 21.54s/it][A
5.  75%|███████▌  | 12/16 [02:31<00:50, 12.59s/it][A
7.  88%|████████▊ | 14/16 [03:03<00:26, 13.19s/it][A
4.  69%|██████▉   | 11/16 [03:13<01:28, 17.62s/it][A
5.  81%|████████▏ | 13/16 [02:43<00:37, 12.59s/it][A
6.  12%|█▎        | 2/16 [00:43<05:01, 21.55s/it][A
7.  94%|█████████▍| 15/16 [03:16<00:13, 13.

7.  33%|███▎      | 10/30 [1:20:12<2:35:45, 467.25s/it]
7. {'loss': 0.0009, 'grad_norm': 1.0549622774124146, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  33%|███▎      | 10/30 [1:21:58<2:35:45, 467.25s/it]
7. {'train_runtime': 209.8548, 'train_samples_per_second': 0.61, 'train_steps_per_second': 0.076, 'train_loss': 0.006767401471734047, 'epoch': 1.0}


7. 100%|██████████| 16/16 [03:29<00:00, 13.16s/it][A
7. [A
7. 100%|██████████| 16/16 [03:29<00:00, 13.16s/it][A
7. [A
7. 100%|██████████| 16/16 [03:29<00:00, 13.16s/it][A
7. 100%|██████████| 16/16 [03:29<00:00, 13.12s/it]
4.  75%|███████▌  | 12/16 [03:31<01:10, 17.63s/it][A
5.  88%|████████▊ | 14/16 [02:56<00:25, 12.59s/it][A
6.  19%|█▉        | 3/16 [01:04<04:39, 21.51s/it][A
5.  94%|█████████▍| 15/16 [03:09<00:12, 12.60s/it][A


5.  27%|██▋       | 8/30 [1:09:38<2:37:05, 428.44s/it]
5. {'loss': 0.0032, 'grad_norm': 0.8958998918533325, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  27%|██▋       | 8/30 [1:11:19<2:37:05, 428.44s/it]
5. {'train_runtime': 201.6586, 'train_samples_per_second': 0.635, 'train_steps_per_second': 0.079, 'train_loss': 0.005392419756390154, 'epoch': 1.0}


5. 100%|██████████| 16/16 [03:21<00:00, 12.60s/it][A
5. [A
5. 100%|██████████| 16/16 [03:21<00:00, 12.60s/it][A
5. [A
5. 100%|██████████| 16/16 [03:21<00:00, 12.60s/it][A
5. 100%|██████████| 16/16 [03:21<00:00, 12.60s/it]
4.  81%|████████▏ | 13/16 [03:48<00:52, 17.63s/it][A
6.  25%|██▌       | 4/16 [01:25<04:17, 21.47s/it][A
4.  88%|████████▊ | 14/16 [04:06<00:35, 17.63s/it][A


7.  33%|███▎      | 10/30 [1:21:58<2:35:45, 467.25s/it]*** -> Training took 209.8548 seconds.
7.  37%|███▋      | 11/30 [1:22:44<2:07:47, 403.56s/it]retraining model for key '4c7dc4dd' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 917.69 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 902.05 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  94%|█████████▍| 15/16 [04:24<00:17, 17.64s/it][A


4.  33%|███▎      | 10/30 [1:19:09<2:33:45, 461.25s/it]
4. {'loss': 0.006, 'grad_norm': 0.8989460468292236, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  33%|███▎      | 10/30 [1:21:30<2:33:45, 461.25s/it]
4. {'train_runtime': 282.0379, 'train_samples_per_second': 0.454, 'train_steps_per_second': 0.057, 'train_loss': 0.018169240560382605, 'epoch': 1.0}


4. 100%|██████████| 16/16 [04:42<00:00, 17.67s/it][A
4. [A
4. 100%|██████████| 16/16 [04:42<00:00, 17.67s/it][A
4. [A
4. 100%|██████████| 16/16 [04:42<00:00, 17.67s/it][A
4. 100%|██████████| 16/16 [04:42<00:00, 17.63s/it]
6.  31%|███▏      | 5/16 [01:47<03:56, 21.46s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  38%|███▊      | 6/16 [02:08<03:34, 21.47s/it][A
7.   6%|▋         | 1/16 [00:19<04:57, 19.81s/it][A


4.  33%|███▎      | 10/30 [1:21:30<2:33:45, 461.25s/it]*** -> Training took 282.0379 seconds.
4.  37%|███▋      | 11/30 [1:22:10<2:12:58, 419.92s/it]retraining model for key '5961cc34' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 415.52 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 410.75 examples/s]
6.  44%|████▍     | 7/16 [02:30<03:13, 21.47s/it][A


6. {'loss': 0.0076, 'grad_norm': 0.12882071733474731, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:51<02:51, 21.44s/it][A
6. [A
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


7.  12%|█▎        | 2/16 [00:39<04:37, 19.80s/it][A
6.  50%|█████     | 8/16 [02:51<02:51, 21.44s/it][A
7.  19%|█▉        | 3/16 [00:59<04:17, 19.79s/it][A


5.  27%|██▋       | 8/30 [1:11:19<2:37:05, 428.44s/it]*** -> Training took 201.6586 seconds.
5.  30%|███       | 9/30 [1:13:06<2:17:14, 392.12s/it]retraining model for key '581f7754' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1218.53 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1194.52 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  56%|█████▋    | 9/16 [03:13<02:30, 21.44s/it][A
7.  25%|██▌       | 4/16 [01:19<03:57, 19.80s/it][A
5.   6%|▋         | 1/16 [00:14<03:31, 14.08s/it][A
7.  31%|███▏      | 5/16 [01:39<03:37, 19.81s/it][A
6.  62%|██████▎   | 10/16 [03:34<02:08, 21.44s/it][A
5.  12%|█▎        | 2/16 [00:28<03:16, 14.04s/it][A
4.   6%|▋         | 1/16 [00:39<09:59, 40.00s/it][A
7.  38%|███▊      | 6/16 [01:58<03:18, 19.83s/it][A
5.  19%|█▉        | 3/16 [00:42<03:02, 14.03s/it][A
6.  69%|██████▉   | 11/16 [03:56<01:47, 21.45s/it][A
5.  25%|██▌       | 4/16 [00:56<02:48, 14.04s/it][A
7.  44%|████▍     | 7/16 [02:18<02:58, 19.84s/it][A


7. {'loss': 0.074, 'grad_norm': 3.3172171115875244, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:38<02:38, 19.84s/it][A
7. [A
6.  75%|███████▌  | 12/16 [04:17<01:25, 21.48s/it][A
5.  31%|███▏      | 5/16 [01:10<02:34, 14.05s/it][A
4.  12%|█▎        | 2/16 [01:19<09:18, 39.89s/it][A
7.  50%|█████     | 8/16 [02:38<02:38, 19.84s/it][A
5.  38%|███▊      | 6/16 [01:24<02:20, 14.06s/it][A
6.  81%|████████▏ | 13/16 [04:39<01:04, 21.48s/it][A
5.  44%|████▍     | 7/16 [01:38<02:06, 14.06s/it][A


5. {'loss': 0.0058, 'grad_norm': 0.5094557404518127, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:52<01:52, 14.08s/it][A
5. [A
7.  56%|█████▋    | 9/16 [02:58<02:18, 19.84s/it][A
6.  88%|████████▊ | 14/16 [05:00<00:42, 21.48s/it][A
5.  50%|█████     | 8/16 [01:52<01:52, 14.08s/it][A
4.  19%|█▉        | 3/16 [01:59<08:37, 39.84s/it][A
7.  62%|██████▎   | 10/16 [03:18<01:59, 19.84s/it][A
5.  56%|█████▋    | 9/16 [02:06<01:38, 14.07s/it][A
6.  94%|█████████▍| 15/16 [05:22<00:21, 21.48s/it][A


6.  33%|███▎      | 10/30 [1:17:43<2:32:50, 458.55s/it]
6. {'loss': 0.0047, 'grad_norm': 0.1960277557373047, 'learning_rate': 5e-05, 'epoch': 1.0}


6. 100%|██████████| 16/16 [05:43<00:00, 21.48s/it][A
6. [A
6. 100%|██████████| 16/16 [05:43<00:00, 21.48s/it][A
6. [A


6.  33%|███▎      | 10/30 [1:20:35<2:32:50, 458.55s/it]
6. {'train_runtime': 343.5605, 'train_samples_per_second': 0.373, 'train_steps_per_second': 0.047, 'train_loss': 0.00612979126162827, 'epoch': 1.0}


6. 100%|██████████| 16/16 [05:43<00:00, 21.48s/it][A
6. 100%|██████████| 16/16 [05:43<00:00, 21.47s/it]
7.  69%|██████▉   | 11/16 [03:38<01:39, 19.84s/it][A
5.  62%|██████▎   | 10/16 [02:20<01:24, 14.07s/it][A
5.  69%|██████▉   | 11/16 [02:34<01:10, 14.07s/it][A
4.  25%|██▌       | 4/16 [02:39<07:57, 39.78s/it][A
7.  75%|███████▌  | 12/16 [03:57<01:19, 19.83s/it][A
5.  75%|███████▌  | 12/16 [02:48<00:56, 14.07s/it][A
7.  81%|████████▏ | 13/16 [04:17<00:59, 19.82s/it][A
5.  81%|████████▏ | 13/16 [03:02<00:42, 14.07s/it][A
5.  88%|████████▊ | 14/16 [03:16<00:28, 14.08s/it][A
4.  31%|███▏      | 5/16 [03:18<07:17, 39.74s/it][A
7.  88%|████████▊ | 14/16 [04:37<00:39, 19.82s/it][A
5.  94%|█████████▍| 15/16 [03:31<00:14, 14.09s/it][A


5.  30%|███       | 9/30 [1:15:02<2:17:14, 392.12s/it]
5. {'loss': 0.0008, 'grad_norm': 0.05141795799136162, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  30%|███       | 9/30 [1:16:54<2:17:14, 392.12s/it]
5. {'train_runtime': 225.1004, 'train_samples_per_second': 0.569, 'train_steps_per_second': 0.071, 'train_loss': 0.003282766498159617, 'epoch': 1.0}


5. 100%|██████████| 16/16 [03:45<00:00, 14.08s/it][A
5. [A
5. 100%|██████████| 16/16 [03:45<00:00, 14.08s/it][A
5. [A
5. 100%|██████████| 16/16 [03:45<00:00, 14.08s/it][A
5. 100%|██████████| 16/16 [03:45<00:00, 14.07s/it]
7.  94%|█████████▍| 15/16 [04:57<00:19, 19.82s/it][A


7.  37%|███▋      | 11/30 [1:25:26<2:07:47, 403.56s/it]
7. {'loss': 0.0095, 'grad_norm': 10.883174896240234, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  37%|███▋      | 11/30 [1:28:04<2:07:47, 403.56s/it]
7. {'train_runtime': 317.1824, 'train_samples_per_second': 0.404, 'train_steps_per_second': 0.05, 'train_loss': 0.04173897113651037, 'epoch': 1.0}


7. 100%|██████████| 16/16 [05:17<00:00, 19.82s/it][A
7. [A
7. 100%|██████████| 16/16 [05:17<00:00, 19.82s/it][A
7. [A
7. 100%|██████████| 16/16 [05:17<00:00, 19.82s/it][A
7. 100%|██████████| 16/16 [05:17<00:00, 19.82s/it]
4.  38%|███▊      | 6/16 [03:58<06:37, 39.73s/it][A


7.  37%|███▋      | 11/30 [1:28:04<2:07:47, 403.56s/it]*** -> Training took 317.1824 seconds.
7.  40%|████      | 12/30 [1:28:36<1:56:20, 387.79s/it]retraining model for key '4e34c42c' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1201.56 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1175.48 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


7.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  44%|████▍     | 7/16 [04:38<05:57, 39.71s/it][A


4. {'loss': 0.0016, 'grad_norm': 0.06306633353233337, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [05:18<05:18, 39.79s/it][A
4. [A
7.   6%|▋         | 1/16 [00:14<03:43, 14.90s/it][A
7.  12%|█▎        | 2/16 [00:29<03:28, 14.89s/it][A
7.  19%|█▉        | 3/16 [00:44<03:13, 14.88s/it][A
4.  50%|█████     | 8/16 [05:18<05:18, 39.79s/it][A
7.  25%|██▌       | 4/16 [00:59<02:58, 14.87s/it][A
7.  31%|███▏      | 5/16 [01:14<02:43, 14.87s/it][A


5.  30%|███       | 9/30 [1:16:54<2:17:14, 392.12s/it]*** -> Training took 225.1004 seconds.
5.  33%|███▎      | 10/30 [1:19:14<2:08:13, 384.70s/it]retraining model for key '58490d8a' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 963.27 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 946.98 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  56%|█████▋    | 9/16 [05:58<04:38, 39.82s/it][A
7.  38%|███▊      | 6/16 [01:29<02:28, 14.87s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  44%|████▍     | 7/16 [01:44<02:13, 14.87s/it][A


7. {'loss': 0.0183, 'grad_norm': 0.9747369289398193, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:58<01:58, 14.87s/it][A
7. [A
7.  50%|█████     | 8/16 [01:58<01:58, 14.87s/it][A
5.   6%|▋         | 1/16 [00:17<04:20, 17.38s/it][A
4.  62%|██████▎   | 10/16 [06:38<03:58, 39.82s/it][A
7.  56%|█████▋    | 9/16 [02:13<01:44, 14.87s/it][A
5.  12%|█▎        | 2/16 [00:34<04:03, 17.43s/it][A
7.  62%|██████▎   | 10/16 [02:28<01:29, 14.87s/it][A
5.  19%|█▉        | 3/16 [00:52<03:47, 17.50s/it][A


6.  33%|███▎      | 10/30 [1:20:35<2:32:50, 458.55s/it]*** -> Training took 343.5605 seconds.
6.  37%|███▋      | 11/30 [1:25:30<2:43:03, 514.93s/it]retraining model for key '8698868d' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 996.18 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 977.95 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


7.  69%|██████▉   | 11/16 [02:43<01:14, 14.87s/it][A
4.  69%|██████▉   | 11/16 [07:17<03:19, 39.85s/it][A
5.  25%|██▌       | 4/16 [01:10<03:30, 17.54s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  75%|███████▌  | 12/16 [02:58<00:59, 14.86s/it][A
5.  31%|███▏      | 5/16 [01:27<03:13, 17.57s/it][A
7.  81%|████████▏ | 13/16 [03:13<00:44, 14.87s/it][A
6.   6%|▋         | 1/16 [00:16<04:14, 16.96s/it][A
5.  38%|███▊      | 6/16 [01:45<02:55, 17.60s/it][A
7.  88%|████████▊ | 14/16 [03:28<00:29, 14.89s/it][A
4.  75%|███████▌  | 12/16 [07:57<02:39, 39.88s/it][A
6.  12%|█▎        | 2/16 [00:34<03:58, 17.01s/it][A
7.  94%|█████████▍| 15/16 [03:43<00:14, 14.91s/it][A
7. 100%|██████████| 16/16 [03:58<00:00, 14.90s/it][A
7. [A
7. 100%|██████████| 16/16 [03:58<00:00, 14.90s/it][A
7. [A
7. 100%|██████████| 16/16 [03:58<00:00, 14.90s/it][A
7. 100%|██████████| 16/16 [03:58<00:00, 14.88s/it]


7.  40%|████      | 12/30 [1:30:38<1:56:20, 387.79s/it]
7. {'loss': 0.0011, 'grad_norm': 0.6665194630622864, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  40%|████      | 12/30 [1:32:37<1:56:20, 387.79s/it]
7. {'train_runtime': 238.1081, 'train_samples_per_second': 0.538, 'train_steps_per_second': 0.067, 'train_loss': 0.00969544576946646, 'epoch': 1.0}


5.  44%|████▍     | 7/16 [02:02<02:38, 17.63s/it][A


5. {'loss': 0.0166, 'grad_norm': 0.7819599509239197, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:20<02:21, 17.65s/it][A
5. [A
6.  19%|█▉        | 3/16 [00:51<03:41, 17.03s/it][A
5.  50%|█████     | 8/16 [02:20<02:21, 17.65s/it][A
6.  25%|██▌       | 4/16 [01:08<03:24, 17.04s/it][A
4.  81%|████████▏ | 13/16 [08:37<01:59, 39.88s/it][A


7.  40%|████      | 12/30 [1:32:37<1:56:20, 387.79s/it]*** -> Training took 238.1081 seconds.
7.  43%|████▎     | 13/30 [1:33:10<1:40:08, 353.46s/it]retraining model for key '62593bfd' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 603.22 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 595.10 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


5.  56%|█████▋    | 9/16 [02:38<02:03, 17.68s/it][A
6.  31%|███▏      | 5/16 [01:25<03:07, 17.03s/it][A
5.  62%|██████▎   | 10/16 [02:56<01:46, 17.69s/it][A
6.  38%|███▊      | 6/16 [01:42<02:50, 17.01s/it][A
4.  88%|████████▊ | 14/16 [09:17<01:19, 39.87s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  44%|████▍     | 7/16 [01:59<02:33, 17.03s/it][A


6. {'loss': 0.0098, 'grad_norm': 0.3585757911205292, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:16<02:16, 17.03s/it][A
6. [A
5.  69%|██████▉   | 11/16 [03:13<01:28, 17.70s/it][A
6.  50%|█████     | 8/16 [02:16<02:16, 17.03s/it][A
5.  75%|███████▌  | 12/16 [03:31<01:10, 17.70s/it][A
7.   6%|▋         | 1/16 [00:31<07:47, 31.19s/it][A
4.  94%|█████████▍| 15/16 [09:57<00:39, 39.84s/it][A


4.  37%|███▋      | 11/30 [1:27:33<2:12:58, 419.92s/it]
4. {'loss': 0.0002, 'grad_norm': 0.1908053159713745, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  37%|███▋      | 11/30 [1:32:52<2:12:58, 419.92s/it]
4. {'train_runtime': 637.0568, 'train_samples_per_second': 0.201, 'train_steps_per_second': 0.025, 'train_loss': 0.0009092597611015663, 'epoch': 1.0}


4. 100%|██████████| 16/16 [10:37<00:00, 39.79s/it][A
4. [A
4. 100%|██████████| 16/16 [10:37<00:00, 39.79s/it][A
4. [A
4. 100%|██████████| 16/16 [10:37<00:00, 39.79s/it][A
4. 100%|██████████| 16/16 [10:37<00:00, 39.82s/it]
6.  56%|█████▋    | 9/16 [02:33<01:59, 17.02s/it][A
5.  81%|████████▏ | 13/16 [03:49<00:53, 17.70s/it][A
6.  62%|██████▎   | 10/16 [02:50<01:42, 17.01s/it][A
5.  88%|████████▊ | 14/16 [04:06<00:35, 17.70s/it][A
7.  12%|█▎        | 2/16 [01:02<07:17, 31.22s/it][A
6.  69%|██████▉   | 11/16 [03:07<01:25, 17.00s/it][A
5.  94%|█████████▍| 15/16 [04:24<00:17, 17.71s/it][A


5.  33%|███▎      | 10/30 [1:21:38<2:08:13, 384.70s/it]
5. {'loss': 0.0076, 'grad_norm': 0.3308018147945404, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  33%|███▎      | 10/30 [1:24:00<2:08:13, 384.70s/it]
5. {'train_runtime': 282.4602, 'train_samples_per_second': 0.453, 'train_steps_per_second': 0.057, 'train_loss': 0.012092412449419498, 'epoch': 1.0}


5. 100%|██████████| 16/16 [04:42<00:00, 17.72s/it][A
5. [A
5. 100%|██████████| 16/16 [04:42<00:00, 17.72s/it][A
5. [A
5. 100%|██████████| 16/16 [04:42<00:00, 17.72s/it][A
5. 100%|██████████| 16/16 [04:42<00:00, 17.65s/it]
6.  75%|███████▌  | 12/16 [03:24<01:08, 17.00s/it][A
7.  19%|█▉        | 3/16 [01:33<06:46, 31.23s/it][A
6.  81%|████████▏ | 13/16 [03:41<00:50, 17.00s/it][A


5.  33%|███▎      | 10/30 [1:24:00<2:08:13, 384.70s/it]*** -> Training took 282.4602 seconds.
5.  37%|███▋      | 11/30 [1:24:36<1:55:41, 365.32s/it]retraining model for key '65b59efc' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1189.53 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1167.53 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


6.  88%|████████▊ | 14/16 [03:58<00:33, 17.00s/it][A
7.  25%|██▌       | 4/16 [02:04<06:14, 31.24s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  94%|█████████▍| 15/16 [04:15<00:16, 16.99s/it][A


6.  37%|███▋      | 11/30 [1:27:50<2:43:03, 514.93s/it]


6. 100%|██████████| 16/16 [04:32<00:00, 16.98s/it][A
6. [A
6. 100%|██████████| 16/16 [04:32<00:00, 16.98s/it][A
6. [A
6. 100%|██████████| 16/16 [04:32<00:00, 16.98s/it][A
6. 100%|██████████| 16/16 [04:32<00:00, 17.00s/it]


6. {'loss': 0.0009, 'grad_norm': 0.1422388106584549, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  37%|███▋      | 11/30 [1:30:06<2:43:03, 514.93s/it]
6. {'train_runtime': 272.0739, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.059, 'train_loss': 0.005361829826142639, 'epoch': 1.0}


5.   6%|▋         | 1/16 [00:14<03:38, 14.60s/it][A
7.  31%|███▏      | 5/16 [02:36<05:43, 31.26s/it][A
5.  12%|█▎        | 2/16 [00:29<03:24, 14.58s/it][A
5.  19%|█▉        | 3/16 [00:43<03:09, 14.56s/it][A
5.  25%|██▌       | 4/16 [00:58<02:54, 14.54s/it][A
7.  38%|███▊      | 6/16 [03:07<05:12, 31.29s/it][A


6.  37%|███▋      | 11/30 [1:30:06<2:43:03, 514.93s/it]*** -> Training took 272.0739 seconds.
6.  40%|████      | 12/30 [1:30:53<2:16:57, 456.50s/it]retraining model for key '8f215267' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 517.85 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 511.22 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


5.  31%|███▏      | 5/16 [01:12<02:39, 14.53s/it][A
5.  38%|███▊      | 6/16 [01:27<02:25, 14.52s/it][A
7.  44%|████▍     | 7/16 [03:38<04:41, 31.33s/it][A


7. {'loss': 0.0046, 'grad_norm': 0.0870102196931839, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [04:10<04:11, 31.39s/it][A
7. [A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  44%|████▍     | 7/16 [01:41<02:10, 14.51s/it][A


5. {'loss': 0.0135, 'grad_norm': 0.3909570574760437, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:56<01:56, 14.51s/it][A
5. [A
5.  50%|█████     | 8/16 [01:56<01:56, 14.51s/it][A
7.  50%|█████     | 8/16 [04:10<04:11, 31.39s/it][A
5.  56%|█████▋    | 9/16 [02:10<01:41, 14.51s/it][A
6.   6%|▋         | 1/16 [00:34<08:31, 34.12s/it][A
5.  62%|██████▎   | 10/16 [02:25<01:27, 14.51s/it][A
7.  56%|█████▋    | 9/16 [04:41<03:39, 31.40s/it][A
5.  69%|██████▉   | 11/16 [02:39<01:12, 14.51s/it][A
6.  12%|█▎        | 2/16 [01:08<07:56, 34.05s/it][A
5.  75%|███████▌  | 12/16 [02:54<00:58, 14.51s/it][A
7.  62%|██████▎   | 10/16 [05:13<03:08, 31.36s/it][A
5.  81%|████████▏ | 13/16 [03:08<00:43, 14.50s/it][A
6.  19%|█▉        | 3/16 [01:42<07:21, 34.00s/it][A
5.  88%|████████▊ | 14/16 [03:23<00:28, 14.50s/it][A
7.  69%|██████▉   | 11/16 [05:44<02:36, 31.34s/it][A
5.  94%|█████████▍| 15/16 [03:37<00:14, 14.50s/it][A


5.  37%|███▋      | 11/30 [1:26:35<1:55:41, 365.32s/it]
5. {'loss': 0.0027, 'grad_norm': 0.4079092741012573, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  37%|███▋      | 11/30 [1:28:31<1:55:41, 365.32s/it]
5. {'train_runtime': 232.2491, 'train_samples_per_second': 0.551, 'train_steps_per_second': 0.069, 'train_loss': 0.008108763489872217, 'epoch': 1.0}


5. 100%|██████████| 16/16 [03:52<00:00, 14.50s/it][A
5. [A
5. 100%|██████████| 16/16 [03:52<00:00, 14.50s/it][A
5. [A
5. 100%|██████████| 16/16 [03:52<00:00, 14.50s/it][A
5. 100%|██████████| 16/16 [03:52<00:00, 14.52s/it]
6.  25%|██▌       | 4/16 [02:16<06:48, 34.01s/it][A
7.  75%|███████▌  | 12/16 [06:15<02:05, 31.34s/it][A
6.  31%|███▏      | 5/16 [02:50<06:14, 34.00s/it][A


5.  37%|███▋      | 11/30 [1:28:31<1:55:41, 365.32s/it]*** -> Training took 232.2491 seconds.
5.  40%|████      | 12/30 [1:29:24<1:42:35, 341.98s/it]retraining model for key '6ffbe589' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 940.27 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 925.27 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


7.  81%|████████▏ | 13/16 [06:47<01:33, 31.32s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A


4.  37%|███▋      | 11/30 [1:32:52<2:12:58, 419.92s/it]*** -> Training took 637.0568 seconds.
4.  40%|████      | 12/30 [1:39:21<3:01:43, 605.76s/it]retraining model for key '5dbc8537' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1542.52 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


6.  38%|███▊      | 6/16 [03:24<05:40, 34.01s/it][A
7.  88%|████████▊ | 14/16 [07:18<01:02, 31.31s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
5.   6%|▋         | 1/16 [00:18<04:31, 18.08s/it][A
4.   6%|▋         | 1/16 [00:10<02:37, 10.47s/it][A
5.  12%|█▎        | 2/16 [00:36<04:13, 18.08s/it][A
4.  12%|█▎        | 2/16 [00:20<02:25, 10.41s/it][A
6.  44%|████▍     | 7/16 [03:58<05:05, 33.99s/it][A


6. {'loss': 0.004, 'grad_norm': 0.22683216631412506, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [04:32<04:32, 34.00s/it][A
6. [A
7.  94%|█████████▍| 15/16 [07:49<00:31, 31.29s/it][A


7.  43%|████▎     | 13/30 [1:37:25<1:40:08, 353.46s/it]
7. {'loss': 0.0007, 'grad_norm': 0.08454418927431107, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  43%|████▎     | 13/30 [1:41:35<1:40:08, 353.46s/it]
7. {'train_runtime': 500.9331, 'train_samples_per_second': 0.256, 'train_steps_per_second': 0.032, 'train_loss': 0.0026422272494528443, 'epoch': 1.0}


7. 100%|██████████| 16/16 [08:20<00:00, 31.29s/it][A
7. [A
7. 100%|██████████| 16/16 [08:20<00:00, 31.29s/it][A
7. [A
7. 100%|██████████| 16/16 [08:20<00:00, 31.29s/it][A
7. 100%|██████████| 16/16 [08:20<00:00, 31.31s/it]
4.  19%|█▉        | 3/16 [00:31<02:15, 10.39s/it][A
5.  19%|█▉        | 3/16 [00:54<03:55, 18.09s/it][A
4.  25%|██▌       | 4/16 [00:41<02:04, 10.38s/it][A
4.  31%|███▏      | 5/16 [00:51<01:54, 10.38s/it][A
5.  25%|██▌       | 4/16 [01:12<03:36, 18.08s/it][A
6.  50%|█████     | 8/16 [04:32<04:32, 34.00s/it][A
4.  38%|███▊      | 6/16 [01:02<01:43, 10.38s/it][A
4.  44%|████▍     | 7/16 [01:12<01:33, 10.38s/it][A


4. {'loss': 0.0189, 'grad_norm': 0.4354828894138336, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:23<01:23, 10.38s/it][A
4. [A
5.  31%|███▏      | 5/16 [01:30<03:19, 18.11s/it][A
4.  50%|█████     | 8/16 [01:23<01:23, 10.38s/it][A
5.  38%|███▊      | 6/16 [01:48<03:01, 18.12s/it][A
6.  56%|█████▋    | 9/16 [05:06<03:58, 34.01s/it][A
4.  56%|█████▋    | 9/16 [01:33<01:12, 10.39s/it][A
4.  62%|██████▎   | 10/16 [01:43<01:02, 10.38s/it][A
5.  44%|████▍     | 7/16 [02:06<02:42, 18.09s/it][A


5. {'loss': 0.0226, 'grad_norm': 1.3693344593048096, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:24<02:24, 18.07s/it][A
5. [A
4.  69%|██████▉   | 11/16 [01:54<00:51, 10.38s/it][A
4.  75%|███████▌  | 12/16 [02:04<00:41, 10.37s/it][A
5.  50%|█████     | 8/16 [02:24<02:24, 18.07s/it][A
6.  62%|██████▎   | 10/16 [05:40<03:24, 34.00s/it][A
4.  81%|████████▏ | 13/16 [02:14<00:31, 10.38s/it][A
5.  56%|█████▋    | 9/16 [02:42<02:06, 18.05s/it][A
4.  88%|████████▊ | 14/16 [02:25<00:20, 10.37s/it][A
4.  94%|█████████▍| 15/16 [02:35<00:10, 10.37s/it][A


4.  40%|████      | 12/30 [1:40:47<3:01:43, 605.76s/it]


4. 100%|██████████| 16/16 [02:46<00:00, 10.37s/it][A
4. [A
4. 100%|██████████| 16/16 [02:46<00:00, 10.37s/it][A
4. [A


4. {'loss': 0.0022, 'grad_norm': 0.028023062273859978, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  40%|████      | 12/30 [1:42:10<3:01:43, 605.76s/it]
4. {'train_runtime': 166.0786, 'train_samples_per_second': 0.771, 'train_steps_per_second': 0.096, 'train_loss': 0.01058955246116966, 'epoch': 1.0}


4. 100%|██████████| 16/16 [02:46<00:00, 10.37s/it][A
4. 100%|██████████| 16/16 [02:46<00:00, 10.38s/it]
6.  69%|██████▉   | 11/16 [06:14<02:49, 34.00s/it][A
5.  62%|██████▎   | 10/16 [03:00<01:48, 18.09s/it][A
5.  69%|██████▉   | 11/16 [03:19<01:30, 18.11s/it][A


4.  40%|████      | 12/30 [1:42:10<3:01:43, 605.76s/it]*** -> Training took 166.0786 seconds.
4.  43%|████▎     | 13/30 [1:42:40<2:16:40, 482.40s/it]retraining model for key '64efde09' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 589.27 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 580.46 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


6.  75%|███████▌  | 12/16 [06:48<02:16, 34.00s/it][A
5.  75%|███████▌  | 12/16 [03:37<01:12, 18.12s/it][A
5.  81%|████████▏ | 13/16 [03:55<00:54, 18.10s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  81%|████████▏ | 13/16 [07:22<01:42, 34.00s/it][A
5.  88%|████████▊ | 14/16 [04:13<00:36, 18.09s/it][A
4.   6%|▋         | 1/16 [00:28<07:10, 28.72s/it][A
5.  94%|█████████▍| 15/16 [04:31<00:18, 18.11s/it][A


5.  40%|████      | 12/30 [1:31:52<1:42:35, 341.98s/it]
5. {'loss': 0.0068, 'grad_norm': 1.3293839693069458, 'learning_rate': 5e-05, 'epoch': 1.0}


5. 100%|██████████| 16/16 [04:49<00:00, 18.13s/it][A
5. [A


5.  40%|████      | 12/30 [1:34:17<1:42:35, 341.98s/it]
5. {'train_runtime': 289.6147, 'train_samples_per_second': 0.442, 'train_steps_per_second': 0.055, 'train_loss': 0.01468356722034514, 'epoch': 1.0}


5. 100%|██████████| 16/16 [04:49<00:00, 18.13s/it][A
5. [A
5. 100%|██████████| 16/16 [04:49<00:00, 18.13s/it][A
5. 100%|██████████| 16/16 [04:49<00:00, 18.10s/it]
6.  88%|████████▊ | 14/16 [07:56<01:07, 33.99s/it][A


5.  40%|████      | 12/30 [1:34:17<1:42:35, 341.98s/it]*** -> Training took 289.6147 seconds.
5.  43%|████▎     | 13/30 [1:34:27<1:33:33, 330.22s/it]retraining model for key '800d221b' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 984.70 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 967.73 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  12%|█▎        | 2/16 [00:57<06:42, 28.75s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  94%|█████████▍| 15/16 [08:30<00:34, 34.00s/it][A


6.  40%|████      | 12/30 [1:35:29<2:16:57, 456.50s/it]
6. {'loss': 0.0014, 'grad_norm': 0.1596417874097824, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  40%|████      | 12/30 [1:40:01<2:16:57, 456.50s/it]
6. {'train_runtime': 544.0317, 'train_samples_per_second': 0.235, 'train_steps_per_second': 0.029, 'train_loss': 0.0026847630506381392, 'epoch': 1.0}


6. 100%|██████████| 16/16 [09:04<00:00, 33.98s/it][A
6. [A
6. 100%|██████████| 16/16 [09:04<00:00, 33.98s/it][A
6. [A
6. 100%|██████████| 16/16 [09:04<00:00, 33.98s/it][A
6. 100%|██████████| 16/16 [09:04<00:00, 34.00s/it]
5.   6%|▋         | 1/16 [00:17<04:21, 17.41s/it][A
4.  19%|█▉        | 3/16 [01:26<06:14, 28.78s/it][A
5.  12%|█▎        | 2/16 [00:34<04:04, 17.48s/it][A
4.  25%|██▌       | 4/16 [01:55<05:46, 28.86s/it][A
5.  19%|█▉        | 3/16 [00:52<03:47, 17.47s/it][A
5.  25%|██▌       | 4/16 [01:09<03:29, 17.43s/it][A
4.  31%|███▏      | 5/16 [02:24<05:17, 28.87s/it][A
5.  31%|███▏      | 5/16 [01:27<03:11, 17.39s/it][A
5.  38%|███▊      | 6/16 [01:44<02:53, 17.35s/it][A
4.  38%|███▊      | 6/16 [02:53<04:48, 28.86s/it][A
5.  44%|████▍     | 7/16 [02:01<02:36, 17.36s/it][A


5. {'loss': 0.0023, 'grad_norm': 0.20553365349769592, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:19<02:18, 17.37s/it][A
5. [A


7.  43%|████▎     | 13/30 [1:41:35<1:40:08, 353.46s/it]*** -> Training took 500.9331 seconds.
7.  47%|████▋     | 14/30 [1:48:00<2:17:28, 515.54s/it]retraining model for key '67e490f4' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 813.04 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 798.75 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  44%|████▍     | 7/16 [03:21<04:19, 28.85s/it][A


4. {'loss': 0.004, 'grad_norm': 0.137748122215271, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [03:50<03:50, 28.86s/it][A
4. [A
5.  50%|█████     | 8/16 [02:19<02:18, 17.37s/it][A
5.  56%|█████▋    | 9/16 [02:36<02:01, 17.39s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  50%|█████     | 8/16 [03:50<03:50, 28.86s/it][A
5.  62%|██████▎   | 10/16 [02:53<01:44, 17.38s/it][A
7.   6%|▋         | 1/16 [00:22<05:41, 22.73s/it][A
5.  69%|██████▉   | 11/16 [03:11<01:26, 17.38s/it][A
4.  56%|█████▋    | 9/16 [04:19<03:22, 28.86s/it][A
7.  12%|█▎        | 2/16 [00:45<05:17, 22.71s/it][A
5.  75%|███████▌  | 12/16 [03:28<01:09, 17.39s/it][A
4.  62%|██████▎   | 10/16 [04:48<02:53, 28.87s/it][A
7.  19%|█▉        | 3/16 [01:08<04:55, 22.70s/it][A
5.  81%|████████▏ | 13/16 [03:46<00:52, 17.39s/it][A
5.  88%|████████▊ | 14/16 [04:03<00:34, 17.38s/it][A
7.  25%|██▌       | 4/16 [01:30<04:32, 22.70s/it][A
4.  69%|██████▉   | 11/16 [05:17<02:24, 28.87s/it][A


6.  40%|████      | 12/30 [1:40:01<2:16:57, 456.50s/it]*** -> Training took 544.0317 seconds.


5.  94%|█████████▍| 15/16 [04:20<00:17, 17.38s/it][A


5.  43%|████▎     | 13/30 [1:36:50<1:33:33, 330.22s/it]
5. {'loss': 0.0005, 'grad_norm': 0.2673611640930176, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  43%|████▎     | 13/30 [1:39:09<1:33:33, 330.22s/it]
5. {'train_runtime': 278.2424, 'train_samples_per_second': 0.46, 'train_steps_per_second': 0.058, 'train_loss': 0.0013723369338549674, 'epoch': 1.0}


5. 100%|██████████| 16/16 [04:38<00:00, 17.38s/it][A
5. [A
5. 100%|██████████| 16/16 [04:38<00:00, 17.38s/it][A
5. [A
5. 100%|██████████| 16/16 [04:38<00:00, 17.38s/it][A
5. 100%|██████████| 16/16 [04:38<00:00, 17.39s/it]


6.  43%|████▎     | 13/30 [1:44:09<2:38:29, 559.37s/it]retraining model for key '9aaea919' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 289.20 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 286.34 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


7.  31%|███▏      | 5/16 [01:53<04:09, 22.70s/it][A
4.  75%|███████▌  | 12/16 [05:46<01:55, 28.87s/it][A
7.  38%|███▊      | 6/16 [02:16<03:47, 22.71s/it][A
4.  81%|████████▏ | 13/16 [06:15<01:26, 28.87s/it][A
7.  44%|████▍     | 7/16 [02:39<03:24, 22.73s/it][A


7. {'loss': 0.0105, 'grad_norm': 0.3553115129470825, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [03:01<03:01, 22.74s/it][A
7. [A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  88%|████████▊ | 14/16 [06:44<00:57, 28.91s/it][A
7.  50%|█████     | 8/16 [03:01<03:01, 22.74s/it][A
7.  56%|█████▋    | 9/16 [03:24<02:39, 22.73s/it][A
4.  94%|█████████▍| 15/16 [07:13<00:28, 28.94s/it][A


4.  43%|████▎     | 13/30 [1:46:34<2:16:40, 482.40s/it]
4. {'loss': 0.0008, 'grad_norm': 0.22575818002223969, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  43%|████▎     | 13/30 [1:50:26<2:16:40, 482.40s/it]
4. {'train_runtime': 462.0083, 'train_samples_per_second': 0.277, 'train_steps_per_second': 0.035, 'train_loss': 0.0024166252405848354, 'epoch': 1.0}


4. 100%|██████████| 16/16 [07:41<00:00, 28.93s/it][A
4. [A
4. 100%|██████████| 16/16 [07:41<00:00, 28.93s/it][A
4. [A
4. 100%|██████████| 16/16 [07:42<00:00, 28.93s/it][A
4. 100%|██████████| 16/16 [07:42<00:00, 28.88s/it]
7.  62%|██████▎   | 10/16 [03:47<02:16, 22.73s/it][A
6.   6%|▋         | 1/16 [01:02<15:36, 62.43s/it][A
7.  69%|██████▉   | 11/16 [04:09<01:53, 22.72s/it][A
7.  75%|███████▌  | 12/16 [04:32<01:30, 22.72s/it][A
6.  12%|█▎        | 2/16 [02:04<14:29, 62.08s/it][A
7.  81%|████████▏ | 13/16 [04:55<01:08, 22.73s/it][A
7.  88%|████████▊ | 14/16 [05:18<00:45, 22.72s/it][A
7.  94%|█████████▍| 15/16 [05:40<00:22, 22.73s/it][A


7.  47%|████▋     | 14/30 [1:51:06<2:17:28, 515.54s/it]
7. {'loss': 0.0007, 'grad_norm': 0.6450966596603394, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  47%|████▋     | 14/30 [1:54:08<2:17:28, 515.54s/it]
7. {'train_runtime': 363.5274, 'train_samples_per_second': 0.352, 'train_steps_per_second': 0.044, 'train_loss': 0.005572533002123237, 'epoch': 1.0}


7. 100%|██████████| 16/16 [06:03<00:00, 22.72s/it][A
7. [A
7. 100%|██████████| 16/16 [06:03<00:00, 22.72s/it][A
7. [A
7. 100%|██████████| 16/16 [06:03<00:00, 22.72s/it][A
7. 100%|██████████| 16/16 [06:03<00:00, 22.72s/it]
6.  19%|█▉        | 3/16 [03:06<13:26, 62.03s/it][A


7.  47%|████▋     | 14/30 [1:54:08<2:17:28, 515.54s/it]*** -> Training took 363.5274 seconds.
7.  50%|█████     | 15/30 [1:54:46<2:00:35, 482.35s/it]retraining model for key '7491f3cf' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1127.61 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1107.33 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...
4.  43%|████▎     | 13/30 [1:50:26<2:16:40, 482.40s/it]*** -> Training took 462.0083 seconds.
4.  47%|████▋     | 14/30 [1:53:31<2:22:15, 533.48s/it]retraining model for key '6e4f6532' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 752.20 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 739.82 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


7.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   6%|▋         | 1/16 [00:16<04:10, 16.71s/it][A
6.  25%|██▌       | 4/16 [04:08<12:24, 62.01s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  12%|█▎        | 2/16 [00:33<03:53, 16.68s/it][A
4.   6%|▋         | 1/16 [00:22<05:35, 22.39s/it][A
7.  19%|█▉        | 3/16 [00:50<03:36, 16.68s/it][A
4.  12%|█▎        | 2/16 [00:44<05:13, 22.37s/it][A
7.  25%|██▌       | 4/16 [01:06<03:20, 16.69s/it][A
6.  31%|███▏      | 5/16 [05:10<11:21, 61.95s/it][A


5.  43%|████▎     | 13/30 [1:39:09<1:33:33, 330.22s/it]*** -> Training took 278.2424 seconds.


7.  31%|███▏      | 5/16 [01:23<03:03, 16.71s/it][A


5.  47%|████▋     | 14/30 [1:45:28<1:54:41, 430.08s/it]retraining model for key '88e364bc' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 647.44 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 639.12 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  19%|█▉        | 3/16 [01:07<04:50, 22.36s/it][A
7.  38%|███▊      | 6/16 [01:40<02:47, 16.71s/it][A
4.  25%|██▌       | 4/16 [01:29<04:28, 22.38s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  44%|████▍     | 7/16 [01:56<02:30, 16.72s/it][A


7. {'loss': 0.0086, 'grad_norm': 0.20788699388504028, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:13<02:13, 16.72s/it][A
7. [A
4.  31%|███▏      | 5/16 [01:51<04:06, 22.40s/it][A
7.  50%|█████     | 8/16 [02:13<02:13, 16.72s/it][A
5.   6%|▋         | 1/16 [00:27<06:49, 27.30s/it][A
6.  38%|███▊      | 6/16 [06:11<10:19, 61.95s/it][A
7.  56%|█████▋    | 9/16 [02:30<01:56, 16.71s/it][A
4.  38%|███▊      | 6/16 [02:14<03:43, 22.40s/it][A
7.  62%|██████▎   | 10/16 [02:47<01:40, 16.71s/it][A
5.  12%|█▎        | 2/16 [00:54<06:22, 27.30s/it][A
4.  44%|████▍     | 7/16 [02:36<03:21, 22.38s/it][A


4. {'loss': 0.0102, 'grad_norm': 0.39253416657447815, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:59<02:58, 22.36s/it][A
4. [A
7.  69%|██████▉   | 11/16 [03:03<01:23, 16.73s/it][A
5.  19%|█▉        | 3/16 [01:21<05:55, 27.32s/it][A
4.  50%|█████     | 8/16 [02:59<02:58, 22.36s/it][A
7.  75%|███████▌  | 12/16 [03:20<01:06, 16.73s/it][A
6.  44%|████▍     | 7/16 [07:13<09:17, 61.92s/it][A


6. {'loss': 0.0051, 'grad_norm': 0.080276720225811, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [08:15<08:15, 61.93s/it][A
6. [A
7.  81%|████████▏ | 13/16 [03:37<00:50, 16.72s/it][A
4.  56%|█████▋    | 9/16 [03:21<02:36, 22.34s/it][A
5.  25%|██▌       | 4/16 [01:49<05:27, 27.31s/it][A
7.  88%|████████▊ | 14/16 [03:53<00:33, 16.72s/it][A
4.  62%|██████▎   | 10/16 [03:43<02:14, 22.34s/it][A
7.  94%|█████████▍| 15/16 [04:10<00:16, 16.72s/it][A


7.  50%|█████     | 15/30 [1:57:03<2:00:35, 482.35s/it]
7. {'loss': 0.004, 'grad_norm': 0.14301863312721252, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  50%|█████     | 15/30 [1:59:17<2:00:35, 482.35s/it]


7. 100%|██████████| 16/16 [04:27<00:00, 16.71s/it][A
7. [A
7. 100%|██████████| 16/16 [04:27<00:00, 16.71s/it][A
7. [A


7. {'train_runtime': 267.4178, 'train_samples_per_second': 0.479, 'train_steps_per_second': 0.06, 'train_loss': 0.006332793738692999, 'epoch': 1.0}


7. 100%|██████████| 16/16 [04:27<00:00, 16.71s/it][A
7. 100%|██████████| 16/16 [04:27<00:00, 16.71s/it]
5.  31%|███▏      | 5/16 [02:16<05:00, 27.31s/it][A
6.  50%|█████     | 8/16 [08:15<08:15, 61.93s/it][A
4.  69%|██████▉   | 11/16 [04:05<01:51, 22.34s/it][A
5.  38%|███▊      | 6/16 [02:43<04:33, 27.32s/it][A
4.  75%|███████▌  | 12/16 [04:28<01:29, 22.33s/it][A
5.  44%|████▍     | 7/16 [03:11<04:05, 27.31s/it][A
5.  50%|█████     | 8/16 [03:38<03:38, 27.32s/it][A
5. [A


5. {'loss': 0.0029, 'grad_norm': 0.060749270021915436, 'learning_rate': 5e-05, 'epoch': 0.5}
7.  50%|█████     | 15/30 [1:59:17<2:00:35, 482.35s/it]*** -> Training took 267.4178 seconds.
7.  53%|█████▎    | 16/30 [2:00:17<1:41:57, 436.95s/it]retraining model for key '7666fa5d' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1527.44 examples/s]
4.  81%|████████▏ | 13/16 [04:50<01:06, 22.31s/it][A
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  56%|█████▋    | 9/16 [09:17<07:13, 61.92s/it][A
5.  50%|█████     | 8/16 [03:38<03:38, 27.32s/it][A
4.  88%|████████▊ | 14/16 [05:12<00:44, 22.30s/it][A
7.   6%|▋         | 1/16 [00:11<02:47, 11.20s/it][A
7.  12%|█▎        | 2/16 [00:22<02:36, 11.18s/it][A
4.  94%|█████████▍| 15/16 [05:35<00:22, 22.29s/it][A


4.  47%|████▋     | 14/30 [1:56:34<2:22:15, 533.48s/it]
4. {'loss': 0.0023, 'grad_norm': 0.18445579707622528, 'learning_rate': 5e-05, 'epoch': 1.0}


4. 100%|██████████| 16/16 [05:57<00:00, 22.28s/it][A
4. [A
4. 100%|██████████| 16/16 [05:57<00:00, 22.28s/it][A
4. [A
4. 100%|██████████| 16/16 [05:57<00:00, 22.28s/it][A
4. 100%|██████████| 16/16 [05:57<00:00, 22.34s/it]


4.  47%|████▋     | 14/30 [1:59:32<2:22:15, 533.48s/it]
4. {'train_runtime': 357.374, 'train_samples_per_second': 0.358, 'train_steps_per_second': 0.045, 'train_loss': 0.006254275445826352, 'epoch': 1.0}


7.  19%|█▉        | 3/16 [00:33<02:25, 11.19s/it][A
5.  56%|█████▋    | 9/16 [04:05<03:11, 27.32s/it][A
7.  25%|██▌       | 4/16 [00:44<02:14, 11.19s/it][A
7.  31%|███▏      | 5/16 [00:55<02:03, 11.21s/it][A
5.  62%|██████▎   | 10/16 [04:33<02:43, 27.32s/it][A
6.  62%|██████▎   | 10/16 [10:19<06:11, 61.89s/it][A
7.  38%|███▊      | 6/16 [01:07<01:52, 11.22s/it][A
7.  44%|████▍     | 7/16 [01:18<01:41, 11.22s/it][A


7. {'loss': 0.0161, 'grad_norm': 0.28719577193260193, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:29<01:29, 11.24s/it][A
7. [A
7.  50%|█████     | 8/16 [01:29<01:29, 11.24s/it][A
5.  69%|██████▉   | 11/16 [05:00<02:16, 27.31s/it][A
7.  56%|█████▋    | 9/16 [01:40<01:18, 11.24s/it][A
7.  62%|██████▎   | 10/16 [01:52<01:07, 11.25s/it][A
5.  75%|███████▌  | 12/16 [05:27<01:49, 27.32s/it][A
7.  69%|██████▉   | 11/16 [02:03<00:56, 11.26s/it][A
6.  69%|██████▉   | 11/16 [11:21<05:09, 61.89s/it][A
7.  75%|███████▌  | 12/16 [02:14<00:45, 11.26s/it][A
5.  81%|████████▏ | 13/16 [05:55<01:21, 27.32s/it][A
7.  81%|████████▏ | 13/16 [02:26<00:33, 11.27s/it][A
7.  88%|████████▊ | 14/16 [02:37<00:22, 11.28s/it][A
7.  94%|█████████▍| 15/16 [02:48<00:11, 11.28s/it][A


7.  53%|█████▎    | 16/30 [2:01:50<1:41:57, 436.95s/it]
7. {'loss': 0.0047, 'grad_norm': 0.07511958479881287, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  53%|█████▎    | 16/30 [2:03:20<1:41:57, 436.95s/it]
7. {'train_runtime': 179.9703, 'train_samples_per_second': 0.711, 'train_steps_per_second': 0.089, 'train_loss': 0.010388202033936977, 'epoch': 1.0}


7. 100%|██████████| 16/16 [02:59<00:00, 11.28s/it][A
7. [A
7. 100%|██████████| 16/16 [02:59<00:00, 11.28s/it][A
7. [A
7. 100%|██████████| 16/16 [02:59<00:00, 11.28s/it][A
7. 100%|██████████| 16/16 [02:59<00:00, 11.25s/it]
5.  88%|████████▊ | 14/16 [06:22<00:54, 27.31s/it][A
6.  75%|███████▌  | 12/16 [12:23<04:07, 61.89s/it][A
5.  94%|█████████▍| 15/16 [06:49<00:27, 27.32s/it][A


5.  47%|████▋     | 14/30 [1:49:11<1:54:41, 430.08s/it]
5. {'loss': 0.0004, 'grad_norm': 0.14605487883090973, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  47%|████▋     | 14/30 [1:52:49<1:54:41, 430.08s/it]
5. {'train_runtime': 437.0188, 'train_samples_per_second': 0.293, 'train_steps_per_second': 0.037, 'train_loss': 0.0016805720515549183, 'epoch': 1.0}


5. 100%|██████████| 16/16 [07:17<00:00, 27.31s/it][A
5. [A
5. 100%|██████████| 16/16 [07:17<00:00, 27.31s/it][A
5. [A
5. 100%|██████████| 16/16 [07:17<00:00, 27.31s/it][A
5. 100%|██████████| 16/16 [07:17<00:00, 27.31s/it]


4.  47%|████▋     | 14/30 [1:59:32<2:22:15, 533.48s/it]*** -> Training took 357.374 seconds.
4.  50%|█████     | 15/30 [2:03:06<2:16:30, 546.06s/it]retraining model for key '71e489b6' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 903.14 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 888.13 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


6.  81%|████████▏ | 13/16 [13:25<03:05, 61.88s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
4.   6%|▋         | 1/16 [00:18<04:36, 18.40s/it][A


7.  53%|█████▎    | 16/30 [2:03:20<1:41:57, 436.95s/it]*** -> Training took 179.9703 seconds.
7.  57%|█████▋    | 17/30 [2:05:31<1:26:37, 399.85s/it]retraining model for key '78332cb0' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 2361.62 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  12%|█▎        | 2/16 [00:36<04:16, 18.35s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  88%|████████▊ | 14/16 [14:27<02:03, 61.89s/it][A
7.   6%|▋         | 1/16 [00:07<01:47,  7.16s/it][A
4.  19%|█▉        | 3/16 [00:55<03:58, 18.35s/it][A
7.  12%|█▎        | 2/16 [00:14<01:39,  7.13s/it][A
7.  19%|█▉        | 3/16 [00:21<01:32,  7.14s/it][A
7.  25%|██▌       | 4/16 [00:28<01:25,  7.15s/it][A
4.  25%|██▌       | 4/16 [01:13<03:39, 18.33s/it][A
7.  31%|███▏      | 5/16 [00:35<01:18,  7.15s/it][A
7.  38%|███▊      | 6/16 [00:42<01:11,  7.15s/it][A
4.  31%|███▏      | 5/16 [01:31<03:21, 18.34s/it][A
7.  44%|████▍     | 7/16 [00:50<01:04,  7.15s/it][A


7. {'loss': 0.0067, 'grad_norm': 0.323025107383728, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [00:57<00:57,  7.16s/it][A
7. [A
7.  50%|█████     | 8/16 [00:57<00:57,  7.16s/it][A
7.  56%|█████▋    | 9/16 [01:04<00:50,  7.16s/it][A
6.  94%|█████████▍| 15/16 [15:28<01:01, 61.88s/it][A


6.  43%|████▎     | 13/30 [1:52:30<2:38:29, 559.37s/it]
6. {'loss': 0.0013, 'grad_norm': 0.046533357352018356, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  43%|████▎     | 13/30 [2:00:45<2:38:29, 559.37s/it]


6. 100%|██████████| 16/16 [16:30<00:00, 61.89s/it][A
6. [A
6. 100%|██████████| 16/16 [16:30<00:00, 61.89s/it][A
6. [A


6. {'train_runtime': 990.8455, 'train_samples_per_second': 0.129, 'train_steps_per_second': 0.016, 'train_loss': 0.00324725522659719, 'epoch': 1.0}


6. 100%|██████████| 16/16 [16:30<00:00, 61.89s/it][A
6. 100%|██████████| 16/16 [16:30<00:00, 61.93s/it]
4.  38%|███▊      | 6/16 [01:50<03:03, 18.34s/it][A
7.  62%|██████▎   | 10/16 [01:11<00:42,  7.16s/it][A
7.  69%|██████▉   | 11/16 [01:18<00:35,  7.16s/it][A
7.  75%|███████▌  | 12/16 [01:25<00:28,  7.17s/it][A
4.  44%|████▍     | 7/16 [02:08<02:45, 18.34s/it][A


4. {'loss': 0.007, 'grad_norm': 0.3824373185634613, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:26<02:26, 18.34s/it][A
4. [A
7.  81%|████████▏ | 13/16 [01:33<00:21,  7.17s/it][A
7.  88%|████████▊ | 14/16 [01:40<00:14,  7.18s/it][A
4.  50%|█████     | 8/16 [02:26<02:26, 18.34s/it][A
7.  94%|█████████▍| 15/16 [01:47<00:07,  7.19s/it][A


7.  57%|█████▋    | 17/30 [2:06:31<1:26:37, 399.85s/it]
7. {'loss': 0.0021, 'grad_norm': 0.07523422688245773, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  57%|█████▋    | 17/30 [2:07:28<1:26:37, 399.85s/it]
7. {'train_runtime': 114.7096, 'train_samples_per_second': 1.116, 'train_steps_per_second': 0.139, 'train_loss': 0.004440664779394865, 'epoch': 1.0}


7. 100%|██████████| 16/16 [01:54<00:00,  7.20s/it][A
7. [A
7. 100%|██████████| 16/16 [01:54<00:00,  7.20s/it][A
7. [A
7. 100%|██████████| 16/16 [01:54<00:00,  7.20s/it][A
7. 100%|██████████| 16/16 [01:54<00:00,  7.17s/it]
4.  56%|█████▋    | 9/16 [02:45<02:08, 18.35s/it][A
4.  62%|██████▎   | 10/16 [03:03<01:50, 18.36s/it][A
4.  69%|██████▉   | 11/16 [03:21<01:31, 18.35s/it][A
4.  75%|███████▌  | 12/16 [03:40<01:13, 18.35s/it][A


7.  57%|█████▋    | 17/30 [2:07:28<1:26:37, 399.85s/it]*** -> Training took 114.7096 seconds.
7.  60%|██████    | 18/30 [2:08:50<1:07:56, 339.71s/it]retraining model for key '7b3084d4' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1200.14 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1176.53 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  81%|████████▏ | 13/16 [03:58<00:55, 18.36s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  88%|████████▊ | 14/16 [04:16<00:36, 18.36s/it][A
7.   6%|▋         | 1/16 [00:14<03:41, 14.79s/it][A
4.  94%|█████████▍| 15/16 [04:35<00:18, 18.36s/it][A


4.  50%|█████     | 15/30 [2:05:37<2:16:30, 546.06s/it]


4. 100%|██████████| 16/16 [04:53<00:00, 18.36s/it][A
4. [A
4. 100%|██████████| 16/16 [04:53<00:00, 18.36s/it][A
4. [A
4. 100%|██████████| 16/16 [04:53<00:00, 18.36s/it][A
4. 100%|██████████| 16/16 [04:53<00:00, 18.35s/it]


4. {'loss': 0.003, 'grad_norm': 0.04418440908193588, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  50%|█████     | 15/30 [2:08:04<2:16:30, 546.06s/it]
4. {'train_runtime': 293.6519, 'train_samples_per_second': 0.436, 'train_steps_per_second': 0.054, 'train_loss': 0.004976104479283094, 'epoch': 1.0}


7.  12%|█▎        | 2/16 [00:29<03:26, 14.78s/it][A
7.  19%|█▉        | 3/16 [00:44<03:12, 14.80s/it][A


5.  47%|████▋     | 14/30 [1:52:49<1:54:41, 430.08s/it]*** -> Training took 437.0188 seconds.
5.  50%|█████     | 15/30 [1:58:58<2:16:07, 544.50s/it]retraining model for key '898e7135' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 954.82 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 938.20 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


7.  25%|██▌       | 4/16 [00:59<02:57, 14.83s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  31%|███▏      | 5/16 [01:14<02:43, 14.85s/it][A
7.  38%|███▊      | 6/16 [01:29<02:28, 14.87s/it][A
5.   6%|▋         | 1/16 [00:17<04:27, 17.86s/it][A
7.  44%|████▍     | 7/16 [01:43<02:13, 14.89s/it][A


7. {'loss': 0.0281, 'grad_norm': 3.3231728076934814, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:58<01:59, 14.90s/it][A
7. [A
5.  12%|█▎        | 2/16 [00:36<04:12, 18.03s/it][A
7.  50%|█████     | 8/16 [01:58<01:59, 14.90s/it][A
5.  19%|█▉        | 3/16 [00:53<03:53, 17.98s/it][A
7.  56%|█████▋    | 9/16 [02:13<01:44, 14.90s/it][A
5.  25%|██▌       | 4/16 [01:11<03:35, 17.94s/it][A
7.  62%|██████▎   | 10/16 [02:28<01:29, 14.92s/it][A
5.  31%|███▏      | 5/16 [01:29<03:16, 17.91s/it][A
7.  69%|██████▉   | 11/16 [02:43<01:14, 14.93s/it][A
7.  75%|███████▌  | 12/16 [02:58<00:59, 14.95s/it][A
5.  38%|███▊      | 6/16 [01:47<02:58, 17.89s/it][A
7.  81%|████████▏ | 13/16 [03:13<00:44, 14.96s/it][A
5.  44%|████▍     | 7/16 [02:05<02:41, 17.90s/it][A


5. {'loss': 0.0147, 'grad_norm': 0.4408934712409973, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:23<02:23, 17.90s/it][A
5. [A
7.  88%|████████▊ | 14/16 [03:28<00:29, 14.96s/it][A


4.  50%|█████     | 15/30 [2:08:04<2:16:30, 546.06s/it]*** -> Training took 293.6519 seconds.


5.  50%|█████     | 8/16 [02:23<02:23, 17.90s/it][A


4.  53%|█████▎    | 16/30 [2:11:14<2:03:18, 528.48s/it]retraining model for key '7b80bb43' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 675.54 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 666.25 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


7.  94%|█████████▍| 15/16 [03:43<00:14, 14.97s/it][A


7.  60%|██████    | 18/30 [2:10:52<1:07:56, 339.71s/it]
7. {'loss': 0.0047, 'grad_norm': 4.65121603012085, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  60%|██████    | 18/30 [2:12:52<1:07:56, 339.71s/it]
7. {'train_runtime': 238.5718, 'train_samples_per_second': 0.537, 'train_steps_per_second': 0.067, 'train_loss': 0.01638570101931691, 'epoch': 1.0}


7. 100%|██████████| 16/16 [03:58<00:00, 14.95s/it][A
7. [A
7. 100%|██████████| 16/16 [03:58<00:00, 14.95s/it][A
7. [A
7. 100%|██████████| 16/16 [03:58<00:00, 14.95s/it][A
7. 100%|██████████| 16/16 [03:58<00:00, 14.91s/it]
5.  56%|█████▋    | 9/16 [02:41<02:05, 17.90s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  62%|██████▎   | 10/16 [02:59<01:47, 17.89s/it][A


7.  60%|██████    | 18/30 [2:12:52<1:07:56, 339.71s/it]*** -> Training took 238.5718 seconds.
7.  63%|██████▎   | 19/30 [2:13:24<58:38, 319.83s/it]  retraining model for key '7b5033c1' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 2500.41 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


7.   0%|          | 0/16 [00:00<?, ?it/s][A
4.   6%|▋         | 1/16 [00:24<06:11, 24.77s/it][A
5.  69%|██████▉   | 11/16 [03:17<01:29, 17.89s/it][A
7.   6%|▋         | 1/16 [00:07<01:45,  7.06s/it][A
7.  12%|█▎        | 2/16 [00:14<01:38,  7.02s/it][A
7.  19%|█▉        | 3/16 [00:21<01:30,  7.00s/it][A
5.  75%|███████▌  | 12/16 [03:34<01:11, 17.90s/it][A
7.  25%|██▌       | 4/16 [00:28<01:23,  7.00s/it][A
4.  12%|█▎        | 2/16 [00:49<05:47, 24.80s/it][A
7.  31%|███▏      | 5/16 [00:35<01:16,  7.00s/it][A
5.  81%|████████▏ | 13/16 [03:52<00:53, 17.91s/it][A
7.  38%|███▊      | 6/16 [00:42<01:09,  7.00s/it][A
7.  44%|████▍     | 7/16 [00:49<01:02,  7.00s/it][A


7. {'loss': 0.1034, 'grad_norm': 2.891230583190918, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [00:56<00:55,  7.00s/it][A
7. [A
4.  19%|█▉        | 3/16 [01:14<05:22, 24.79s/it][A
7.  50%|█████     | 8/16 [00:56<00:55,  7.00s/it][A
5.  88%|████████▊ | 14/16 [04:10<00:35, 17.90s/it][A
7.  56%|█████▋    | 9/16 [01:03<00:48,  7.00s/it][A
7.  62%|██████▎   | 10/16 [01:10<00:41,  7.00s/it][A
5.  94%|█████████▍| 15/16 [04:28<00:17, 17.90s/it][A


5.  50%|█████     | 15/30 [2:01:25<2:16:07, 544.50s/it]
5. {'loss': 0.003, 'grad_norm': 0.46902909874916077, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  50%|█████     | 15/30 [2:03:48<2:16:07, 544.50s/it]
5. {'train_runtime': 286.5561, 'train_samples_per_second': 0.447, 'train_steps_per_second': 0.056, 'train_loss': 0.008831107523292303, 'epoch': 1.0}


5. 100%|██████████| 16/16 [04:46<00:00, 17.90s/it][A
5. [A
5. 100%|██████████| 16/16 [04:46<00:00, 17.90s/it][A
5. [A
5. 100%|██████████| 16/16 [04:46<00:00, 17.90s/it][A
5. 100%|██████████| 16/16 [04:46<00:00, 17.91s/it]
7.  69%|██████▉   | 11/16 [01:17<00:34,  7.00s/it][A
4.  25%|██▌       | 4/16 [01:39<04:57, 24.80s/it][A


6.  43%|████▎     | 13/30 [2:00:45<2:38:29, 559.37s/it]*** -> Training took 990.8455 seconds.
6.  47%|████▋     | 14/30 [2:08:54<3:43:43, 838.99s/it]retraining model for key 'a251c730' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 721.16 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 711.11 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160
7.  75%|███████▌  | 12/16 [01:24<00:27,  7.00s/it][A


6. *** Start training run...


7.  81%|████████▏ | 13/16 [01:31<00:20,  7.00s/it][A
7.  88%|████████▊ | 14/16 [01:38<00:13,  7.00s/it][A
4.  31%|███▏      | 5/16 [02:04<04:33, 24.83s/it][A
7.  94%|█████████▍| 15/16 [01:44<00:06,  7.00s/it][A


7.  63%|██████▎   | 19/30 [2:14:23<58:38, 319.83s/it]
7. {'loss': 0.0149, 'grad_norm': 2.5220959186553955, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  63%|██████▎   | 19/30 [2:15:19<58:38, 319.83s/it]
7. {'train_runtime': 112.0054, 'train_samples_per_second': 1.143, 'train_steps_per_second': 0.143, 'train_loss': 0.059156931936740875, 'epoch': 1.0}


7. 100%|██████████| 16/16 [01:51<00:00,  7.00s/it][A
7. [A
7. 100%|██████████| 16/16 [01:51<00:00,  7.00s/it][A
7. [A
7. 100%|██████████| 16/16 [01:51<00:00,  7.00s/it][A
7. 100%|██████████| 16/16 [01:51<00:00,  7.00s/it]
6.   0%|          | 0/16 [00:00<?, ?it/s][A


7.  63%|██████▎   | 19/30 [2:15:19<58:38, 319.83s/it]*** -> Training took 112.0054 seconds.
7.  67%|██████▋   | 20/30 [2:15:33<43:45, 262.59s/it]retraining model for key '80a900e0' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 848.54 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 836.22 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  38%|███▊      | 6/16 [02:28<04:08, 24.86s/it][A
6.   6%|▋         | 1/16 [00:23<05:54, 23.64s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A


5.  50%|█████     | 15/30 [2:03:48<2:16:07, 544.50s/it]*** -> Training took 286.5561 seconds.
5.  53%|█████▎    | 16/30 [2:05:01<1:54:20, 490.03s/it]retraining model for key '8b7bacbf' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 720.18 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 710.63 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  44%|████▍     | 7/16 [02:54<03:44, 24.92s/it][A


4. {'loss': 0.0014, 'grad_norm': 0.018607618287205696, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [03:19<03:19, 24.96s/it][A
4. [A
6.  12%|█▎        | 2/16 [00:47<05:30, 23.64s/it][A
7.   6%|▋         | 1/16 [00:21<05:26, 21.77s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  50%|█████     | 8/16 [03:19<03:19, 24.96s/it][A
6.  19%|█▉        | 3/16 [01:10<05:06, 23.60s/it][A
7.  12%|█▎        | 2/16 [00:43<05:04, 21.74s/it][A
5.   6%|▋         | 1/16 [00:24<06:08, 24.54s/it][A
6.  25%|██▌       | 4/16 [01:34<04:42, 23.56s/it][A
4.  56%|█████▋    | 9/16 [03:44<02:54, 25.00s/it][A
7.  19%|█▉        | 3/16 [01:05<04:42, 21.75s/it][A
5.  12%|█▎        | 2/16 [00:49<05:43, 24.51s/it][A
6.  31%|███▏      | 5/16 [01:57<04:19, 23.56s/it][A
4.  62%|██████▎   | 10/16 [04:09<02:30, 25.02s/it][A
7.  25%|██▌       | 4/16 [01:27<04:21, 21.77s/it][A
6.  38%|███▊      | 6/16 [02:21<03:55, 23.57s/it][A
5.  19%|█▉        | 3/16 [01:13<05:18, 24.49s/it][A
7.  31%|███▏      | 5/16 [01:48<03:59, 21.79s/it][A
4.  69%|██████▉   | 11/16 [04:34<02:05, 25.03s

6. {'loss': 0.0174, 'grad_norm': 0.8394134044647217, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [03:08<03:08, 23.58s/it][A
6. [A
7.  38%|███▊      | 6/16 [02:10<03:38, 21.80s/it][A
5.  25%|██▌       | 4/16 [01:37<04:53, 24.49s/it][A
4.  75%|███████▌  | 12/16 [04:59<01:40, 25.04s/it][A
7.  44%|████▍     | 7/16 [02:32<03:16, 21.81s/it][A


7. {'loss': 0.0068, 'grad_norm': 0.09816405177116394, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:54<02:54, 21.81s/it][A
7. [A
6.  50%|█████     | 8/16 [03:08<03:08, 23.58s/it][A
5.  31%|███▏      | 5/16 [02:02<04:29, 24.50s/it][A
4.  81%|████████▏ | 13/16 [05:24<01:15, 25.04s/it][A
7.  50%|█████     | 8/16 [02:54<02:54, 21.81s/it][A
6.  56%|█████▋    | 9/16 [03:32<02:44, 23.56s/it][A
5.  38%|███▊      | 6/16 [02:26<04:04, 24.49s/it][A
4.  88%|████████▊ | 14/16 [05:49<00:50, 25.04s/it][A
7.  56%|█████▋    | 9/16 [03:16<02:32, 21.82s/it][A
6.  62%|██████▎   | 10/16 [03:55<02:21, 23.55s/it][A
5.  44%|████▍     | 7/16 [02:51<03:40, 24.49s/it][A


5. {'loss': 0.0012, 'grad_norm': 0.37267789244651794, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [03:16<03:16, 24.51s/it][A
5. [A
4.  94%|█████████▍| 15/16 [06:14<00:25, 25.04s/it][A


4.  53%|█████▎    | 16/30 [2:14:37<2:03:18, 528.48s/it]
4. {'loss': 0.0, 'grad_norm': 0.00011745141819119453, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  53%|█████▎    | 16/30 [2:17:57<2:03:18, 528.48s/it]
4. {'train_runtime': 399.4796, 'train_samples_per_second': 0.32, 'train_steps_per_second': 0.04, 'train_loss': 0.0006949786638870137, 'epoch': 1.0}


4. 100%|██████████| 16/16 [06:39<00:00, 25.04s/it][A
4. [A
4. 100%|██████████| 16/16 [06:39<00:00, 25.04s/it][A
4. [A
4. 100%|██████████| 16/16 [06:39<00:00, 25.04s/it][A
4. 100%|██████████| 16/16 [06:39<00:00, 24.97s/it]
7.  62%|██████▎   | 10/16 [03:38<02:10, 21.82s/it][A
6.  69%|██████▉   | 11/16 [04:19<01:57, 23.55s/it][A
5.  50%|█████     | 8/16 [03:16<03:16, 24.51s/it][A
7.  69%|██████▉   | 11/16 [03:59<01:49, 21.82s/it][A
6.  75%|███████▌  | 12/16 [04:42<01:34, 23.56s/it][A
5.  56%|█████▋    | 9/16 [03:40<02:51, 24.52s/it][A
7.  75%|███████▌  | 12/16 [04:21<01:27, 21.83s/it][A
6.  81%|████████▏ | 13/16 [05:06<01:10, 23.56s/it][A
5.  62%|██████▎   | 10/16 [04:05<02:27, 24.51s/it][A
7.  81%|████████▏ | 13/16 [04:43<01:05, 21.83s/it][A
6.  88%|████████▊ | 14/16 [05:29<00:47, 23.56s/it][A
5.  69%|██████▉   | 11/16 [04:29<02:02, 24.51s/it][A
7.  88%|████████▊ | 14/16 [05:05<00:43, 21.82s/it][A
6.  94%|█████████▍| 15/16 [05:53<00:23, 23.56s/it][A


6.  47%|████▋     | 14/30 [2:12:07<3:43:43, 838.99s/it]


6. 100%|██████████| 16/16 [06:17<00:00, 23.55s/it][A
6. [A
6. 100%|██████████| 16/16 [06:17<00:00, 23.55s/it][A
6. [A


6. {'loss': 0.0032, 'grad_norm': 0.4546780586242676, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  47%|████▋     | 14/30 [2:15:15<3:43:43, 838.99s/it]
6. {'train_runtime': 377.0307, 'train_samples_per_second': 0.339, 'train_steps_per_second': 0.042, 'train_loss': 0.010329776909202337, 'epoch': 1.0}


6. 100%|██████████| 16/16 [06:17<00:00, 23.55s/it][A
6. 100%|██████████| 16/16 [06:17<00:00, 23.56s/it]
5.  75%|███████▌  | 12/16 [04:54<01:38, 24.50s/it][A
7.  94%|█████████▍| 15/16 [05:27<00:21, 21.82s/it][A


7.  67%|██████▋   | 20/30 [2:18:31<43:45, 262.59s/it]
7. {'loss': 0.001, 'grad_norm': 0.18198223412036896, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  67%|██████▋   | 20/30 [2:21:25<43:45, 262.59s/it]
7. {'train_runtime': 348.9738, 'train_samples_per_second': 0.367, 'train_steps_per_second': 0.046, 'train_loss': 0.0039198032463900745, 'epoch': 1.0}


7. 100%|██████████| 16/16 [05:48<00:00, 21.82s/it][A
7. [A
7. 100%|██████████| 16/16 [05:48<00:00, 21.82s/it][A
7. [A
7. 100%|██████████| 16/16 [05:48<00:00, 21.82s/it][A
7. 100%|██████████| 16/16 [05:48<00:00, 21.81s/it]
5.  81%|████████▏ | 13/16 [05:18<01:13, 24.49s/it][A


6.  47%|████▋     | 14/30 [2:15:15<3:43:43, 838.99s/it]*** -> Training took 377.0307 seconds.
6.  50%|█████     | 15/30 [2:15:57<2:58:21, 713.43s/it]retraining model for key 'a32d8b75' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 482.46 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 476.35 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


5.  88%|████████▊ | 14/16 [05:43<00:49, 24.50s/it][A


7.  67%|██████▋   | 20/30 [2:21:25<43:45, 262.59s/it]*** -> Training took 348.9738 seconds.
7.  70%|███████   | 21/30 [2:22:24<46:03, 307.02s/it]retraining model for key '8e5c0c38' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 866.18 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 851.61 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


6.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  94%|█████████▍| 15/16 [06:07<00:24, 24.50s/it][A


5.  53%|█████▎    | 16/30 [2:08:21<1:54:20, 490.03s/it]
5. {'loss': 0.0008, 'grad_norm': 0.23495741188526154, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  53%|█████▎    | 16/30 [2:11:37<1:54:20, 490.03s/it]
5. {'train_runtime': 391.9914, 'train_samples_per_second': 0.327, 'train_steps_per_second': 0.041, 'train_loss': 0.0009631152788642794, 'epoch': 1.0}


5. 100%|██████████| 16/16 [06:31<00:00, 24.49s/it][A
5. [A
5. 100%|██████████| 16/16 [06:31<00:00, 24.49s/it][A
5. [A
5. 100%|██████████| 16/16 [06:31<00:00, 24.49s/it][A
5. 100%|██████████| 16/16 [06:31<00:00, 24.50s/it]
7.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   6%|▋         | 1/16 [00:21<05:18, 21.27s/it][A
6.   6%|▋         | 1/16 [00:36<09:13, 36.89s/it][A


4.  53%|█████▎    | 16/30 [2:17:57<2:03:18, 528.48s/it]*** -> Training took 399.4796 seconds.


7.  12%|█▎        | 2/16 [00:42<04:56, 21.19s/it][A


4.  57%|█████▋    | 17/30 [2:22:00<2:02:09, 563.82s/it]retraining model for key '7c66cb00' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 336.72 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 333.31 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


6.  12%|█▎        | 2/16 [01:13<08:35, 36.79s/it][A
7.  19%|█▉        | 3/16 [01:03<04:34, 21.15s/it][A
7.  25%|██▌       | 4/16 [01:24<04:13, 21.13s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  19%|█▉        | 3/16 [01:50<07:56, 36.69s/it][A
7.  31%|███▏      | 5/16 [01:45<03:52, 21.16s/it][A
7.  38%|███▊      | 6/16 [02:07<03:31, 21.18s/it][A
6.  25%|██▌       | 4/16 [02:26<07:20, 36.71s/it][A
4.   6%|▋         | 1/16 [00:48<12:03, 48.20s/it][A
7.  44%|████▍     | 7/16 [02:28<03:10, 21.18s/it][A


7. {'loss': 0.0033, 'grad_norm': 0.08343151956796646, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:49<02:49, 21.17s/it][A
7. [A
7.  50%|█████     | 8/16 [02:49<02:49, 21.17s/it][A
6.  31%|███▏      | 5/16 [03:03<06:43, 36.72s/it][A
7.  56%|█████▋    | 9/16 [03:10<02:28, 21.17s/it][A
4.  12%|█▎        | 2/16 [01:36<11:13, 48.11s/it][A
6.  38%|███▊      | 6/16 [03:40<06:07, 36.70s/it][A
7.  62%|██████▎   | 10/16 [03:31<02:07, 21.17s/it][A
7.  69%|██████▉   | 11/16 [03:52<01:45, 21.18s/it][A
4.  19%|█▉        | 3/16 [02:24<10:25, 48.08s/it][A
6.  44%|████▍     | 7/16 [04:17<05:30, 36.72s/it][A


6. {'loss': 0.0072, 'grad_norm': 0.6384323239326477, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [04:53<04:52, 36.60s/it][A
6. [A
7.  75%|███████▌  | 12/16 [04:14<01:24, 21.19s/it][A
7.  81%|████████▏ | 13/16 [04:35<01:03, 21.18s/it][A
6.  50%|█████     | 8/16 [04:53<04:52, 36.60s/it][A
4.  25%|██▌       | 4/16 [03:12<09:37, 48.11s/it][A
7.  88%|████████▊ | 14/16 [04:56<00:42, 21.18s/it][A
7.  94%|█████████▍| 15/16 [05:17<00:21, 21.17s/it][A


7.  70%|███████   | 21/30 [2:25:16<46:03, 307.02s/it]
7. {'loss': 0.0, 'grad_norm': 0.00422581285238266, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  70%|███████   | 21/30 [2:28:06<46:03, 307.02s/it]
7. {'train_runtime': 338.7667, 'train_samples_per_second': 0.378, 'train_steps_per_second': 0.047, 'train_loss': 0.00168077244052256, 'epoch': 1.0}


7. 100%|██████████| 16/16 [05:38<00:00, 21.16s/it][A
7. [A
7. 100%|██████████| 16/16 [05:38<00:00, 21.16s/it][A
7. [A
7. 100%|██████████| 16/16 [05:38<00:00, 21.16s/it][A
7. 100%|██████████| 16/16 [05:38<00:00, 21.17s/it]
6.  56%|█████▋    | 9/16 [05:30<04:16, 36.63s/it][A
4.  31%|███▏      | 5/16 [04:00<08:49, 48.16s/it][A
6.  62%|██████▎   | 10/16 [06:06<03:39, 36.64s/it][A
4.  38%|███▊      | 6/16 [04:48<08:01, 48.18s/it][A
6.  69%|██████▉   | 11/16 [06:43<03:03, 36.67s/it][A
6.  75%|███████▌  | 12/16 [07:20<02:26, 36.69s/it][A
4.  44%|████▍     | 7/16 [05:37<07:13, 48.17s/it][A


4. {'loss': 0.005, 'grad_norm': 0.1763349026441574, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [06:25<06:25, 48.19s/it][A
4. [A


5.  53%|█████▎    | 16/30 [2:11:37<1:54:20, 490.03s/it]*** -> Training took 391.9914 seconds.
5.  57%|█████▋    | 17/30 [2:19:03<2:09:05, 595.79s/it]retraining model for key '8f3a5a89' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 783.71 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 771.58 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


5.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  81%|████████▏ | 13/16 [07:56<01:50, 36.68s/it][A
5.   6%|▋         | 1/16 [00:17<04:19, 17.28s/it][A
4.  50%|█████     | 8/16 [06:25<06:25, 48.19s/it][A
5.  12%|█▎        | 2/16 [00:34<04:02, 17.34s/it][A
6.  88%|████████▊ | 14/16 [08:33<01:13, 36.69s/it][A
5.  19%|█▉        | 3/16 [00:52<03:45, 17.38s/it][A
5.  25%|██▌       | 4/16 [01:09<03:28, 17.39s/it][A
4.  56%|█████▋    | 9/16 [07:13<05:37, 48.21s/it][A
6.  94%|█████████▍| 15/16 [09:10<00:36, 36.70s/it][A


6.  50%|█████     | 15/30 [2:20:54<2:58:21, 713.43s/it]


6. 100%|██████████| 16/16 [09:46<00:00, 36.66s/it][A
6. [A


6. {'loss': 0.0017, 'grad_norm': 0.2199534773826599, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  50%|█████     | 15/30 [2:25:48<2:58:21, 713.43s/it]
6. {'train_runtime': 586.9133, 'train_samples_per_second': 0.218, 'train_steps_per_second': 0.027, 'train_loss': 0.004479747556615621, 'epoch': 1.0}


6. 100%|██████████| 16/16 [09:46<00:00, 36.66s/it][A
6. [A
6. 100%|██████████| 16/16 [09:46<00:00, 36.66s/it][A
6. 100%|██████████| 16/16 [09:46<00:00, 36.68s/it]
5.  31%|███▏      | 5/16 [01:26<03:11, 17.38s/it][A
5.  38%|███▊      | 6/16 [01:44<02:53, 17.38s/it][A


7.  70%|███████   | 21/30 [2:28:06<46:03, 307.02s/it]*** -> Training took 338.7667 seconds.
7.  73%|███████▎  | 22/30 [2:32:16<52:20, 392.54s/it]retraining model for key '9385bd28' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1190.27 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1167.70 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  62%|██████▎   | 10/16 [08:01<04:49, 48.24s/it][A
5.  44%|████▍     | 7/16 [02:01<02:36, 17.37s/it][A


5. {'loss': 0.0058, 'grad_norm': 0.05217922106385231, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:18<02:19, 17.38s/it][A
5. [A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  50%|█████     | 8/16 [02:18<02:19, 17.38s/it][A
7.   6%|▋         | 1/16 [00:15<03:47, 15.18s/it][A
5.  56%|█████▋    | 9/16 [02:36<02:01, 17.39s/it][A
7.  12%|█▎        | 2/16 [00:30<03:32, 15.17s/it][A
4.  69%|██████▉   | 11/16 [08:50<04:01, 48.22s/it][A
5.  62%|██████▎   | 10/16 [02:53<01:44, 17.39s/it][A
7.  19%|█▉        | 3/16 [00:45<03:17, 15.18s/it][A
7.  25%|██▌       | 4/16 [01:00<03:02, 15.20s/it][A
5.  69%|██████▉   | 11/16 [03:11<01:26, 17.38s/it][A
7.  31%|███▏      | 5/16 [01:15<02:47, 15.21s/it][A
5.  75%|███████▌  | 12/16 [03:28<01:09, 17.37s/it][A
4.  75%|███████▌  | 12/16 [09:38<03:12, 48.22s/it][A
7.  38%|███▊      | 6/16 [01:31<02:32, 15.22s/it][A
5.  81%|████████▏ | 13/16 [03:45<00:52, 17.37s/it][A
7.  44%|████▍     | 7/16 [01:46<02:17, 15.23s/it][A


7. {'loss': 0.0027, 'grad_norm': 0.2349514663219452, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:01<02:01, 15.23s/it][A
7. [A
5.  88%|████████▊ | 14/16 [04:03<00:34, 17.37s/it][A
7.  50%|█████     | 8/16 [02:01<02:01, 15.23s/it][A
5.  94%|█████████▍| 15/16 [04:20<00:17, 17.36s/it][A


5.  57%|█████▋    | 17/30 [2:21:25<2:09:05, 595.79s/it]
5. {'loss': 0.0029, 'grad_norm': 0.035656873136758804, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  57%|█████▋    | 17/30 [2:23:44<2:09:05, 595.79s/it]
5. {'train_runtime': 277.8992, 'train_samples_per_second': 0.461, 'train_steps_per_second': 0.058, 'train_loss': 0.0043258717050775886, 'epoch': 1.0}


5. 100%|██████████| 16/16 [04:37<00:00, 17.35s/it][A
5. [A
5. 100%|██████████| 16/16 [04:37<00:00, 17.35s/it][A
5. [A
5. 100%|██████████| 16/16 [04:37<00:00, 17.35s/it][A
5. 100%|██████████| 16/16 [04:37<00:00, 17.37s/it]
4.  81%|████████▏ | 13/16 [10:26<02:24, 48.21s/it][A
7.  56%|█████▋    | 9/16 [02:16<01:46, 15.24s/it][A
7.  62%|██████▎   | 10/16 [02:32<01:31, 15.25s/it][A
7.  69%|██████▉   | 11/16 [02:47<01:16, 15.26s/it][A
4.  88%|████████▊ | 14/16 [11:14<01:36, 48.19s/it][A
7.  75%|███████▌  | 12/16 [03:02<01:01, 15.27s/it][A
7.  81%|████████▏ | 13/16 [03:18<00:45, 15.28s/it][A


5.  57%|█████▋    | 17/30 [2:23:44<2:09:05, 595.79s/it]*** -> Training took 277.8992 seconds.
5.  60%|██████    | 18/30 [2:25:01<1:44:50, 524.24s/it]retraining model for key '97d7923e' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1694.87 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


7.  88%|████████▊ | 14/16 [03:33<00:30, 15.28s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  94%|█████████▍| 15/16 [03:48<00:15, 15.28s/it][A


7.  73%|███████▎  | 22/30 [2:34:21<52:20, 392.54s/it]
7. {'loss': 0.0009, 'grad_norm': 0.31774258613586426, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  73%|███████▎  | 22/30 [2:36:23<52:20, 392.54s/it]
7. {'train_runtime': 243.9953, 'train_samples_per_second': 0.525, 'train_steps_per_second': 0.066, 'train_loss': 0.001798907178454101, 'epoch': 1.0}


7. 100%|██████████| 16/16 [04:03<00:00, 15.28s/it][A
7. [A
7. 100%|██████████| 16/16 [04:03<00:00, 15.28s/it][A
7. [A
7. 100%|██████████| 16/16 [04:03<00:00, 15.28s/it][A
7. 100%|██████████| 16/16 [04:03<00:00, 15.25s/it]
5.   6%|▋         | 1/16 [00:09<02:26,  9.77s/it][A
4.  94%|█████████▍| 15/16 [12:02<00:48, 48.22s/it][A
4. 100%|██████████| 16/16 [12:51<00:00, 48.26s/it][A
4. [A
4. 100%|██████████| 16/16 [12:51<00:00, 48.26s/it][A
4. [A


4.  57%|█████▋    | 17/30 [2:28:30<2:02:09, 563.82s/it]
4. {'loss': 0.0011, 'grad_norm': 0.2695651352405548, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  57%|█████▋    | 17/30 [2:34:56<2:02:09, 563.82s/it]
4. {'train_runtime': 771.2385, 'train_samples_per_second': 0.166, 'train_steps_per_second': 0.021, 'train_loss': 0.0030371291213668883, 'epoch': 1.0}


4. 100%|██████████| 16/16 [12:51<00:00, 48.26s/it][A
4. 100%|██████████| 16/16 [12:51<00:00, 48.20s/it]
5.  12%|█▎        | 2/16 [00:19<02:16,  9.75s/it][A
5.  19%|█▉        | 3/16 [00:29<02:06,  9.76s/it][A
5.  25%|██▌       | 4/16 [00:39<01:57,  9.78s/it][A
5.  31%|███▏      | 5/16 [00:48<01:47,  9.82s/it][A
5.  38%|███▊      | 6/16 [00:58<01:38,  9.85s/it][A
5.  44%|████▍     | 7/16 [01:08<01:29,  9.90s/it][A


5. {'loss': 0.0028, 'grad_norm': 0.23943601548671722, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:18<01:19,  9.91s/it][A
5. [A
5.  50%|█████     | 8/16 [01:18<01:19,  9.91s/it][A
5.  56%|█████▋    | 9/16 [01:28<01:09,  9.91s/it][A
5.  62%|██████▎   | 10/16 [01:38<00:59,  9.90s/it][A
5.  69%|██████▉   | 11/16 [01:48<00:49,  9.89s/it][A
5.  75%|███████▌  | 12/16 [01:58<00:39,  9.89s/it][A
5.  81%|████████▏ | 13/16 [02:08<00:29,  9.88s/it][A
5.  88%|████████▊ | 14/16 [02:18<00:19,  9.89s/it][A
5.  94%|█████████▍| 15/16 [02:28<00:09,  9.89s/it][A


5.  60%|██████    | 18/30 [2:26:23<1:44:50, 524.24s/it]
5. {'loss': 0.0006, 'grad_norm': 0.033546242862939835, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  60%|██████    | 18/30 [2:27:42<1:44:50, 524.24s/it]
5. {'train_runtime': 157.9504, 'train_samples_per_second': 0.81, 'train_steps_per_second': 0.101, 'train_loss': 0.001687599957222119, 'epoch': 1.0}


5. 100%|██████████| 16/16 [02:37<00:00,  9.90s/it][A
5. [A
5. 100%|██████████| 16/16 [02:37<00:00,  9.90s/it][A
5. [A
5. 100%|██████████| 16/16 [02:37<00:00,  9.90s/it][A
5. 100%|██████████| 16/16 [02:37<00:00,  9.87s/it]


7.  73%|███████▎  | 22/30 [2:36:23<52:20, 392.54s/it]*** -> Training took 243.9953 seconds.
7.  77%|███████▋  | 23/30 [2:38:50<45:51, 393.09s/it]retraining model for key 'a395ee82' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 594.23 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 585.79 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


7.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   6%|▋         | 1/16 [00:31<07:54, 31.60s/it][A


4.  57%|█████▋    | 17/30 [2:34:56<2:02:09, 563.82s/it]*** -> Training took 771.2385 seconds.
4.  60%|██████    | 18/30 [2:38:38<2:18:50, 694.23s/it]retraining model for key '7ed72f31' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1164.39 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1142.48 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...
5.  60%|██████    | 18/30 [2:27:42<1:44:50, 524.24s/it]*** -> Training took 157.9504 seconds.
5.  63%|██████▎   | 19/30 [2:29:17<1:21:20, 443.70s/it]retraining model for key 'a25697e4' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 574.78 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 567.58 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  12%|█▎        | 2/16 [01:03<07:23, 31.66s/it][A
4.   6%|▋         | 1/16 [00:14<03:35, 14.39s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  12%|█▎        | 2/16 [00:28<03:21, 14.39s/it][A
7.  19%|█▉        | 3/16 [01:35<06:52, 31.71s/it][A
4.  19%|█▉        | 3/16 [00:43<03:06, 14.38s/it][A


6.  50%|█████     | 15/30 [2:25:48<2:58:21, 713.43s/it]*** -> Training took 586.9133 seconds.
6.  53%|█████▎    | 16/30 [2:35:08<3:17:14, 845.33s/it]retraining model for key 'b10624e5' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 532.04 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 525.39 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  25%|██▌       | 4/16 [00:57<02:52, 14.36s/it][A
5.   6%|▋         | 1/16 [00:30<07:38, 30.54s/it][A
7.  25%|██▌       | 4/16 [02:06<06:21, 31.77s/it][A
4.  31%|███▏      | 5/16 [01:11<02:37, 14.34s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  38%|███▊      | 6/16 [01:26<02:23, 14.34s/it][A
5.  12%|█▎        | 2/16 [01:01<07:07, 30.51s/it][A
7.  31%|███▏      | 5/16 [02:38<05:50, 31.82s/it][A
4.  44%|████▍     | 7/16 [01:40<02:09, 14.34s/it][A


4. {'loss': 0.0061, 'grad_norm': 0.13453492522239685, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:54<01:54, 14.34s/it][A
4. [A
6.   6%|▋         | 1/16 [00:32<08:07, 32.48s/it][A
4.  50%|█████     | 8/16 [01:54<01:54, 14.34s/it][A
5.  19%|█▉        | 3/16 [01:31<06:37, 30.59s/it][A
4.  56%|█████▋    | 9/16 [02:09<01:40, 14.33s/it][A
7.  38%|███▊      | 6/16 [03:10<05:18, 31.89s/it][A
4.  62%|██████▎   | 10/16 [02:23<01:25, 14.32s/it][A
6.  12%|█▎        | 2/16 [01:04<07:34, 32.45s/it][A
5.  25%|██▌       | 4/16 [02:02<06:07, 30.64s/it][A
4.  69%|██████▉   | 11/16 [02:37<01:11, 14.32s/it][A
7.  44%|████▍     | 7/16 [03:42<04:47, 31.89s/it][A


7. {'loss': 0.0029, 'grad_norm': 0.24899323284626007, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [04:14<04:15, 31.88s/it][A
7. [A
4.  75%|███████▌  | 12/16 [02:51<00:57, 14.30s/it][A
6.  19%|█▉        | 3/16 [01:37<07:00, 32.34s/it][A
5.  31%|███▏      | 5/16 [02:33<05:37, 30.67s/it][A
4.  81%|████████▏ | 13/16 [03:06<00:42, 14.30s/it][A
7.  50%|█████     | 8/16 [04:14<04:15, 31.88s/it][A
4.  88%|████████▊ | 14/16 [03:20<00:28, 14.29s/it][A
6.  25%|██▌       | 4/16 [02:09<06:28, 32.34s/it][A
5.  38%|███▊      | 6/16 [03:03<05:07, 30.72s/it][A
4.  94%|█████████▍| 15/16 [03:34<00:14, 14.28s/it][A


4.  60%|██████    | 18/30 [2:40:36<2:18:50, 694.23s/it]
4. {'loss': 0.0007, 'grad_norm': 0.021837592124938965, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  60%|██████    | 18/30 [2:42:30<2:18:50, 694.23s/it]
4. {'train_runtime': 229.0642, 'train_samples_per_second': 0.559, 'train_steps_per_second': 0.07, 'train_loss': 0.003419779008254409, 'epoch': 1.0}


4. 100%|██████████| 16/16 [03:49<00:00, 14.27s/it][A
4. [A
4. 100%|██████████| 16/16 [03:49<00:00, 14.27s/it][A
4. [A
4. 100%|██████████| 16/16 [03:49<00:00, 14.27s/it][A
4. 100%|██████████| 16/16 [03:49<00:00, 14.32s/it]
7.  56%|█████▋    | 9/16 [04:46<03:43, 31.90s/it][A
6.  31%|███▏      | 5/16 [02:41<05:55, 32.35s/it][A
5.  44%|████▍     | 7/16 [03:34<04:36, 30.74s/it][A


5. {'loss': 0.004, 'grad_norm': 0.12931400537490845, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [04:05<04:05, 30.74s/it][A
5. [A
7.  62%|██████▎   | 10/16 [05:18<03:11, 31.91s/it][A
5.  50%|█████     | 8/16 [04:05<04:05, 30.74s/it][A
6.  38%|███▊      | 6/16 [03:14<05:23, 32.36s/it][A
7.  69%|██████▉   | 11/16 [05:50<02:39, 31.91s/it][A
5.  56%|█████▋    | 9/16 [04:36<03:35, 30.74s/it][A
6.  44%|████▍     | 7/16 [03:46<04:51, 32.34s/it][A


6. {'loss': 0.0045, 'grad_norm': 0.0268641896545887, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [04:18<04:18, 32.33s/it][A
6. [A
7.  75%|███████▌  | 12/16 [06:22<02:07, 31.91s/it][A
5.  62%|██████▎   | 10/16 [05:06<03:04, 30.74s/it][A
6.  50%|█████     | 8/16 [04:18<04:18, 32.33s/it][A
7.  81%|████████▏ | 13/16 [06:54<01:35, 31.90s/it][A
5.  69%|██████▉   | 11/16 [05:37<02:33, 30.74s/it][A
6.  56%|█████▋    | 9/16 [04:51<03:46, 32.32s/it][A
7.  88%|████████▊ | 14/16 [07:26<01:03, 31.88s/it][A
5.  75%|███████▌  | 12/16 [06:08<02:02, 30.73s/it][A
6.  62%|██████▎   | 10/16 [05:23<03:13, 32.33s/it][A
7.  94%|█████████▍| 15/16 [07:57<00:31, 31.89s/it][A


7.  77%|███████▋  | 23/30 [2:43:09<45:51, 393.09s/it]
7. {'loss': 0.0004, 'grad_norm': 0.015531918965280056, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  77%|███████▋  | 23/30 [2:47:24<45:51, 393.09s/it]
7. {'train_runtime': 509.8369, 'train_samples_per_second': 0.251, 'train_steps_per_second': 0.031, 'train_loss': 0.001630057638976723, 'epoch': 1.0}


7. 100%|██████████| 16/16 [08:29<00:00, 31.89s/it][A
7. [A
7. 100%|██████████| 16/16 [08:29<00:00, 31.89s/it][A
7. [A
7. 100%|██████████| 16/16 [08:29<00:00, 31.89s/it][A
7. 100%|██████████| 16/16 [08:29<00:00, 31.86s/it]
5.  81%|████████▏ | 13/16 [06:39<01:32, 30.74s/it][A
6.  69%|██████▉   | 11/16 [05:55<02:41, 32.32s/it][A


4.  60%|██████    | 18/30 [2:42:30<2:18:50, 694.23s/it]*** -> Training took 229.0642 seconds.
4.  63%|██████▎   | 19/30 [2:46:12<1:54:02, 622.02s/it]retraining model for key '88bcf3b4' (retrain_dataset_size=10)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1095.74 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1072.33 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  88%|████████▊ | 14/16 [07:09<01:01, 30.73s/it][A
6.  75%|███████▌  | 12/16 [06:28<02:09, 32.31s/it][A
4.   6%|▋         | 1/16 [00:14<03:44, 14.98s/it][A
4.  12%|█▎        | 2/16 [00:29<03:29, 14.95s/it][A
5.  94%|█████████▍| 15/16 [07:40<00:30, 30.73s/it][A


5.  63%|██████▎   | 19/30 [2:33:26<1:21:20, 443.70s/it]
5. {'loss': 0.0015, 'grad_norm': 0.07839101552963257, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  63%|██████▎   | 19/30 [2:37:32<1:21:20, 443.70s/it]
5. {'train_runtime': 491.3718, 'train_samples_per_second': 0.26, 'train_steps_per_second': 0.033, 'train_loss': 0.002762713353149593, 'epoch': 1.0}


5. 100%|██████████| 16/16 [08:11<00:00, 30.73s/it][A
5. [A
5. 100%|██████████| 16/16 [08:11<00:00, 30.73s/it][A
5. [A
5. 100%|██████████| 16/16 [08:11<00:00, 30.73s/it][A
5. 100%|██████████| 16/16 [08:11<00:00, 30.71s/it]
4.  19%|█▉        | 3/16 [00:44<03:14, 14.94s/it][A
6.  81%|████████▏ | 13/16 [07:00<01:36, 32.32s/it][A
4.  25%|██▌       | 4/16 [00:59<02:59, 14.95s/it][A
4.  31%|███▏      | 5/16 [01:14<02:44, 14.95s/it][A
6.  88%|████████▊ | 14/16 [07:32<01:04, 32.31s/it][A
4.  38%|███▊      | 6/16 [01:29<02:29, 14.95s/it][A
4.  44%|████▍     | 7/16 [01:44<02:14, 14.96s/it][A


4. {'loss': 0.0028, 'grad_norm': 0.2312879115343094, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:59<01:59, 14.97s/it][A
4. [A
6.  94%|█████████▍| 15/16 [08:04<00:32, 32.30s/it][A


6.  53%|█████▎    | 16/30 [2:39:31<3:17:14, 845.33s/it]
6. {'loss': 0.0005, 'grad_norm': 0.14641882479190826, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  53%|█████▎    | 16/30 [2:43:50<3:17:14, 845.33s/it]
6. {'train_runtime': 517.2435, 'train_samples_per_second': 0.247, 'train_steps_per_second': 0.031, 'train_loss': 0.002507351731765084, 'epoch': 1.0}


6. 100%|██████████| 16/16 [08:37<00:00, 32.29s/it][A
6. [A
6. 100%|██████████| 16/16 [08:37<00:00, 32.29s/it][A
6. [A
6. 100%|██████████| 16/16 [08:37<00:00, 32.29s/it][A
6. 100%|██████████| 16/16 [08:37<00:00, 32.33s/it]
4.  50%|█████     | 8/16 [01:59<01:59, 14.97s/it][A
4.  56%|█████▋    | 9/16 [02:14<01:44, 14.98s/it][A


7.  77%|███████▋  | 23/30 [2:47:24<45:51, 393.09s/it]*** -> Training took 509.8369 seconds.
7.  80%|████████  | 24/30 [2:50:18<48:09, 481.57s/it]retraining model for key 'b99e7126' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 339.40 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 335.46 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  62%|██████▎   | 10/16 [02:29<01:29, 14.99s/it][A
4.  69%|██████▉   | 11/16 [02:44<01:14, 14.99s/it][A
4.  75%|███████▌  | 12/16 [02:59<00:59, 14.99s/it][A
4.  81%|████████▏ | 13/16 [03:14<00:44, 15.00s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  88%|████████▊ | 14/16 [03:29<00:30, 15.00s/it][A
4.  94%|█████████▍| 15/16 [03:44<00:15, 15.01s/it][A


4.  63%|██████▎   | 19/30 [2:48:15<1:54:02, 622.02s/it]


4. 100%|██████████| 16/16 [03:59<00:00, 15.02s/it][A
4. [A
4. 100%|██████████| 16/16 [03:59<00:00, 15.02s/it][A
4. [A


4. {'loss': 0.001, 'grad_norm': 0.5668958425521851, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  63%|██████▎   | 19/30 [2:50:15<1:54:02, 622.02s/it]
4. {'train_runtime': 239.7971, 'train_samples_per_second': 0.534, 'train_steps_per_second': 0.067, 'train_loss': 0.0019312590593472123, 'epoch': 1.0}


4. 100%|██████████| 16/16 [03:59<00:00, 15.02s/it][A
4. 100%|██████████| 16/16 [03:59<00:00, 14.99s/it]
7.   6%|▋         | 1/16 [00:57<14:17, 57.14s/it][A


6.  53%|█████▎    | 16/30 [2:43:50<3:17:14, 845.33s/it]*** -> Training took 517.2435 seconds.
6.  57%|█████▋    | 17/30 [2:46:20<2:51:50, 793.11s/it]retraining model for key 'b5ca7ac4' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 528.99 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 522.19 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


6.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  12%|█▎        | 2/16 [01:54<13:17, 56.98s/it][A
6.   6%|▋         | 1/16 [00:32<08:10, 32.67s/it][A
6.  12%|█▎        | 2/16 [01:05<07:37, 32.66s/it][A
7.  19%|█▉        | 3/16 [02:51<12:22, 57.08s/it][A
6.  19%|█▉        | 3/16 [01:37<07:03, 32.56s/it][A
6.  25%|██▌       | 4/16 [02:10<06:31, 32.59s/it][A
7.  25%|██▌       | 4/16 [03:48<11:24, 57.01s/it][A
6.  31%|███▏      | 5/16 [02:43<05:58, 32.60s/it][A
7.  31%|███▏      | 5/16 [04:45<10:27, 57.02s/it][A
6.  38%|███▊      | 6/16 [03:15<05:25, 32.59s/it][A
6.  44%|████▍     | 7/16 [03:48<04:53, 32.57s/it][A


6. {'loss': 0.0043, 'grad_norm': 0.18265783786773682, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [04:20<04:20, 32.58s/it][A
6. [A


5.  63%|██████▎   | 19/30 [2:37:32<1:21:20, 443.70s/it]*** -> Training took 491.3718 seconds.
5.  67%|██████▋   | 20/30 [2:45:53<1:41:36, 609.62s/it]retraining model for key 'a47bf94d' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 564.21 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 557.36 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


7.  38%|███▊      | 6/16 [05:42<09:29, 56.99s/it][A
6.  50%|█████     | 8/16 [04:20<04:20, 32.58s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  56%|█████▋    | 9/16 [04:53<03:47, 32.57s/it][A
7.  44%|████▍     | 7/16 [06:38<08:32, 56.95s/it][A


7. {'loss': 0.0018, 'grad_norm': 0.05625205859541893, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [07:35<07:35, 56.91s/it][A
7. [A


4.  63%|██████▎   | 19/30 [2:50:15<1:54:02, 622.02s/it]*** -> Training took 239.7971 seconds.
4.  67%|██████▋   | 20/30 [2:56:30<1:43:27, 620.79s/it]retraining model for key '89565ca0' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 881.14 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 867.48 examples/s]
5.   6%|▋         | 1/16 [00:31<07:53, 31.59s/it][A
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  62%|██████▎   | 10/16 [05:25<03:15, 32.56s/it][A
5.  12%|█▎        | 2/16 [01:03<07:21, 31.55s/it][A
4.   6%|▋         | 1/16 [00:18<04:40, 18.68s/it][A
7.  50%|█████     | 8/16 [07:35<07:35, 56.91s/it][A
6.  69%|██████▉   | 11/16 [05:58<02:42, 32.56s/it][A
4.  12%|█▎        | 2/16 [00:37<04:21, 18.67s/it][A
5.  19%|█▉        | 3/16 [01:34<06:49, 31.50s/it][A
4.  19%|█▉        | 3/16 [00:56<04:03, 18.71s/it][A
6.  75%|███████▌  | 12/16 [06:30<02:10, 32.57s/it][A
5.  25%|██▌       | 4/16 [02:06<06:17, 31.49s/it][A
4.  25%|██▌       | 4/16 [01:14<03:44, 18.73s/it][A
7.  56%|█████▋    | 9/16 [08:32<06:38, 56.90s/it][A
4.  31%|███▏      | 5/16 [01:33<03:25, 18.73s/it][A
6.  81%|████████▏ | 13/16 [07:03<01:37, 32.57s/it][A
5.  31%|███▏      | 5/16 [02:37<05:46, 31.49s/it][A
4.  38%|███▊      | 6/16 [01:52<03:07, 18.74s/it][A
4.  44%|████▍     | 7/16 [02:11<02:48, 18.75s/it][A


4. {'loss': 0.161, 'grad_norm': 3.3190667629241943, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:29<02:30, 18.76s/it][A
4. [A
6.  88%|████████▊ | 14/16 [07:36<01:05, 32.54s/it][A
5.  38%|███▊      | 6/16 [03:08<05:14, 31.49s/it][A
7.  62%|██████▎   | 10/16 [09:29<05:41, 56.88s/it][A
4.  50%|█████     | 8/16 [02:29<02:30, 18.76s/it][A
6.  94%|█████████▍| 15/16 [08:08<00:32, 32.56s/it][A


6.  57%|█████▋    | 17/30 [2:50:45<2:51:50, 793.11s/it]
6. {'loss': 0.0011, 'grad_norm': 0.20953518152236938, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  57%|█████▋    | 17/30 [2:55:05<2:51:50, 793.11s/it]
6. {'train_runtime': 521.1511, 'train_samples_per_second': 0.246, 'train_steps_per_second': 0.031, 'train_loss': 0.0027190311811864376, 'epoch': 1.0}


6. 100%|██████████| 16/16 [08:41<00:00, 32.55s/it][A
6. [A
6. 100%|██████████| 16/16 [08:41<00:00, 32.55s/it][A
6. [A
6. 100%|██████████| 16/16 [08:41<00:00, 32.55s/it][A
6. 100%|██████████| 16/16 [08:41<00:00, 32.57s/it]
5.  44%|████▍     | 7/16 [03:40<04:43, 31.49s/it][A
5.  50%|█████     | 8/16 [04:11<04:11, 31.48s/it][A
5. [A


5. {'loss': 0.0042, 'grad_norm': 0.1652650535106659, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  56%|█████▋    | 9/16 [02:48<02:11, 18.78s/it][A
4.  62%|██████▎   | 10/16 [03:07<01:52, 18.80s/it][A
5.  50%|█████     | 8/16 [04:11<04:11, 31.48s/it][A
7.  69%|██████▉   | 11/16 [10:26<04:44, 56.90s/it][A
4.  69%|██████▉   | 11/16 [03:26<01:34, 18.82s/it][A
4.  75%|███████▌  | 12/16 [03:45<01:15, 18.83s/it][A
5.  56%|█████▋    | 9/16 [04:43<03:40, 31.49s/it][A
4.  81%|████████▏ | 13/16 [04:04<00:56, 18.84s/it][A
7.  75%|███████▌  | 12/16 [11:23<03:47, 56.91s/it][A
5.  62%|██████▎   | 10/16 [05:14<03:08, 31.50s/it][A
4.  88%|████████▊ | 14/16 [04:22<00:37, 18.84s/it][A
4.  94%|█████████▍| 15/16 [04:41<00:18, 18.84s/it][A


4.  67%|██████▋   | 20/30 [2:59:03<1:43:27, 620.79s/it]
4. {'loss': 0.039, 'grad_norm': 3.3928568363189697, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  67%|██████▋   | 20/30 [3:01:34<1:43:27, 620.79s/it]
4. {'train_runtime': 300.6738, 'train_samples_per_second': 0.426, 'train_steps_per_second': 0.053, 'train_loss': 0.099980054423213, 'epoch': 1.0}


4. 100%|██████████| 16/16 [05:00<00:00, 18.84s/it][A
4. [A
4. 100%|██████████| 16/16 [05:00<00:00, 18.84s/it][A
4. [A
4. 100%|██████████| 16/16 [05:00<00:00, 18.84s/it][A
4. 100%|██████████| 16/16 [05:00<00:00, 18.79s/it]
5.  69%|██████▉   | 11/16 [05:46<02:37, 31.49s/it][A


4.  67%|██████▋   | 20/30 [3:01:34<1:43:27, 620.79s/it]*** -> Training took 300.6738 seconds.
4.  70%|███████   | 21/30 [3:01:48<1:19:31, 530.17s/it]retraining model for key '8b9c3697' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 611.82 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 603.79 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


7.  81%|████████▏ | 13/16 [12:20<02:50, 56.92s/it][A


6.  57%|█████▋    | 17/30 [2:55:05<2:51:50, 793.11s/it]*** -> Training took 521.1511 seconds.
6.  60%|██████    | 18/30 [2:57:43<2:32:00, 760.07s/it]retraining model for key 'b6f77b65' (retrain_dataset_size=15)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 721.60 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 712.85 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


5.  75%|███████▌  | 12/16 [06:17<02:05, 31.49s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
4.   6%|▋         | 1/16 [00:27<06:52, 27.50s/it][A
5.  81%|████████▏ | 13/16 [06:49<01:34, 31.49s/it][A
6.   6%|▋         | 1/16 [00:16<04:13, 16.89s/it][A
7.  88%|████████▊ | 14/16 [13:17<01:53, 56.95s/it][A
6.  12%|█▎        | 2/16 [00:33<03:56, 16.91s/it][A
4.  12%|█▎        | 2/16 [00:55<06:25, 27.53s/it][A
5.  88%|████████▊ | 14/16 [07:20<01:03, 31.50s/it][A
6.  19%|█▉        | 3/16 [00:50<03:39, 16.92s/it][A
6.  25%|██▌       | 4/16 [01:07<03:22, 16.89s/it][A
4.  19%|█▉        | 3/16 [01:22<05:57, 27.53s/it][A
5.  94%|█████████▍| 15/16 [07:52<00:31, 31.50s/it][A


5.  67%|██████▋   | 20/30 [2:50:09<1:41:36, 609.62s/it]


5. 100%|██████████| 16/16 [08:23<00:00, 31.48s/it][A


5. {'loss': 0.0007, 'grad_norm': 0.04952598735690117, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  67%|██████▋   | 20/30 [2:54:21<1:41:36, 609.62s/it]
5. {'train_runtime': 503.908, 'train_samples_per_second': 0.254, 'train_steps_per_second': 0.032, 'train_loss': 0.0024699526838958263, 'epoch': 1.0}


5. [A
5. 100%|██████████| 16/16 [08:23<00:00, 31.48s/it][A
5. [A
5. 100%|██████████| 16/16 [08:23<00:00, 31.48s/it][A
5. 100%|██████████| 16/16 [08:23<00:00, 31.49s/it]
6.  31%|███▏      | 5/16 [01:24<03:05, 16.87s/it][A
7.  94%|█████████▍| 15/16 [14:14<00:57, 57.01s/it][A


7.  80%|████████  | 24/30 [2:57:58<48:09, 481.57s/it]
7. {'loss': 0.0003, 'grad_norm': 0.10868732631206512, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  80%|████████  | 24/30 [3:05:34<48:09, 481.57s/it]
7. {'train_runtime': 911.3662, 'train_samples_per_second': 0.14, 'train_steps_per_second': 0.018, 'train_loss': 0.0010310644429409876, 'epoch': 1.0}


7. 100%|██████████| 16/16 [15:11<00:00, 56.97s/it][A
7. [A
7. 100%|██████████| 16/16 [15:11<00:00, 56.97s/it][A
7. [A
7. 100%|██████████| 16/16 [15:11<00:00, 56.97s/it][A
7. 100%|██████████| 16/16 [15:11<00:00, 56.96s/it]
4.  25%|██▌       | 4/16 [01:50<05:30, 27.53s/it][A
6.  38%|███▊      | 6/16 [01:41<02:48, 16.83s/it][A
6.  44%|████▍     | 7/16 [01:57<02:31, 16.80s/it][A


6. {'loss': 0.0115, 'grad_norm': 0.5122857093811035, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:14<02:14, 16.81s/it][A
6. [A
4.  31%|███▏      | 5/16 [02:17<05:02, 27.53s/it][A
6.  50%|█████     | 8/16 [02:14<02:14, 16.81s/it][A
4.  38%|███▊      | 6/16 [02:45<04:35, 27.54s/it][A
6.  56%|█████▋    | 9/16 [02:31<01:57, 16.84s/it][A
6.  62%|██████▎   | 10/16 [02:48<01:40, 16.82s/it][A
4.  44%|████▍     | 7/16 [03:12<04:07, 27.55s/it][A


4. {'loss': 0.0037, 'grad_norm': 0.18326915800571442, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [03:40<03:40, 27.56s/it][A
4. [A
6.  69%|██████▉   | 11/16 [03:05<01:24, 16.83s/it][A
6.  75%|███████▌  | 12/16 [03:22<01:07, 16.84s/it][A
4.  50%|█████     | 8/16 [03:40<03:40, 27.56s/it][A
6.  81%|████████▏ | 13/16 [03:38<00:50, 16.83s/it][A
4.  56%|█████▋    | 9/16 [04:07<03:13, 27.58s/it][A
6.  88%|████████▊ | 14/16 [03:55<00:33, 16.82s/it][A
6.  94%|█████████▍| 15/16 [04:12<00:16, 16.82s/it][A


6.  60%|██████    | 18/30 [3:00:02<2:32:00, 760.07s/it]
6. {'loss': 0.0056, 'grad_norm': 0.2947135865688324, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  60%|██████    | 18/30 [3:02:16<2:32:00, 760.07s/it]
6. {'train_runtime': 269.3942, 'train_samples_per_second': 0.475, 'train_steps_per_second': 0.059, 'train_loss': 0.008531620958819985, 'epoch': 1.0}


6. 100%|██████████| 16/16 [04:29<00:00, 16.82s/it][A
6. [A
6. 100%|██████████| 16/16 [04:29<00:00, 16.82s/it][A
6. [A
6. 100%|██████████| 16/16 [04:29<00:00, 16.82s/it][A
6. 100%|██████████| 16/16 [04:29<00:00, 16.84s/it]
4.  62%|██████▎   | 10/16 [04:35<02:45, 27.57s/it][A


5.  67%|██████▋   | 20/30 [2:54:21<1:41:36, 609.62s/it]*** -> Training took 503.908 seconds.
5.  70%|███████   | 21/30 [2:57:39<1:35:47, 638.64s/it]retraining model for key 'a6f40cea' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 752.85 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 742.05 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  69%|██████▉   | 11/16 [05:03<02:17, 27.57s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  75%|███████▌  | 12/16 [05:30<01:50, 27.57s/it][A
5.   6%|▋         | 1/16 [00:23<05:47, 23.17s/it][A
4.  81%|████████▏ | 13/16 [05:58<01:22, 27.60s/it][A
5.  12%|█▎        | 2/16 [00:46<05:25, 23.23s/it][A


6.  60%|██████    | 18/30 [3:02:16<2:32:00, 760.07s/it]*** -> Training took 269.3942 seconds.
6.  63%|██████▎   | 19/30 [3:04:06<1:58:35, 646.88s/it]retraining model for key 'c7f57c3e' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1167.98 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1146.08 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  88%|████████▊ | 14/16 [06:25<00:55, 27.59s/it][A
5.  19%|█▉        | 3/16 [01:09<05:01, 23.22s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
6.   6%|▋         | 1/16 [00:14<03:40, 14.67s/it][A
5.  25%|██▌       | 4/16 [01:32<04:38, 23.23s/it][A
4.  94%|█████████▍| 15/16 [06:53<00:27, 27.59s/it][A


4.  70%|███████   | 21/30 [3:05:33<1:19:31, 530.17s/it]
4. {'loss': 0.0019, 'grad_norm': 0.029650548473000526, 'learning_rate': 5e-05, 'epoch': 1.0}


4. 100%|██████████| 16/16 [07:21<00:00, 27.60s/it][A
4. [A
4. 100%|██████████| 16/16 [07:21<00:00, 27.60s/it][A
4. [A


4.  70%|███████   | 21/30 [3:09:14<1:19:31, 530.17s/it]
4. {'train_runtime': 441.1218, 'train_samples_per_second': 0.29, 'train_steps_per_second': 0.036, 'train_loss': 0.0027927608462050557, 'epoch': 1.0}


4. 100%|██████████| 16/16 [07:21<00:00, 27.60s/it][A
4. 100%|██████████| 16/16 [07:21<00:00, 27.57s/it]
6.  12%|█▎        | 2/16 [00:29<03:25, 14.70s/it][A
5.  31%|███▏      | 5/16 [01:56<04:15, 23.23s/it][A
6.  19%|█▉        | 3/16 [00:44<03:11, 14.73s/it][A
6.  25%|██▌       | 4/16 [00:58<02:56, 14.71s/it][A
5.  38%|███▊      | 6/16 [02:19<03:52, 23.21s/it][A
6.  31%|███▏      | 5/16 [01:13<02:41, 14.68s/it][A
5.  44%|████▍     | 7/16 [02:42<03:28, 23.21s/it][A


5. {'loss': 0.0202, 'grad_norm': 0.36407655477523804, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [03:05<03:05, 23.21s/it][A
5. [A
6.  38%|███▊      | 6/16 [01:28<02:26, 14.67s/it][A


7.  80%|████████  | 24/30 [3:05:34<48:09, 481.57s/it]*** -> Training took 911.3662 seconds.
7.  83%|████████▎ | 25/30 [3:11:58<1:00:36, 727.22s/it]retraining model for key 'c4d067a0' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 523.66 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 516.85 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


6.  44%|████▍     | 7/16 [01:42<02:11, 14.66s/it][A


6. {'loss': 0.002, 'grad_norm': 0.1592325121164322, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:57<01:57, 14.64s/it][A
6. [A
5.  50%|█████     | 8/16 [03:05<03:05, 23.21s/it][A
6.  50%|█████     | 8/16 [01:57<01:57, 14.64s/it][A
6.  56%|█████▋    | 9/16 [02:11<01:42, 14.63s/it][A
5.  56%|█████▋    | 9/16 [03:28<02:42, 23.17s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  62%|██████▎   | 10/16 [02:26<01:27, 14.64s/it][A
5.  62%|██████▎   | 10/16 [03:51<02:18, 23.16s/it][A


4.  70%|███████   | 21/30 [3:09:14<1:19:31, 530.17s/it]*** -> Training took 441.1218 seconds.


6.  69%|██████▉   | 11/16 [02:41<01:13, 14.65s/it][A


4.  73%|███████▎  | 22/30 [3:11:35<1:12:57, 547.19s/it]retraining model for key '981571dc' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 276.69 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 274.35 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


7.   6%|▋         | 1/16 [00:35<08:45, 35.00s/it][A
6.  75%|███████▌  | 12/16 [02:55<00:58, 14.65s/it][A
5.  69%|██████▉   | 11/16 [04:15<01:55, 23.17s/it][A
6.  81%|████████▏ | 13/16 [03:10<00:43, 14.65s/it][A
5.  75%|███████▌  | 12/16 [04:38<01:32, 23.18s/it][A
7.  12%|█▎        | 2/16 [01:10<08:10, 35.02s/it][A
6.  88%|████████▊ | 14/16 [03:25<00:29, 14.64s/it][A
6.  94%|█████████▍| 15/16 [03:39<00:14, 14.63s/it][A


6.  63%|██████▎   | 19/30 [3:06:07<1:58:35, 646.88s/it]
6. {'loss': 0.0003, 'grad_norm': 0.009059879928827286, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  63%|██████▎   | 19/30 [3:08:04<1:58:35, 646.88s/it]
6. {'train_runtime': 234.4183, 'train_samples_per_second': 0.546, 'train_steps_per_second': 0.068, 'train_loss': 0.0011792338627856225, 'epoch': 1.0}


6. 100%|██████████| 16/16 [03:54<00:00, 14.62s/it][A
6. [A
6. 100%|██████████| 16/16 [03:54<00:00, 14.62s/it][A
6. [A
6. 100%|██████████| 16/16 [03:54<00:00, 14.62s/it][A
6. 100%|██████████| 16/16 [03:54<00:00, 14.65s/it]
5.  81%|████████▏ | 13/16 [05:01<01:09, 23.20s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  19%|█▉        | 3/16 [01:44<07:34, 34.97s/it][A
5.  88%|████████▊ | 14/16 [05:24<00:46, 23.19s/it][A
5.  94%|█████████▍| 15/16 [05:47<00:23, 23.19s/it][A


5.  70%|███████   | 21/30 [3:00:49<1:35:47, 638.64s/it]
5. {'loss': 0.0026, 'grad_norm': 0.1758640855550766, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  70%|███████   | 21/30 [3:03:54<1:35:47, 638.64s/it]
5. {'train_runtime': 371.1274, 'train_samples_per_second': 0.345, 'train_steps_per_second': 0.043, 'train_loss': 0.011441000388003886, 'epoch': 1.0}


5. 100%|██████████| 16/16 [06:11<00:00, 23.19s/it][A
5. [A
5. 100%|██████████| 16/16 [06:11<00:00, 23.19s/it][A
5. [A
5. 100%|██████████| 16/16 [06:11<00:00, 23.19s/it][A
5. 100%|██████████| 16/16 [06:11<00:00, 23.19s/it]
7.  25%|██▌       | 4/16 [02:19<06:59, 34.94s/it][A
4.   6%|▋         | 1/16 [01:00<15:06, 60.42s/it][A


5.  70%|███████   | 21/30 [3:03:54<1:35:47, 638.64s/it]*** -> Training took 371.1274 seconds.
5.  73%|███████▎  | 22/30 [3:04:17<1:15:31, 566.38s/it]retraining model for key 'aa4ec2a5' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 507.92 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 502.08 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


7.  31%|███▏      | 5/16 [02:55<06:25, 35.03s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  38%|███▊      | 6/16 [03:30<05:50, 35.09s/it][A
4.  12%|█▎        | 2/16 [02:00<14:06, 60.43s/it][A
5.   6%|▋         | 1/16 [00:34<08:40, 34.68s/it][A
7.  44%|████▍     | 7/16 [04:05<05:16, 35.13s/it][A


7. {'loss': 0.0017, 'grad_norm': 0.05394019931554794, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [04:40<04:41, 35.19s/it][A
7. [A
5.  12%|█▎        | 2/16 [01:09<08:05, 34.66s/it][A
4.  19%|█▉        | 3/16 [03:01<13:08, 60.63s/it][A
7.  50%|█████     | 8/16 [04:40<04:41, 35.19s/it][A
5.  19%|█▉        | 3/16 [01:43<07:29, 34.56s/it][A
7.  56%|█████▋    | 9/16 [05:15<04:06, 35.21s/it][A
4.  25%|██▌       | 4/16 [04:02<12:07, 60.63s/it][A
5.  25%|██▌       | 4/16 [02:18<06:54, 34.57s/it][A
7.  62%|██████▎   | 10/16 [05:51<03:31, 35.24s/it][A
5.  31%|███▏      | 5/16 [02:53<06:20, 34.60s/it][A


6.  63%|██████▎   | 19/30 [3:08:04<1:58:35, 646.88s/it]*** -> Training took 234.4183 seconds.
6.  67%|██████▋   | 20/30 [3:12:52<1:41:43, 610.36s/it]retraining model for key 'cb2d8a2c' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 665.45 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 655.84 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


7.  69%|██████▉   | 11/16 [06:26<02:56, 35.27s/it][A
4.  31%|███▏      | 5/16 [05:03<11:07, 60.67s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  38%|███▊      | 6/16 [03:27<05:46, 34.60s/it][A
7.  75%|███████▌  | 12/16 [07:01<02:21, 35.27s/it][A
6.   6%|▋         | 1/16 [00:26<06:32, 26.15s/it][A
5.  44%|████▍     | 7/16 [04:02<05:11, 34.59s/it][A


5. {'loss': 0.006, 'grad_norm': 0.14590959250926971, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [04:36<04:36, 34.60s/it][A
5. [A
4.  38%|███▊      | 6/16 [06:03<10:06, 60.67s/it][A
6.  12%|█▎        | 2/16 [00:52<06:07, 26.23s/it][A
7.  81%|████████▏ | 13/16 [07:37<01:45, 35.29s/it][A
5.  50%|█████     | 8/16 [04:36<04:36, 34.60s/it][A
6.  19%|█▉        | 3/16 [01:18<05:41, 26.27s/it][A
7.  88%|████████▊ | 14/16 [08:12<01:10, 35.28s/it][A
6.  25%|██▌       | 4/16 [01:44<05:14, 26.25s/it][A
5.  56%|█████▋    | 9/16 [05:11<04:02, 34.60s/it][A
4.  44%|████▍     | 7/16 [07:04<09:06, 60.67s/it][A


4. {'loss': 0.0003, 'grad_norm': 0.11740069091320038, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [08:05<08:05, 60.65s/it][A
4. [A
7.  94%|█████████▍| 15/16 [08:47<00:35, 35.21s/it][A


7.  83%|████████▎ | 25/30 [3:16:43<1:00:36, 727.22s/it]


7. 100%|██████████| 16/16 [09:22<00:00, 35.15s/it][A
7. [A
7. 100%|██████████| 16/16 [09:22<00:00, 35.15s/it][A
7. [A
7. 100%|██████████| 16/16 [09:22<00:00, 35.15s/it][A
7. 100%|██████████| 16/16 [09:22<00:00, 35.16s/it]


7. {'loss': 0.0004, 'grad_norm': 0.020928097888827324, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  83%|████████▎ | 25/30 [3:21:25<1:00:36, 727.22s/it]
7. {'train_runtime': 562.5701, 'train_samples_per_second': 0.228, 'train_steps_per_second': 0.028, 'train_loss': 0.0010455397423356771, 'epoch': 1.0}


6.  31%|███▏      | 5/16 [02:11<04:48, 26.22s/it][A
5.  62%|██████▎   | 10/16 [05:45<03:27, 34.59s/it][A
6.  38%|███▊      | 6/16 [02:37<04:22, 26.23s/it][A
4.  50%|█████     | 8/16 [08:05<08:05, 60.65s/it][A
5.  69%|██████▉   | 11/16 [06:20<02:52, 34.59s/it][A
6.  44%|████▍     | 7/16 [03:03<03:56, 26.25s/it][A
6.  50%|█████     | 8/16 [03:29<03:30, 26.25s/it][A
6. [A


6. {'loss': 0.0046, 'grad_norm': 0.15250332653522491, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [03:29<03:30, 26.25s/it][A
5.  75%|███████▌  | 12/16 [06:55<02:18, 34.57s/it][A
4.  56%|█████▋    | 9/16 [09:05<07:04, 60.65s/it][A
6.  56%|█████▋    | 9/16 [03:56<03:03, 26.24s/it][A
5.  81%|████████▏ | 13/16 [07:29<01:43, 34.56s/it][A
6.  62%|██████▎   | 10/16 [04:22<02:37, 26.24s/it][A
5.  88%|████████▊ | 14/16 [08:04<01:09, 34.55s/it][A
6.  69%|██████▉   | 11/16 [04:48<02:11, 26.24s/it][A
4.  62%|██████▎   | 10/16 [10:06<06:03, 60.65s/it][A
5.  94%|█████████▍| 15/16 [08:38<00:34, 34.56s/it][A


5.  73%|███████▎  | 22/30 [3:08:58<1:15:31, 566.38s/it]
5. {'loss': 0.004, 'grad_norm': 0.031392522156238556, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  73%|███████▎  | 22/30 [3:13:35<1:15:31, 566.38s/it]
5. {'train_runtime': 553.2117, 'train_samples_per_second': 0.231, 'train_steps_per_second': 0.029, 'train_loss': 0.005018243798986077, 'epoch': 1.0}


5. 100%|██████████| 16/16 [09:13<00:00, 34.54s/it][A
5. [A
5. 100%|██████████| 16/16 [09:13<00:00, 34.54s/it][A
5. [A
5. 100%|██████████| 16/16 [09:13<00:00, 34.54s/it][A
5. 100%|██████████| 16/16 [09:13<00:00, 34.58s/it]
6.  75%|███████▌  | 12/16 [05:14<01:44, 26.24s/it][A
6.  81%|████████▏ | 13/16 [05:41<01:18, 26.22s/it][A
4.  69%|██████▉   | 11/16 [11:07<05:03, 60.66s/it][A
6.  88%|████████▊ | 14/16 [06:07<00:52, 26.22s/it][A
6.  94%|█████████▍| 15/16 [06:33<00:26, 26.23s/it][A


6.  67%|██████▋   | 20/30 [3:16:26<1:41:43, 610.36s/it]
6. {'loss': 0.0007, 'grad_norm': 0.05707544833421707, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  67%|██████▋   | 20/30 [3:19:55<1:41:43, 610.36s/it]
6. {'train_runtime': 419.6654, 'train_samples_per_second': 0.305, 'train_steps_per_second': 0.038, 'train_loss': 0.0026518430386204273, 'epoch': 1.0}


6. 100%|██████████| 16/16 [06:59<00:00, 26.21s/it][A
6. [A
6. 100%|██████████| 16/16 [06:59<00:00, 26.21s/it][A
6. [A
6. 100%|██████████| 16/16 [06:59<00:00, 26.21s/it][A
6. 100%|██████████| 16/16 [06:59<00:00, 26.23s/it]
4.  75%|███████▌  | 12/16 [12:07<04:02, 60.65s/it][A


7.  83%|████████▎ | 25/30 [3:21:25<1:00:36, 727.22s/it]*** -> Training took 562.5701 seconds.
7.  87%|████████▋ | 26/30 [3:26:22<51:12, 768.19s/it]  retraining model for key 'db0c5428' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 878.51 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 864.50 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


7.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   6%|▋         | 1/16 [00:20<05:13, 20.93s/it][A
4.  81%|████████▏ | 13/16 [13:08<03:01, 60.66s/it][A
7.  12%|█▎        | 2/16 [00:41<04:53, 20.93s/it][A
7.  19%|█▉        | 3/16 [01:02<04:32, 20.93s/it][A
7.  25%|██▌       | 4/16 [01:23<04:11, 20.93s/it][A
4.  88%|████████▊ | 14/16 [14:09<02:01, 60.67s/it][A
7.  31%|███▏      | 5/16 [01:44<03:50, 20.93s/it][A
7.  38%|███▊      | 6/16 [02:05<03:29, 20.92s/it][A
7.  44%|████▍     | 7/16 [02:26<03:08, 20.93s/it][A


7. {'loss': 0.0019, 'grad_norm': 0.1009528785943985, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:47<02:47, 20.92s/it][A
7. [A
4.  94%|█████████▍| 15/16 [15:09<01:00, 60.69s/it][A


4.  73%|███████▎  | 22/30 [3:19:45<1:12:57, 547.19s/it]
4. {'loss': 0.0002, 'grad_norm': 0.034590814262628555, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  73%|███████▎  | 22/30 [3:27:51<1:12:57, 547.19s/it]
4. {'train_runtime': 970.3322, 'train_samples_per_second': 0.132, 'train_steps_per_second': 0.016, 'train_loss': 0.00027054245583713055, 'epoch': 1.0}


4. 100%|██████████| 16/16 [16:10<00:00, 60.65s/it][A
4. [A
4. 100%|██████████| 16/16 [16:10<00:00, 60.65s/it][A
4. [A
4. 100%|██████████| 16/16 [16:10<00:00, 60.65s/it][A
4. 100%|██████████| 16/16 [16:10<00:00, 60.65s/it]
7.  50%|█████     | 8/16 [02:47<02:47, 20.92s/it][A
7.  56%|█████▋    | 9/16 [03:08<02:26, 20.92s/it][A
7.  62%|██████▎   | 10/16 [03:29<02:05, 20.93s/it][A
7.  69%|██████▉   | 11/16 [03:50<01:44, 20.93s/it][A


6.  67%|██████▋   | 20/30 [3:19:55<1:41:43, 610.36s/it]*** -> Training took 419.6654 seconds.
5.  73%|███████▎  | 22/30 [3:13:35<1:15:31, 566.38s/it]*** -> Training took 553.2117 seconds.
6.  70%|███████   | 21/30 [3:24:37<1:35:51, 639.01s/it]retraining model for key 'cbebaa4b' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 799.86 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 785.62 examples/s]


5.  77%|███████▋  | 23/30 [3:19:37<1:18:27, 672.51s/it]retraining model for key 'abc82100' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1031.19 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1013.62 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...
5. *** Start training run...


7.  75%|███████▌  | 12/16 [04:11<01:23, 20.94s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
5.   6%|▋         | 1/16 [00:16<04:09, 16.62s/it][A
7.  81%|████████▏ | 13/16 [04:32<01:02, 20.96s/it][A
6.   6%|▋         | 1/16 [00:21<05:25, 21.71s/it][A
5.  12%|█▎        | 2/16 [00:33<03:53, 16.65s/it][A
7.  88%|████████▊ | 14/16 [04:53<00:41, 20.98s/it][A
6.  12%|█▎        | 2/16 [00:43<05:04, 21.73s/it][A
5.  19%|█▉        | 3/16 [00:49<03:36, 16.63s/it][A
7.  94%|█████████▍| 15/16 [05:14<00:20, 20.97s/it][A


7.  87%|████████▋ | 26/30 [3:29:13<51:12, 768.19s/it]
7. {'loss': 0.0001, 'grad_norm': 0.0061118993908166885, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  87%|████████▋ | 26/30 [3:32:01<51:12, 768.19s/it]
7. {'train_runtime': 335.0379, 'train_samples_per_second': 0.382, 'train_steps_per_second': 0.048, 'train_loss': 0.0009704426665848587, 'epoch': 1.0}


7. 100%|██████████| 16/16 [05:35<00:00, 20.94s/it][A
7. [A
7. 100%|██████████| 16/16 [05:35<00:00, 20.94s/it][A
7. [A
7. 100%|██████████| 16/16 [05:35<00:00, 20.94s/it][A
7. 100%|██████████| 16/16 [05:35<00:00, 20.94s/it]
5.  25%|██▌       | 4/16 [01:06<03:19, 16.58s/it][A
6.  19%|█▉        | 3/16 [01:05<04:42, 21.71s/it][A
5.  31%|███▏      | 5/16 [01:22<03:01, 16.53s/it][A
6.  25%|██▌       | 4/16 [01:26<04:19, 21.67s/it][A
5.  38%|███▊      | 6/16 [01:39<02:45, 16.53s/it][A
6.  31%|███▏      | 5/16 [01:48<03:58, 21.67s/it][A
5.  44%|████▍     | 7/16 [01:55<02:28, 16.54s/it][A


5. {'loss': 0.0053, 'grad_norm': 0.9292880296707153, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:12<02:12, 16.55s/it][A
5. [A
5.  50%|█████     | 8/16 [02:12<02:12, 16.55s/it][A
6.  38%|███▊      | 6/16 [02:10<03:36, 21.68s/it][A
5.  56%|█████▋    | 9/16 [02:29<01:55, 16.54s/it][A
6.  44%|████▍     | 7/16 [02:31<03:15, 21.68s/it][A


6. {'loss': 0.0061, 'grad_norm': 0.34329506754875183, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:53<02:53, 21.67s/it][A
6. [A
5.  62%|██████▎   | 10/16 [02:45<01:39, 16.53s/it][A
6.  50%|█████     | 8/16 [02:53<02:53, 21.67s/it][A
5.  69%|██████▉   | 11/16 [03:02<01:22, 16.53s/it][A


7.  87%|████████▋ | 26/30 [3:32:01<51:12, 768.19s/it]*** -> Training took 335.0379 seconds.
7.  90%|█████████ | 27/30 [3:34:09<33:53, 677.91s/it]retraining model for key 'de809cff' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1029.89 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1012.14 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


5.  75%|███████▌  | 12/16 [03:18<01:06, 16.54s/it][A
6.  56%|█████▋    | 9/16 [03:15<02:31, 21.67s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  81%|████████▏ | 13/16 [03:35<00:49, 16.55s/it][A
6.  62%|██████▎   | 10/16 [03:36<02:10, 21.68s/it][A
7.   6%|▋         | 1/16 [00:17<04:19, 17.30s/it][A
5.  88%|████████▊ | 14/16 [03:51<00:33, 16.55s/it][A
6.  69%|██████▉   | 11/16 [03:58<01:48, 21.68s/it][A
7.  12%|█▎        | 2/16 [00:34<04:02, 17.33s/it][A
5.  94%|█████████▍| 15/16 [04:08<00:16, 16.54s/it][A


5.  77%|███████▋  | 23/30 [3:21:54<1:18:27, 672.51s/it]
5. {'loss': 0.0026, 'grad_norm': 0.11454594880342484, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  77%|███████▋  | 23/30 [3:24:06<1:18:27, 672.51s/it]
5. {'train_runtime': 264.7704, 'train_samples_per_second': 0.483, 'train_steps_per_second': 0.06, 'train_loss': 0.0039426099974662066, 'epoch': 1.0}


5. 100%|██████████| 16/16 [04:24<00:00, 16.53s/it][A
5. [A
5. 100%|██████████| 16/16 [04:24<00:00, 16.53s/it][A
5. [A
5. 100%|██████████| 16/16 [04:24<00:00, 16.53s/it][A
5. 100%|██████████| 16/16 [04:24<00:00, 16.55s/it]
7.  19%|█▉        | 3/16 [00:52<03:45, 17.35s/it][A
6.  75%|███████▌  | 12/16 [04:20<01:26, 21.68s/it][A
7.  25%|██▌       | 4/16 [01:09<03:28, 17.37s/it][A
6.  81%|████████▏ | 13/16 [04:41<01:05, 21.68s/it][A
7.  31%|███▏      | 5/16 [01:26<03:11, 17.40s/it][A
6.  88%|████████▊ | 14/16 [05:03<00:43, 21.68s/it][A
7.  38%|███▊      | 6/16 [01:44<02:54, 17.41s/it][A
6.  94%|█████████▍| 15/16 [05:25<00:21, 21.66s/it][A


6.  70%|███████   | 21/30 [3:27:35<1:35:51, 639.01s/it]
6. {'loss': 0.0012, 'grad_norm': 0.09858505427837372, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  70%|███████   | 21/30 [3:30:28<1:35:51, 639.01s/it]
6. {'train_runtime': 346.7588, 'train_samples_per_second': 0.369, 'train_steps_per_second': 0.046, 'train_loss': 0.003629448590800166, 'epoch': 1.0}


6. 100%|██████████| 16/16 [05:46<00:00, 21.64s/it][A
6. [A
6. 100%|██████████| 16/16 [05:46<00:00, 21.64s/it][A
6. [A
6. 100%|██████████| 16/16 [05:46<00:00, 21.64s/it][A
6. 100%|██████████| 16/16 [05:46<00:00, 21.67s/it]
7.  44%|████▍     | 7/16 [02:01<02:36, 17.42s/it][A


7. {'loss': 0.0112, 'grad_norm': 0.06932200491428375, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:19<02:19, 17.42s/it][A
7. [A
7.  50%|█████     | 8/16 [02:19<02:19, 17.42s/it][A
7.  56%|█████▋    | 9/16 [02:36<02:02, 17.43s/it][A


4.  73%|███████▎  | 22/30 [3:27:51<1:12:57, 547.19s/it]*** -> Training took 970.3322 seconds.
4.  77%|███████▋  | 23/30 [3:35:50<1:35:37, 819.63s/it]retraining model for key '9bbf930d' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 932.40 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 914.56 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


7.  62%|██████▎   | 10/16 [02:54<01:44, 17.46s/it][A
7.  69%|██████▉   | 11/16 [03:11<01:27, 17.49s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  75%|███████▌  | 12/16 [03:29<01:10, 17.51s/it][A
4.   6%|▋         | 1/16 [00:18<04:34, 18.27s/it][A
7.  81%|████████▏ | 13/16 [03:46<00:52, 17.51s/it][A
4.  12%|█▎        | 2/16 [00:36<04:15, 18.26s/it][A


5.  77%|███████▋  | 23/30 [3:24:06<1:18:27, 672.51s/it]*** -> Training took 264.7704 seconds.
5.  80%|████████  | 24/30 [3:27:20<1:00:56, 609.48s/it]retraining model for key 'b0039139' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 2142.29 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  88%|████████▊ | 14/16 [04:04<00:35, 17.51s/it][A
4.  19%|█▉        | 3/16 [00:54<03:57, 18.26s/it][A
5.   6%|▋         | 1/16 [00:07<01:54,  7.61s/it][A
5.  12%|█▎        | 2/16 [00:15<01:46,  7.60s/it][A
7.  94%|█████████▍| 15/16 [04:21<00:17, 17.53s/it][A


7.  90%|█████████ | 27/30 [3:36:32<33:53, 677.91s/it]


7. 100%|██████████| 16/16 [04:39<00:00, 17.56s/it][A
7. [A
7. 100%|██████████| 16/16 [04:39<00:00, 17.56s/it][A
7. [A
7. 100%|██████████| 16/16 [04:39<00:00, 17.56s/it][A
7. 100%|██████████| 16/16 [04:39<00:00, 17.47s/it]


7. {'loss': 0.0031, 'grad_norm': 0.15544168651103973, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  90%|█████████ | 27/30 [3:38:52<33:53, 677.91s/it]
7. {'train_runtime': 279.4964, 'train_samples_per_second': 0.458, 'train_steps_per_second': 0.057, 'train_loss': 0.007159478031098843, 'epoch': 1.0}


5.  19%|█▉        | 3/16 [00:22<01:39,  7.62s/it][A
4.  25%|██▌       | 4/16 [01:13<03:38, 18.25s/it][A
5.  25%|██▌       | 4/16 [00:30<01:31,  7.63s/it][A
5.  31%|███▏      | 5/16 [00:38<01:24,  7.65s/it][A
4.  31%|███▏      | 5/16 [01:31<03:20, 18.24s/it][A
5.  38%|███▊      | 6/16 [00:45<01:16,  7.67s/it][A
5.  44%|████▍     | 7/16 [00:53<01:09,  7.68s/it][A


5. {'loss': 0.028, 'grad_norm': 1.4899073839187622, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:01<01:01,  7.70s/it][A
5. [A
4.  38%|███▊      | 6/16 [01:49<03:02, 18.25s/it][A
5.  50%|█████     | 8/16 [01:01<01:01,  7.70s/it][A
5.  56%|█████▋    | 9/16 [01:09<00:53,  7.70s/it][A
5.  62%|██████▎   | 10/16 [01:16<00:46,  7.70s/it][A
4.  44%|████▍     | 7/16 [02:07<02:44, 18.25s/it][A


4. {'loss': 0.005, 'grad_norm': 0.05529303103685379, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:26<02:25, 18.25s/it][A
4. [A
5.  69%|██████▉   | 11/16 [01:24<00:38,  7.71s/it][A
5.  75%|███████▌  | 12/16 [01:32<00:30,  7.72s/it][A
4.  50%|█████     | 8/16 [02:26<02:25, 18.25s/it][A
5.  81%|████████▏ | 13/16 [01:39<00:23,  7.72s/it][A
5.  88%|████████▊ | 14/16 [01:47<00:15,  7.71s/it][A
4.  56%|█████▋    | 9/16 [02:44<02:07, 18.23s/it][A
5.  94%|█████████▍| 15/16 [01:55<00:07,  7.72s/it][A


5.  80%|████████  | 24/30 [3:28:24<1:00:56, 609.48s/it]
5. {'loss': 0.0062, 'grad_norm': 0.4147650897502899, 'learning_rate': 5e-05, 'epoch': 1.0}


5. 100%|██████████| 16/16 [02:03<00:00,  7.71s/it][A
5. [A
5. 100%|██████████| 16/16 [02:03<00:00,  7.71s/it][A
5. [A
5. 100%|██████████| 16/16 [02:03<00:00,  7.71s/it][A
5. 100%|██████████| 16/16 [02:03<00:00,  7.69s/it]


5.  80%|████████  | 24/30 [3:29:26<1:00:56, 609.48s/it]
5. {'train_runtime': 123.0635, 'train_samples_per_second': 1.04, 'train_steps_per_second': 0.13, 'train_loss': 0.0170867380220443, 'epoch': 1.0}


4.  62%|██████▎   | 10/16 [03:02<01:49, 18.22s/it][A


7.  90%|█████████ | 27/30 [3:38:52<33:53, 677.91s/it]*** -> Training took 279.4964 seconds.
7.  93%|█████████▎| 28/30 [3:40:51<19:49, 594.97s/it]retraining model for key 'f931b4a8' (retrain_dataset_size=10)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 1568.49 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


4.  69%|██████▉   | 11/16 [03:20<01:30, 18.20s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
7.   6%|▋         | 1/16 [00:11<02:48, 11.25s/it][A
4.  75%|███████▌  | 12/16 [03:38<01:12, 18.21s/it][A
7.  12%|█▎        | 2/16 [00:22<02:37, 11.25s/it][A
7.  19%|█▉        | 3/16 [00:33<02:26, 11.25s/it][A
4.  81%|████████▏ | 13/16 [03:57<00:54, 18.22s/it][A


5.  80%|████████  | 24/30 [3:29:26<1:00:56, 609.48s/it]*** -> Training took 123.0635 seconds.
5.  83%|████████▎ | 25/30 [3:30:48<40:45, 489.06s/it]  retraining model for key 'b9e38dc0' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1159.00 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1137.01 examples/s]
7.  25%|██▌       | 4/16 [00:45<02:15, 11.25s/it][A
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  88%|████████▊ | 14/16 [04:15<00:36, 18.23s/it][A
7.  31%|███▏      | 5/16 [00:56<02:03, 11.25s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  38%|███▊      | 6/16 [01:07<01:52, 11.26s/it][A
4.  94%|█████████▍| 15/16 [04:33<00:18, 18.22s/it][A


4.  77%|███████▋  | 23/30 [3:38:20<1:35:37, 819.63s/it]
4. {'loss': 0.0005, 'grad_norm': 0.17842017114162445, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  77%|███████▋  | 23/30 [3:40:46<1:35:37, 819.63s/it]


4. 100%|██████████| 16/16 [04:51<00:00, 18.21s/it][A
4. [A
4. 100%|██████████| 16/16 [04:51<00:00, 18.21s/it][A
4. [A


4. {'train_runtime': 291.673, 'train_samples_per_second': 0.439, 'train_steps_per_second': 0.055, 'train_loss': 0.0027462328143883497, 'epoch': 1.0}


4. 100%|██████████| 16/16 [04:51<00:00, 18.21s/it][A
4. 100%|██████████| 16/16 [04:51<00:00, 18.23s/it]
5.   6%|▋         | 1/16 [00:14<03:37, 14.50s/it][A
7.  44%|████▍     | 7/16 [01:18<01:41, 11.27s/it][A


7. {'loss': 0.0054, 'grad_norm': 0.2172616422176361, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [01:30<01:30, 11.27s/it][A
7. [A
7.  50%|█████     | 8/16 [01:30<01:30, 11.27s/it][A
5.  12%|█▎        | 2/16 [00:29<03:23, 14.54s/it][A
7.  56%|█████▋    | 9/16 [01:41<01:18, 11.28s/it][A
5.  19%|█▉        | 3/16 [00:43<03:09, 14.61s/it][A


6.  70%|███████   | 21/30 [3:30:28<1:35:51, 639.01s/it]*** -> Training took 346.7588 seconds.
6.  73%|███████▎  | 22/30 [3:36:55<1:29:08, 668.58s/it]retraining model for key 'da515329' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 859.01 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 846.03 examples/s]
7.  62%|██████▎   | 10/16 [01:52<01:07, 11.28s/it][A
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


5.  25%|██▌       | 4/16 [00:58<02:56, 14.71s/it][A
7.  69%|██████▉   | 11/16 [02:03<00:56, 11.29s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
7.  75%|███████▌  | 12/16 [02:15<00:45, 11.29s/it][A
5.  31%|███▏      | 5/16 [01:13<02:42, 14.78s/it][A
7.  81%|████████▏ | 13/16 [02:26<00:33, 11.30s/it][A
5.  38%|███▊      | 6/16 [01:28<02:28, 14.82s/it][A
6.   6%|▋         | 1/16 [00:20<05:00, 20.02s/it][A
7.  88%|████████▊ | 14/16 [02:37<00:22, 11.32s/it][A
5.  44%|████▍     | 7/16 [01:43<02:13, 14.84s/it][A


5. {'loss': 0.0042, 'grad_norm': 0.1251697540283203, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:58<01:58, 14.84s/it][A
5. [A
7.  94%|█████████▍| 15/16 [02:49<00:11, 11.32s/it][A


7.  93%|█████████▎| 28/30 [3:42:24<19:49, 594.97s/it]
7. {'loss': 0.0011, 'grad_norm': 0.26186227798461914, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  93%|█████████▎| 28/30 [3:43:55<19:49, 594.97s/it]
7. {'train_runtime': 180.6133, 'train_samples_per_second': 0.709, 'train_steps_per_second': 0.089, 'train_loss': 0.0032686228514648974, 'epoch': 1.0}


7. 100%|██████████| 16/16 [03:00<00:00, 11.32s/it][A
7. [A
7. 100%|██████████| 16/16 [03:00<00:00, 11.32s/it][A
7. [A
7. 100%|██████████| 16/16 [03:00<00:00, 11.32s/it][A
7. 100%|██████████| 16/16 [03:00<00:00, 11.29s/it]
6.  12%|█▎        | 2/16 [00:40<04:40, 20.05s/it][A
5.  50%|█████     | 8/16 [01:58<01:58, 14.84s/it][A


7.  93%|█████████▎| 28/30 [3:43:55<19:49, 594.97s/it]*** -> Training took 180.6133 seconds.
7.  97%|█████████▋| 29/30 [3:44:09<07:55, 475.95s/it]retraining model for key 'fc7cae8d' (retrain_dataset_size=5)
7. *** Set model state_dict...


7. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 956.60 examples/s][A
7. Map: 100%|██████████| 128/128 [00:00<00:00, 940.71 examples/s]
7. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
7.    \\   /|    Num examples = 128 | Num Epochs = 1
7. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
7. \        /    Total batch size = 8 | Total steps = 16
7.  "-____-"     Number of trainable parameters = 94,044,160


7. *** Start training run...


6.  19%|█▉        | 3/16 [01:00<04:20, 20.05s/it][A
5.  56%|█████▋    | 9/16 [02:12<01:43, 14.84s/it][A
7.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  62%|██████▎   | 10/16 [02:27<01:29, 14.85s/it][A
6.  25%|██▌       | 4/16 [01:20<04:00, 20.01s/it][A
5.  69%|██████▉   | 11/16 [02:42<01:14, 14.86s/it][A
7.   6%|▋         | 1/16 [00:19<04:49, 19.32s/it][A
6.  31%|███▏      | 5/16 [01:40<03:40, 20.01s/it][A
5.  75%|███████▌  | 12/16 [02:57<00:59, 14.87s/it][A
7.  12%|█▎        | 2/16 [00:38<04:29, 19.27s/it][A
6.  38%|███▊      | 6/16 [01:59<03:19, 19.96s/it][A
5.  81%|████████▏ | 13/16 [03:12<00:44, 14.87s/it][A


4.  77%|███████▋  | 23/30 [3:40:46<1:35:37, 819.63s/it]*** -> Training took 291.673 seconds.
4.  80%|████████  | 24/30 [3:43:56<1:11:55, 719.29s/it]retraining model for key 'bf45cf4b' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 1613.87 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


7.  19%|█▉        | 3/16 [00:57<04:10, 19.27s/it][A
5.  88%|████████▊ | 14/16 [03:27<00:29, 14.87s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  44%|████▍     | 7/16 [02:19<02:59, 19.98s/it][A


6. {'loss': 0.0092, 'grad_norm': 0.45221665501594543, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:40<02:39, 20.00s/it][A
6. [A
4.   6%|▋         | 1/16 [00:09<02:28,  9.90s/it][A
7.  25%|██▌       | 4/16 [01:17<03:51, 19.29s/it][A
5.  94%|█████████▍| 15/16 [03:42<00:14, 14.86s/it][A


5.  83%|████████▎ | 25/30 [3:32:49<40:45, 489.06s/it]
5. {'loss': 0.001, 'grad_norm': 0.033129796385765076, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  83%|████████▎ | 25/30 [3:34:48<40:45, 489.06s/it]


5. 100%|██████████| 16/16 [03:57<00:00, 14.86s/it][A
5. [A
5. 100%|██████████| 16/16 [03:57<00:00, 14.86s/it][A
5. [A


5. {'train_runtime': 237.0597, 'train_samples_per_second': 0.54, 'train_steps_per_second': 0.067, 'train_loss': 0.0026431650621816516, 'epoch': 1.0}


5. 100%|██████████| 16/16 [03:57<00:00, 14.86s/it][A
5. 100%|██████████| 16/16 [03:57<00:00, 14.82s/it]
4.  12%|█▎        | 2/16 [00:19<02:18,  9.87s/it][A
6.  50%|█████     | 8/16 [02:40<02:39, 20.00s/it][A
4.  19%|█▉        | 3/16 [00:29<02:08,  9.86s/it][A
7.  31%|███▏      | 5/16 [01:36<03:32, 19.30s/it][A
4.  25%|██▌       | 4/16 [00:39<01:58,  9.85s/it][A
6.  56%|█████▋    | 9/16 [03:00<02:20, 20.01s/it][A
7.  38%|███▊      | 6/16 [01:55<03:13, 19.30s/it][A
4.  31%|███▏      | 5/16 [00:49<01:48,  9.87s/it][A
4.  38%|███▊      | 6/16 [00:59<01:38,  9.90s/it][A
6.  62%|██████▎   | 10/16 [03:20<02:00, 20.00s/it][A


5.  83%|████████▎ | 25/30 [3:34:48<40:45, 489.06s/it]*** -> Training took 237.0597 seconds.


7.  44%|████▍     | 7/16 [02:15<02:53, 19.31s/it][A


7. {'loss': 0.0283, 'grad_norm': 0.5998648405075073, 'learning_rate': 5e-05, 'epoch': 0.5}


7.  50%|█████     | 8/16 [02:34<02:34, 19.30s/it][A
7. [A


5.  87%|████████▋ | 26/30 [3:35:45<28:45, 431.45s/it]retraining model for key 'd59b0160' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 975.81 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 958.80 examples/s]
4.  44%|████▍     | 7/16 [01:09<01:29,  9.93s/it][A


4. {'loss': 0.007, 'grad_norm': 0.47900575399398804, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [01:19<01:19,  9.96s/it][A
4. [A
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  50%|█████     | 8/16 [01:19<01:19,  9.96s/it][A
6.  69%|██████▉   | 11/16 [03:39<01:39, 19.98s/it][A
7.  50%|█████     | 8/16 [02:34<02:34, 19.30s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  56%|█████▋    | 9/16 [01:29<01:09,  9.99s/it][A
4.  62%|██████▎   | 10/16 [01:39<01:00, 10.01s/it][A
6.  75%|███████▌  | 12/16 [03:59<01:19, 19.96s/it][A
5.   6%|▋         | 1/16 [00:17<04:27, 17.85s/it][A
7.  56%|█████▋    | 9/16 [02:53<02:15, 19.30s/it][A
4.  69%|██████▉   | 11/16 [01:49<00:50, 10.02s/it][A
4.  75%|███████▌  | 12/16 [01:59<00:40, 10.01s/it][A
6.  81%|████████▏ | 13/16 [04:19<00:59, 19.95s/it][A
5.  12%|█▎        | 2/16 [00:35<04:09, 17.81s/it][A
7.  62%|██████▎   | 10/16 [03:12<01:55, 19.30s/it][A
4.  81%|████████▏ | 13/16 [02:09<00:29, 10.00s/it][A
4.  88%|████████▊ | 14/16 [02:19<00:19,  9.99s/it][A
6.  88%|████████▊ | 14/16 [04:39<00:39, 19.96s/it][A
5.  19%|█▉        | 3/16 [00:53<03:50, 17.76s/it][A
7.  69%|██████▉   | 11/16 [03:32<01:36, 19.3

4.  80%|████████  | 24/30 [3:45:18<1:11:55, 719.29s/it]
4. {'loss': 0.002, 'grad_norm': 0.5310130715370178, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  80%|████████  | 24/30 [3:46:38<1:11:55, 719.29s/it]


4. 100%|██████████| 16/16 [02:39<00:00,  9.95s/it][A
4. [A
4. 100%|██████████| 16/16 [02:39<00:00,  9.95s/it][A
4. [A


4. {'train_runtime': 159.2183, 'train_samples_per_second': 0.804, 'train_steps_per_second': 0.1, 'train_loss': 0.004507002653554082, 'epoch': 1.0}


4. 100%|██████████| 16/16 [02:39<00:00,  9.95s/it][A
4. 100%|██████████| 16/16 [02:39<00:00,  9.95s/it]
5.  25%|██▌       | 4/16 [01:10<03:32, 17.72s/it][A
6.  94%|█████████▍| 15/16 [04:59<00:19, 19.98s/it][A


6.  73%|███████▎  | 22/30 [3:39:39<1:29:08, 668.58s/it]
6. {'loss': 0.0034, 'grad_norm': 0.12535515427589417, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  73%|███████▎  | 22/30 [3:42:19<1:29:08, 668.58s/it]
6. {'train_runtime': 319.8285, 'train_samples_per_second': 0.4, 'train_steps_per_second': 0.05, 'train_loss': 0.006279423832893372, 'epoch': 1.0}


6. 100%|██████████| 16/16 [05:19<00:00, 19.98s/it][A
6. [A
6. 100%|██████████| 16/16 [05:19<00:00, 19.98s/it][A
6. [A
6. 100%|██████████| 16/16 [05:19<00:00, 19.98s/it][A
6. 100%|██████████| 16/16 [05:19<00:00, 19.99s/it]
7.  75%|███████▌  | 12/16 [03:51<01:17, 19.31s/it][A


4.  80%|████████  | 24/30 [3:46:38<1:11:55, 719.29s/it]*** -> Training took 159.2183 seconds.
4.  83%|████████▎ | 25/30 [3:46:56<46:27, 557.54s/it]  retraining model for key 'd35bdbdc' (retrain_dataset_size=15)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 2136.70 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


4.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  31%|███▏      | 5/16 [01:28<03:14, 17.71s/it][A
7.  81%|████████▏ | 13/16 [04:10<00:57, 19.31s/it][A
4.   6%|▋         | 1/16 [00:07<01:49,  7.27s/it][A
4.  12%|█▎        | 2/16 [00:14<01:41,  7.25s/it][A
5.  38%|███▊      | 6/16 [01:46<02:57, 17.72s/it][A
4.  19%|█▉        | 3/16 [00:21<01:34,  7.23s/it][A
7.  88%|████████▊ | 14/16 [04:30<00:38, 19.31s/it][A
4.  25%|██▌       | 4/16 [00:28<01:26,  7.23s/it][A
5.  44%|████▍     | 7/16 [02:04<02:39, 17.73s/it][A


5. {'loss': 0.0026, 'grad_norm': 0.41488978266716003, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [02:21<02:21, 17.74s/it][A
5. [A
4.  31%|███▏      | 5/16 [00:36<01:19,  7.23s/it][A


6.  73%|███████▎  | 22/30 [3:42:19<1:29:08, 668.58s/it]*** -> Training took 319.8285 seconds.
6.  77%|███████▋  | 23/30 [3:43:14<1:07:51, 581.66s/it]retraining model for key 'db695cfb' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1050.30 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1032.06 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  38%|███▊      | 6/16 [00:43<01:12,  7.23s/it][A
7.  94%|█████████▍| 15/16 [04:49<00:19, 19.33s/it][A


7.  97%|█████████▋| 29/30 [3:46:47<07:55, 475.95s/it]
7. {'loss': 0.0037, 'grad_norm': 0.4727959632873535, 'learning_rate': 5e-05, 'epoch': 1.0}
7.  97%|█████████▋| 29/30 [3:49:21<07:55, 475.95s/it]
7. {'train_runtime': 308.9688, 'train_samples_per_second': 0.414, 'train_steps_per_second': 0.052, 'train_loss': 0.01602553960401565, 'epoch': 1.0}


7. 100%|██████████| 16/16 [05:08<00:00, 19.34s/it][A
7. [A
7. 100%|██████████| 16/16 [05:08<00:00, 19.34s/it][A
7. [A
7. 100%|██████████| 16/16 [05:08<00:00, 19.34s/it][A
7. 100%|██████████| 16/16 [05:08<00:00, 19.31s/it]
4.  44%|████▍     | 7/16 [00:50<01:05,  7.23s/it][A


4. {'loss': 0.0096, 'grad_norm': 0.5668365359306335, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [00:57<00:57,  7.23s/it][A
4. [A
5.  50%|█████     | 8/16 [02:21<02:21, 17.74s/it][A
4.  50%|█████     | 8/16 [00:57<00:57,  7.23s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A


7.  97%|█████████▋| 29/30 [3:49:21<07:55, 475.95s/it]*** -> Training took 308.9688 seconds.
7. 100%|██████████| 30/30 [3:49:37<00:00, 431.52s/it]
7. 100%|██████████| 30/30 [3:49:37<00:00, 459.25s/it]
7. *** Completed inference run.


4.  56%|█████▋    | 9/16 [01:05<00:50,  7.23s/it][A


7. calculate augmented scores:   0%|          | 0/28 [00:00<?, ?it/s]


5.  56%|█████▋    | 9/16 [02:39<02:04, 17.73s/it][A
4.  62%|██████▎   | 10/16 [01:12<00:43,  7.23s/it][A
6.   6%|▋         | 1/16 [00:16<04:04, 16.32s/it][A
4.  69%|██████▉   | 11/16 [01:19<00:36,  7.23s/it][A
4.  75%|███████▌  | 12/16 [01:26<00:28,  7.23s/it][A
5.  62%|██████▎   | 10/16 [02:57<01:46, 17.73s/it][A
6.  12%|█▎        | 2/16 [00:32<03:48, 16.34s/it][A


7. calculate augmented scores:   4%|▎         | 1/28 [00:09<04:11,  9.33s/it]


4.  81%|████████▏ | 13/16 [01:34<00:21,  7.23s/it][A


7. calculate augmented scores:   7%|▋         | 2/28 [00:30<07:01, 16.22s/it]
7. calculate augmented scores:  11%|█         | 3/28 [00:33<04:13, 10.14s/it]


4.  88%|████████▊ | 14/16 [01:41<00:14,  7.23s/it][A
5.  69%|██████▉   | 11/16 [03:15<01:28, 17.72s/it][A
6.  19%|█▉        | 3/16 [00:48<03:32, 16.33s/it][A
4.  94%|█████████▍| 15/16 [01:48<00:07,  7.23s/it][A


4.  83%|████████▎ | 25/30 [3:47:57<46:27, 557.54s/it]
4. {'loss': 0.0012, 'grad_norm': 0.17086081206798553, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  83%|████████▎ | 25/30 [3:48:55<46:27, 557.54s/it]
4. {'train_runtime': 115.7194, 'train_samples_per_second': 1.106, 'train_steps_per_second': 0.138, 'train_loss': 0.0053915337775833905, 'epoch': 1.0}


4. 100%|██████████| 16/16 [01:55<00:00,  7.23s/it][A
4. [A
4. 100%|██████████| 16/16 [01:55<00:00,  7.23s/it][A
4. [A
4. 100%|██████████| 16/16 [01:55<00:00,  7.23s/it][A
4. 100%|██████████| 16/16 [01:55<00:00,  7.23s/it]


7. calculate augmented scores:  14%|█▍        | 4/28 [00:36<02:54,  7.26s/it]


6.  25%|██▌       | 4/16 [01:05<03:15, 16.31s/it][A
5.  75%|███████▌  | 12/16 [03:32<01:10, 17.72s/it][A


7. calculate augmented scores:  18%|█▊        | 5/28 [01:00<05:13, 13.61s/it]


6.  31%|███▏      | 5/16 [01:21<02:59, 16.28s/it][A
5.  81%|████████▏ | 13/16 [03:50<00:53, 17.72s/it][A
6.  38%|███▊      | 6/16 [01:37<02:42, 16.25s/it][A
5.  88%|████████▊ | 14/16 [04:08<00:35, 17.72s/it][A
6.  44%|████▍     | 7/16 [01:53<02:26, 16.24s/it][A


6. {'loss': 0.0052, 'grad_norm': 0.2116723358631134, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [02:10<02:09, 16.24s/it][A
6. [A


7. calculate augmented scores:  21%|██▏       | 6/28 [01:13<04:49, 13.15s/it]


5.  94%|█████████▍| 15/16 [04:25<00:17, 17.73s/it][A


5.  87%|████████▋ | 26/30 [3:38:10<28:45, 431.45s/it]
5. {'loss': 0.0004, 'grad_norm': 0.2824195921421051, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  87%|████████▋ | 26/30 [3:40:32<28:45, 431.45s/it]
5. {'train_runtime': 283.6792, 'train_samples_per_second': 0.451, 'train_steps_per_second': 0.056, 'train_loss': 0.0015252268058247864, 'epoch': 1.0}


5. 100%|██████████| 16/16 [04:43<00:00, 17.72s/it][A
5. [A
5. 100%|██████████| 16/16 [04:43<00:00, 17.72s/it][A
5. [A
5. 100%|██████████| 16/16 [04:43<00:00, 17.72s/it][A
5. 100%|██████████| 16/16 [04:43<00:00, 17.73s/it]
6.  50%|█████     | 8/16 [02:10<02:09, 16.24s/it][A


7. calculate augmented scores:  25%|██▌       | 7/28 [01:51<07:30, 21.44s/it]


6.  56%|█████▋    | 9/16 [02:26<01:53, 16.24s/it][A


7. calculate augmented scores:  29%|██▊       | 8/28 [02:20<07:57, 23.89s/it]


6.  62%|██████▎   | 10/16 [02:42<01:37, 16.25s/it][A


7. calculate augmented scores:  32%|███▏      | 9/28 [02:26<05:46, 18.25s/it]


6.  69%|██████▉   | 11/16 [02:58<01:21, 16.26s/it][A


7. calculate augmented scores:  36%|███▌      | 10/28 [02:46<05:36, 18.70s/it]
4.  83%|████████▎ | 25/30 [3:48:55<46:27, 557.54s/it]*** -> Training took 115.7194 seconds.
4.  87%|████████▋ | 26/30 [3:51:12<31:08, 467.06s/it]retraining model for key 'dfadab01' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 763.73 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 752.89 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...
7. calculate augmented scores:  39%|███▉      | 11/28 [03:01<05:01, 17.73s/it]


6.  75%|███████▌  | 12/16 [03:15<01:04, 16.24s/it][A


7. calculate augmented scores:  43%|████▎     | 12/28 [03:09<03:51, 14.48s/it]


6.  81%|████████▏ | 13/16 [03:31<00:48, 16.24s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A


7. calculate augmented scores:  46%|████▋     | 13/28 [03:27<03:55, 15.70s/it]
7. calculate augmented scores:  50%|█████     | 14/28 [03:37<03:14, 13.88s/it]


6.  88%|████████▊ | 14/16 [03:47<00:32, 16.23s/it][A
4.   6%|▋         | 1/16 [00:22<05:30, 22.06s/it][A
6.  94%|█████████▍| 15/16 [04:03<00:16, 16.23s/it][A


6.  77%|███████▋  | 23/30 [3:45:28<1:07:51, 581.66s/it]
6. {'loss': 0.0016, 'grad_norm': 0.0637824609875679, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  77%|███████▋  | 23/30 [3:47:38<1:07:51, 581.66s/it]
6. {'train_runtime': 260.0914, 'train_samples_per_second': 0.492, 'train_steps_per_second': 0.062, 'train_loss': 0.0033914277446456254, 'epoch': 1.0}


6. 100%|██████████| 16/16 [04:20<00:00, 16.24s/it][A
6. [A
6. 100%|██████████| 16/16 [04:20<00:00, 16.24s/it][A
6. [A
6. 100%|██████████| 16/16 [04:20<00:00, 16.24s/it][A
6. 100%|██████████| 16/16 [04:20<00:00, 16.26s/it]


5.  87%|████████▋ | 26/30 [3:40:32<28:45, 431.45s/it]*** -> Training took 283.6792 seconds.
5.  90%|█████████ | 27/30 [3:42:47<21:26, 428.70s/it]retraining model for key 'd8e07eb2' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 288.49 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 285.90 examples/s]
4.  12%|█▎        | 2/16 [00:44<05:09, 22.08s/it][A
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...
7. calculate augmented scores:  54%|█████▎    | 15/28 [03:43<02:31, 11.62s/it]


4.  19%|█▉        | 3/16 [01:06<04:48, 22.17s/it][A


7. calculate augmented scores:  57%|█████▋    | 16/28 [04:37<04:51, 24.28s/it]
6.  77%|███████▋  | 23/30 [3:47:38<1:07:51, 581.66s/it]*** -> Training took 260.0914 seconds.
6.  80%|████████  | 24/30 [3:48:27<50:07, 501.20s/it]  retraining model for key 'dbff022c' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 2162.29 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  25%|██▌       | 4/16 [01:28<04:27, 22.27s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
6.   6%|▋         | 1/16 [00:07<01:55,  7.69s/it][A


7. calculate augmented scores:  61%|██████    | 17/28 [04:50<03:50, 20.96s/it]
7. calculate augmented scores:  64%|██████▍   | 18/28 [05:12<03:33, 21.34s/it]


6.  12%|█▎        | 2/16 [00:15<01:47,  7.67s/it][A
5.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  31%|███▏      | 5/16 [01:51<04:05, 22.32s/it][A
6.  19%|█▉        | 3/16 [00:22<01:39,  7.66s/it][A


7. calculate augmented scores:  68%|██████▊   | 19/28 [05:16<02:25, 16.12s/it]


6.  25%|██▌       | 4/16 [00:30<01:31,  7.66s/it][A


7. calculate augmented scores:  71%|███████▏  | 20/28 [05:31<02:05, 15.75s/it]


6.  31%|███▏      | 5/16 [00:38<01:24,  7.66s/it][A
4.  38%|███▊      | 6/16 [02:13<03:42, 22.28s/it][A
6.  38%|███▊      | 6/16 [00:45<01:16,  7.66s/it][A
6.  44%|████▍     | 7/16 [00:53<01:08,  7.65s/it][A


6. {'loss': 0.0023, 'grad_norm': 0.4227255880832672, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:01<01:01,  7.66s/it][A
6. [A


7. calculate augmented scores:  75%|███████▌  | 21/28 [05:39<01:33, 13.32s/it]


6.  50%|█████     | 8/16 [01:01<01:01,  7.66s/it][A


7. calculate augmented scores:  79%|███████▊  | 22/28 [05:58<01:31, 15.26s/it]


4.  44%|████▍     | 7/16 [02:35<03:20, 22.26s/it][A


4. {'loss': 0.0117, 'grad_norm': 0.195554718375206, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:57<02:58, 22.26s/it][A
4. [A
6.  56%|█████▋    | 9/16 [01:08<00:53,  7.67s/it][A


7. calculate augmented scores:  82%|████████▏ | 23/28 [06:05<01:03, 12.74s/it]


6.  62%|██████▎   | 10/16 [01:16<00:46,  7.67s/it][A
5.   6%|▋         | 1/16 [01:02<15:43, 62.90s/it][A
6.  69%|██████▉   | 11/16 [01:24<00:38,  7.68s/it][A
4.  50%|█████     | 8/16 [02:57<02:58, 22.26s/it][A
6.  75%|███████▌  | 12/16 [01:32<00:30,  7.68s/it][A


7. calculate augmented scores:  86%|████████▌ | 24/28 [06:12<00:43, 10.94s/it]


6.  81%|████████▏ | 13/16 [01:39<00:23,  7.67s/it][A
6.  88%|████████▊ | 14/16 [01:47<00:15,  7.67s/it][A
4.  56%|█████▋    | 9/16 [03:20<02:35, 22.27s/it][A
6.  94%|█████████▍| 15/16 [01:54<00:07,  7.66s/it][A


6.  80%|████████  | 24/30 [3:49:32<50:07, 501.20s/it]
6. {'loss': 0.0007, 'grad_norm': 0.015289627015590668, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  80%|████████  | 24/30 [3:50:33<50:07, 501.20s/it]
6. {'train_runtime': 122.6108, 'train_samples_per_second': 1.044, 'train_steps_per_second': 0.13, 'train_loss': 0.001503572188084945, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:02<00:00,  7.65s/it][A
6. [A
6. 100%|██████████| 16/16 [02:02<00:00,  7.65s/it][A
6. [A
6. 100%|██████████| 16/16 [02:02<00:00,  7.65s/it][A
6. 100%|██████████| 16/16 [02:02<00:00,  7.66s/it]
4.  62%|██████▎   | 10/16 [03:42<02:13, 22.28s/it][A
5.  12%|█▎        | 2/16 [02:05<14:34, 62.49s/it][A
4.  69%|██████▉   | 11/16 [04:04<01:51, 22.27s/it][A
4.  75%|███████▌  | 12/16 [04:27<01:29, 22.28s/it][A


7. calculate augmented scores:  89%|████████▉ | 25/28 [06:39<00:47, 15.70s/it]


4.  81%|████████▏ | 13/16 [04:49<01:06, 22.26s/it][A


6.  80%|████████  | 24/30 [3:50:33<50:07, 501.20s/it]*** -> Training took 122.6108 seconds.
6.  83%|████████▎ | 25/30 [3:51:58<34:30, 414.12s/it]retraining model for key 'dd6b8c4b' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1865.00 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


5.  19%|█▉        | 3/16 [03:07<13:31, 62.43s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
6.   6%|▋         | 1/16 [00:08<02:09,  8.61s/it][A
4.  88%|████████▊ | 14/16 [05:11<00:44, 22.24s/it][A
6.  12%|█▎        | 2/16 [00:17<02:01,  8.66s/it][A


7. calculate augmented scores:  93%|█████████▎| 26/28 [08:11<01:17, 38.55s/it]


6.  19%|█▉        | 3/16 [00:25<01:52,  8.63s/it][A
4.  94%|█████████▍| 15/16 [05:33<00:22, 22.24s/it][A


4.  87%|████████▋ | 26/30 [3:54:14<31:08, 467.06s/it]
4. {'loss': 0.0052, 'grad_norm': 0.2449154555797577, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  87%|████████▋ | 26/30 [3:57:12<31:08, 467.06s/it]
4. {'train_runtime': 356.0272, 'train_samples_per_second': 0.36, 'train_steps_per_second': 0.045, 'train_loss': 0.008462432073429227, 'epoch': 1.0}


4. 100%|██████████| 16/16 [05:56<00:00, 22.25s/it][A
4. [A
4. 100%|██████████| 16/16 [05:56<00:00, 22.25s/it][A
4. [A
4. 100%|██████████| 16/16 [05:56<00:00, 22.25s/it][A
4. 100%|██████████| 16/16 [05:56<00:00, 22.25s/it]


7. calculate augmented scores:  96%|█████████▋| 27/28 [08:59<00:41, 41.34s/it]
7. calculate augmented scores: 100%|██████████| 28/28 [09:07<00:00, 31.47s/it]
7. calculate augmented scores: 100%|██████████| 28/28 [09:07<00:00, 19.56s/it]
7. *** GPU: NVIDIA L4, used 12.9 / 22.3 GB.


6.  25%|██▌       | 4/16 [00:34<01:43,  8.62s/it][A
6.  31%|███▏      | 5/16 [00:43<01:34,  8.63s/it][A
6.  38%|███▊      | 6/16 [00:51<01:26,  8.65s/it][A
5.  25%|██▌       | 4/16 [04:09<12:28, 62.36s/it][A
6.  44%|████▍     | 7/16 [01:00<01:18,  8.67s/it][A


6. {'loss': 0.0177, 'grad_norm': 0.3905198276042938, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:09<01:09,  8.70s/it][A
6. [A
6.  50%|█████     | 8/16 [01:09<01:09,  8.70s/it][A
6.  56%|█████▋    | 9/16 [01:18<01:01,  8.72s/it][A
6.  62%|██████▎   | 10/16 [01:26<00:52,  8.75s/it][A
6.  69%|██████▉   | 11/16 [01:35<00:43,  8.76s/it][A
6.  75%|███████▌  | 12/16 [01:44<00:35,  8.76s/it][A


4.  87%|████████▋ | 26/30 [3:57:12<31:08, 467.06s/it]*** -> Training took 356.0272 seconds.
4.  90%|█████████ | 27/30 [3:58:31<22:55, 458.62s/it]retraining model for key 'e376de54' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 978.40 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 961.52 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160
6.  81%|████████▏ | 13/16 [01:53<00:26,  8.76s/it][A


4. *** Start training run...


5.  31%|███▏      | 5/16 [05:12<11:26, 62.38s/it][A
6.  88%|████████▊ | 14/16 [02:01<00:17,  8.76s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  94%|█████████▍| 15/16 [02:10<00:08,  8.75s/it][A


6.  83%|████████▎ | 25/30 [3:53:11<34:30, 414.12s/it]
6. {'loss': 0.0031, 'grad_norm': 0.24586121737957, 'learning_rate': 5e-05, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:19<00:00,  8.74s/it][A
6. [A
6. 100%|██████████| 16/16 [02:19<00:00,  8.74s/it][A


6.  83%|████████▎ | 25/30 [3:54:21<34:30, 414.12s/it]
6. {'train_runtime': 139.416, 'train_samples_per_second': 0.918, 'train_steps_per_second': 0.115, 'train_loss': 0.010409485548734665, 'epoch': 1.0}


6. [A
6. 100%|██████████| 16/16 [02:19<00:00,  8.74s/it][A
6. 100%|██████████| 16/16 [02:19<00:00,  8.71s/it]
4.   6%|▋         | 1/16 [00:16<04:13, 16.93s/it][A
4.  12%|█▎        | 2/16 [00:33<03:56, 16.90s/it][A
5.  38%|███▊      | 6/16 [06:14<10:23, 62.34s/it][A
4.  19%|█▉        | 3/16 [00:50<03:39, 16.90s/it][A
4.  25%|██▌       | 4/16 [01:07<03:22, 16.91s/it][A


6.  83%|████████▎ | 25/30 [3:54:21<34:30, 414.12s/it]*** -> Training took 139.416 seconds.
6.  87%|████████▋ | 26/30 [3:55:41<23:46, 356.66s/it]retraining model for key 'e3721c99' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 444.54 examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 439.41 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160
4.  31%|███▏      | 5/16 [01:24<03:05, 16.90s/it][A


6. *** Start training run...


4.  38%|███▊      | 6/16 [01:41<02:49, 16.90s/it][A
5.  44%|████▍     | 7/16 [07:16<09:21, 62.37s/it][A


5. {'loss': 0.0101, 'grad_norm': 0.12756319344043732, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [08:19<08:18, 62.36s/it][A
5. [A
4.  44%|████▍     | 7/16 [01:58<02:32, 16.91s/it][A


4. {'loss': 0.0022, 'grad_norm': 0.15851931273937225, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:15<02:15, 16.91s/it][A
4. [A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
4.  50%|█████     | 8/16 [02:15<02:15, 16.91s/it][A
4.  56%|█████▋    | 9/16 [02:32<01:58, 16.93s/it][A
6.   6%|▋         | 1/16 [00:40<10:08, 40.60s/it][A
4.  62%|██████▎   | 10/16 [02:49<01:41, 16.94s/it][A
5.  50%|█████     | 8/16 [08:19<08:18, 62.36s/it][A
4.  69%|██████▉   | 11/16 [03:06<01:24, 16.95s/it][A
4.  75%|███████▌  | 12/16 [03:23<01:07, 16.95s/it][A
6.  12%|█▎        | 2/16 [01:20<09:24, 40.34s/it][A
4.  81%|████████▏ | 13/16 [03:40<00:50, 16.96s/it][A
5.  56%|█████▋    | 9/16 [09:21<07:16, 62.36s/it][A
4.  88%|████████▊ | 14/16 [03:57<00:33, 16.95s/it][A
6.  19%|█▉        | 3/16 [02:00<08:43, 40.28s/it][A
4.  94%|█████████▍| 15/16 [04:13<00:16, 16.96s/it][A


4.  90%|█████████ | 27/30 [4:00:49<22:55, 458.62s/it]
4. {'loss': 0.0009, 'grad_norm': 0.01447428297251463, 'learning_rate': 5e-05, 'epoch': 1.0}


4. 100%|██████████| 16/16 [04:30<00:00, 16.96s/it][A
4. [A
4. 100%|██████████| 16/16 [04:30<00:00, 16.96s/it][A
4. [A


4.  90%|█████████ | 27/30 [4:03:05<22:55, 458.62s/it]
4. {'train_runtime': 270.96, 'train_samples_per_second': 0.472, 'train_steps_per_second': 0.059, 'train_loss': 0.0015798072854522616, 'epoch': 1.0}


4. 100%|██████████| 16/16 [04:30<00:00, 16.96s/it][A
4. 100%|██████████| 16/16 [04:30<00:00, 16.93s/it]
6.  25%|██▌       | 4/16 [02:41<08:03, 40.28s/it][A
5.  62%|██████▎   | 10/16 [10:23<06:14, 62.36s/it][A
6.  31%|███▏      | 5/16 [03:21<07:22, 40.27s/it][A


4.  90%|█████████ | 27/30 [4:03:05<22:55, 458.62s/it]*** -> Training took 270.96 seconds.
4.  93%|█████████▎| 28/30 [4:04:41<14:24, 432.24s/it]retraining model for key 'e87109e9' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 494.32 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 488.10 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


5.  69%|██████▉   | 11/16 [11:26<05:11, 62.35s/it][A
6.  38%|███▊      | 6/16 [04:01<06:42, 40.28s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  44%|████▍     | 7/16 [04:42<06:02, 40.27s/it][A


6. {'loss': 0.0022, 'grad_norm': 0.012254561297595501, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [05:22<05:22, 40.27s/it][A
6. [A
5.  75%|███████▌  | 12/16 [12:28<04:09, 62.34s/it][A
4.   6%|▋         | 1/16 [00:34<08:37, 34.48s/it][A
6.  50%|█████     | 8/16 [05:22<05:22, 40.27s/it][A
4.  12%|█▎        | 2/16 [01:08<08:03, 34.50s/it][A
5.  81%|████████▏ | 13/16 [13:30<03:07, 62.34s/it][A
6.  56%|█████▋    | 9/16 [06:02<04:41, 40.29s/it][A
4.  19%|█▉        | 3/16 [01:43<07:29, 34.56s/it][A
4.  25%|██▌       | 4/16 [02:18<06:55, 34.62s/it][A
6.  62%|██████▎   | 10/16 [06:42<04:01, 40.28s/it][A
5.  88%|████████▊ | 14/16 [14:33<02:04, 62.32s/it][A
4.  31%|███▏      | 5/16 [02:53<06:21, 34.66s/it][A
6.  69%|██████▉   | 11/16 [07:23<03:21, 40.28s/it][A
4.  38%|███▊      | 6/16 [03:27<05:46, 34.68s/it][A
6.  75%|███████▌  | 12/16 [08:03<02:41, 40.29s/it][A
5.  94%|█████████▍| 15/16 [15:35<01:02, 62.32s/it][A


5.  90%|█████████ | 27/30 [3:51:11<21:26, 428.70s/it]
5. {'loss': 0.0043, 'grad_norm': 0.06269937753677368, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  90%|█████████ | 27/30 [3:59:30<21:26, 428.70s/it]
5. {'train_runtime': 997.7572, 'train_samples_per_second': 0.128, 'train_steps_per_second': 0.016, 'train_loss': 0.007199503248557448, 'epoch': 1.0}


5. 100%|██████████| 16/16 [16:37<00:00, 62.31s/it][A
5. [A
5. 100%|██████████| 16/16 [16:37<00:00, 62.31s/it][A
5. [A
5. 100%|██████████| 16/16 [16:37<00:00, 62.31s/it][A
5. 100%|██████████| 16/16 [16:37<00:00, 62.36s/it]
4.  44%|████▍     | 7/16 [04:02<05:12, 34.72s/it][A


4. {'loss': 0.0087, 'grad_norm': 0.20804952085018158, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [04:37<04:38, 34.76s/it][A
4. [A
6.  81%|████████▏ | 13/16 [08:43<02:00, 40.27s/it][A
4.  50%|█████     | 8/16 [04:37<04:38, 34.76s/it][A
6.  88%|████████▊ | 14/16 [09:23<01:20, 40.27s/it][A
4.  56%|█████▋    | 9/16 [05:12<04:03, 34.82s/it][A
6.  94%|█████████▍| 15/16 [10:04<00:40, 40.26s/it][A


6.  87%|████████▋ | 26/30 [4:01:08<23:46, 356.66s/it]
6. {'loss': 0.0008, 'grad_norm': 0.13306556642055511, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  87%|████████▋ | 26/30 [4:06:30<23:46, 356.66s/it]
6. {'train_runtime': 644.4822, 'train_samples_per_second': 0.199, 'train_steps_per_second': 0.025, 'train_loss': 0.0014586011820938438, 'epoch': 1.0}


6. 100%|██████████| 16/16 [10:44<00:00, 40.26s/it][A
6. [A
6. 100%|██████████| 16/16 [10:44<00:00, 40.26s/it][A
6. [A
6. 100%|██████████| 16/16 [10:44<00:00, 40.26s/it][A
6. 100%|██████████| 16/16 [10:44<00:00, 40.28s/it]
4.  62%|██████▎   | 10/16 [05:47<03:29, 34.87s/it][A
4.  69%|██████▉   | 11/16 [06:22<02:54, 34.89s/it][A
4.  75%|███████▌  | 12/16 [06:57<02:19, 34.95s/it][A
4.  81%|████████▏ | 13/16 [07:32<01:44, 34.94s/it][A
4.  88%|████████▊ | 14/16 [08:07<01:09, 34.94s/it][A


5.  90%|█████████ | 27/30 [3:59:30<21:26, 428.70s/it]*** -> Training took 997.7572 seconds.
5.  93%|█████████▎| 28/30 [4:04:02<22:45, 682.60s/it]retraining model for key 'e12f9a14' (retrain_dataset_size=10)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 650.52 examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 641.70 examples/s]
5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


4.  94%|█████████▍| 15/16 [08:42<00:34, 34.97s/it][A


4.  93%|█████████▎| 28/30 [4:09:23<14:24, 432.24s/it]
4. {'loss': 0.001, 'grad_norm': 0.18142998218536377, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  93%|█████████▎| 28/30 [4:14:03<14:24, 432.24s/it]


4. 100%|██████████| 16/16 [09:17<00:00, 34.98s/it][A
4. [A
4. 100%|██████████| 16/16 [09:17<00:00, 34.98s/it][A
4. [A


4. {'train_runtime': 557.3003, 'train_samples_per_second': 0.23, 'train_steps_per_second': 0.029, 'train_loss': 0.004895133955869824, 'epoch': 1.0}


4. 100%|██████████| 16/16 [09:17<00:00, 34.98s/it][A
4. 100%|██████████| 16/16 [09:17<00:00, 34.83s/it]
5.   0%|          | 0/16 [00:00<?, ?it/s][A
5.   6%|▋         | 1/16 [00:27<06:47, 27.18s/it][A
5.  12%|█▎        | 2/16 [00:54<06:20, 27.16s/it][A
5.  19%|█▉        | 3/16 [01:21<05:52, 27.15s/it][A
5.  25%|██▌       | 4/16 [01:48<05:25, 27.15s/it][A


6.  87%|████████▋ | 26/30 [4:06:30<23:46, 356.66s/it]*** -> Training took 644.4822 seconds.
6.  90%|█████████ | 27/30 [4:11:42<26:53, 537.84s/it]retraining model for key 'e8686506' (retrain_dataset_size=5)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 3030.98 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


6.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  31%|███▏      | 5/16 [02:15<04:58, 27.16s/it][A
6.   6%|▋         | 1/16 [00:05<01:18,  5.23s/it][A
6.  12%|█▎        | 2/16 [00:10<01:12,  5.21s/it][A
6.  19%|█▉        | 3/16 [00:15<01:07,  5.21s/it][A
6.  25%|██▌       | 4/16 [00:20<01:02,  5.22s/it][A
6.  31%|███▏      | 5/16 [00:26<00:57,  5.23s/it][A
5.  38%|███▊      | 6/16 [02:42<04:31, 27.16s/it][A
6.  38%|███▊      | 6/16 [00:31<00:52,  5.23s/it][A
6.  44%|████▍     | 7/16 [00:36<00:47,  5.24s/it][A


6. {'loss': 0.0268, 'grad_norm': 0.6024872660636902, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [00:41<00:41,  5.24s/it][A
6. [A


4.  93%|█████████▎| 28/30 [4:14:03<14:24, 432.24s/it]*** -> Training took 557.3003 seconds.
4.  97%|█████████▋| 29/30 [4:16:58<08:43, 523.62s/it]retraining model for key 'edb79dae' (retrain_dataset_size=5)
4. *** Set model state_dict...


4. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 971.24 examples/s][A
4. Map: 100%|██████████| 128/128 [00:00<00:00, 952.35 examples/s]
4. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
4.    \\   /|    Num examples = 128 | Num Epochs = 1
4. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
4. \        /    Total batch size = 8 | Total steps = 16
4.  "-____-"     Number of trainable parameters = 94,044,160


4. *** Start training run...


6.  50%|█████     | 8/16 [00:41<00:41,  5.24s/it][A
6.  56%|█████▋    | 9/16 [00:47<00:36,  5.24s/it][A
6.  62%|██████▎   | 10/16 [00:52<00:31,  5.25s/it][A
5.  44%|████▍     | 7/16 [03:10<04:04, 27.16s/it][A


5. {'loss': 0.0035, 'grad_norm': 0.3062807023525238, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [03:37<03:37, 27.13s/it][A
5. [A
6.  69%|██████▉   | 11/16 [00:57<00:26,  5.24s/it][A
4.   0%|          | 0/16 [00:00<?, ?it/s][A
6.  75%|███████▌  | 12/16 [01:02<00:20,  5.22s/it][A
6.  81%|████████▏ | 13/16 [01:07<00:15,  5.22s/it][A
6.  88%|████████▊ | 14/16 [01:13<00:10,  5.21s/it][A
4.   6%|▋         | 1/16 [00:17<04:17, 17.19s/it][A
6.  94%|█████████▍| 15/16 [01:18<00:05,  5.22s/it][A


6.  90%|█████████ | 27/30 [4:12:26<26:53, 537.84s/it]
6. {'loss': 0.0093, 'grad_norm': 1.0301649570465088, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  90%|█████████ | 27/30 [4:13:08<26:53, 537.84s/it]
6. {'train_runtime': 83.6065, 'train_samples_per_second': 1.531, 'train_steps_per_second': 0.191, 'train_loss': 0.018059154972434044, 'epoch': 1.0}


6. 100%|██████████| 16/16 [01:23<00:00,  5.21s/it][A
6. [A
6. 100%|██████████| 16/16 [01:23<00:00,  5.21s/it][A
6. [A
6. 100%|██████████| 16/16 [01:23<00:00,  5.21s/it][A
6. 100%|██████████| 16/16 [01:23<00:00,  5.22s/it]
5.  50%|█████     | 8/16 [03:37<03:37, 27.13s/it][A


6.  90%|█████████ | 27/30 [4:13:08<26:53, 537.84s/it]*** -> Training took 83.6065 seconds.
6.  93%|█████████▎| 28/30 [4:13:17<13:30, 405.12s/it]retraining model for key 'f560132c' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1500.18 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


4.  12%|█▎        | 2/16 [00:34<04:00, 17.19s/it][A
6.   0%|          | 0/16 [00:00<?, ?it/s][A
5.  56%|█████▋    | 9/16 [04:04<03:10, 27.15s/it][A
4.  19%|█▉        | 3/16 [00:51<03:43, 17.19s/it][A
6.   6%|▋         | 1/16 [00:10<02:43, 10.87s/it][A
6.  12%|█▎        | 2/16 [00:21<02:32, 10.93s/it][A
4.  25%|██▌       | 4/16 [01:08<03:26, 17.19s/it][A
6.  19%|█▉        | 3/16 [00:32<02:22, 10.97s/it][A
5.  62%|██████▎   | 10/16 [04:31<02:42, 27.15s/it][A
4.  31%|███▏      | 5/16 [01:25<03:09, 17.19s/it][A
6.  25%|██▌       | 4/16 [00:43<02:11, 10.96s/it][A
6.  31%|███▏      | 5/16 [00:54<02:00, 10.96s/it][A
4.  38%|███▊      | 6/16 [01:43<02:51, 17.19s/it][A
5.  69%|██████▉   | 11/16 [04:58<02:15, 27.15s/it][A
6.  38%|███▊      | 6/16 [01:05<01:49, 10.94s/it][A
6.  44%|████▍     | 7/16 [01:16<01:38, 10.92s/it][A


6. {'loss': 0.0474, 'grad_norm': 6.112204551696777, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:27<01:27, 10.90s/it][A
6. [A
4.  44%|████▍     | 7/16 [02:00<02:34, 17.18s/it][A


4. {'loss': 0.008, 'grad_norm': 0.3137953579425812, 'learning_rate': 5e-05, 'epoch': 0.5}


4.  50%|█████     | 8/16 [02:17<02:17, 17.18s/it][A
4. [A
6.  50%|█████     | 8/16 [01:27<01:27, 10.90s/it][A
5.  75%|███████▌  | 12/16 [05:25<01:48, 27.13s/it][A
4.  50%|█████     | 8/16 [02:17<02:17, 17.18s/it][A
6.  56%|█████▋    | 9/16 [01:38<01:16, 10.91s/it][A
6.  62%|██████▎   | 10/16 [01:49<01:05, 10.93s/it][A
4.  56%|█████▋    | 9/16 [02:34<02:00, 17.19s/it][A
5.  81%|████████▏ | 13/16 [05:52<01:21, 27.15s/it][A
6.  69%|██████▉   | 11/16 [02:00<00:54, 10.91s/it][A
4.  62%|██████▎   | 10/16 [02:51<01:43, 17.19s/it][A
6.  75%|███████▌  | 12/16 [02:11<00:43, 10.91s/it][A
6.  81%|████████▏ | 13/16 [02:22<00:32, 10.91s/it][A
5.  88%|████████▊ | 14/16 [06:20<00:54, 27.15s/it][A
4.  69%|██████▉   | 11/16 [03:09<01:25, 17.18s/it][A
6.  88%|████████▊ | 14/16 [02:32<00:21, 10.92s/it][A
4.  75%|███████▌  | 12/16 [03:26<01:08, 17.18s/it][A
6.  94%|█████████▍| 15/16 [02:43<00:10, 10.93s/it][A


6.  93%|█████████▎| 28/30 [4:14:48<13:30, 405.12s/it]
6. {'loss': 0.0103, 'grad_norm': 3.2201054096221924, 'learning_rate': 5e-05, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:54<00:00, 10.91s/it][A
6. [A


6.  93%|█████████▎| 28/30 [4:16:15<13:30, 405.12s/it]
6. {'train_runtime': 174.7948, 'train_samples_per_second': 0.732, 'train_steps_per_second': 0.092, 'train_loss': 0.028875060845166445, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:54<00:00, 10.91s/it][A
6. [A
6. 100%|██████████| 16/16 [02:54<00:00, 10.91s/it][A
6. 100%|██████████| 16/16 [02:54<00:00, 10.92s/it]
5.  94%|█████████▍| 15/16 [06:47<00:27, 27.14s/it][A


5.  93%|█████████▎| 28/30 [4:07:44<22:45, 682.60s/it]
5. {'loss': 0.0008, 'grad_norm': 0.27955859899520874, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  93%|█████████▎| 28/30 [4:11:21<22:45, 682.60s/it]
5. {'train_runtime': 434.2978, 'train_samples_per_second': 0.295, 'train_steps_per_second': 0.037, 'train_loss': 0.002169808925827965, 'epoch': 1.0}


5. 100%|██████████| 16/16 [07:14<00:00, 27.12s/it][A
5. [A
5. 100%|██████████| 16/16 [07:14<00:00, 27.12s/it][A
5. [A
5. 100%|██████████| 16/16 [07:14<00:00, 27.12s/it][A
5. 100%|██████████| 16/16 [07:14<00:00, 27.14s/it]
4.  81%|████████▏ | 13/16 [03:43<00:51, 17.19s/it][A
4.  88%|████████▊ | 14/16 [04:00<00:34, 17.19s/it][A
4.  94%|█████████▍| 15/16 [04:17<00:17, 17.19s/it][A


4.  97%|█████████▋| 29/30 [4:19:19<08:43, 523.62s/it]
4. {'loss': 0.0008, 'grad_norm': 0.07075163722038269, 'learning_rate': 5e-05, 'epoch': 1.0}
4.  97%|█████████▋| 29/30 [4:21:37<08:43, 523.62s/it]
4. {'train_runtime': 275.0147, 'train_samples_per_second': 0.465, 'train_steps_per_second': 0.058, 'train_loss': 0.0043894307455047965, 'epoch': 1.0}


4. 100%|██████████| 16/16 [04:35<00:00, 17.19s/it][A
4. [A
4. 100%|██████████| 16/16 [04:35<00:00, 17.19s/it][A
4. [A
4. 100%|██████████| 16/16 [04:35<00:00, 17.19s/it][A
4. 100%|██████████| 16/16 [04:35<00:00, 17.19s/it]


6.  93%|█████████▎| 28/30 [4:16:15<13:30, 405.12s/it]*** -> Training took 174.7948 seconds.
6.  97%|█████████▋| 29/30 [4:17:10<05:53, 353.61s/it]retraining model for key 'faa9f03d' (retrain_dataset_size=10)
6. *** Set model state_dict...


6. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
6. Map: 100%|██████████| 128/128 [00:00<00:00, 1614.10 examples/s]
6. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
6.    \\   /|    Num examples = 128 | Num Epochs = 1
6. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
6. \        /    Total batch size = 8 | Total steps = 16
6.  "-____-"     Number of trainable parameters = 94,044,160


6. *** Start training run...


6.   0%|          | 0/16 [00:00<?, ?it/s][A
6.   6%|▋         | 1/16 [00:10<02:36, 10.44s/it][A
6.  12%|█▎        | 2/16 [00:20<02:25, 10.42s/it][A


4.  97%|█████████▋| 29/30 [4:21:37<08:43, 523.62s/it]*** -> Training took 275.0147 seconds.
4. 100%|██████████| 30/30 [4:22:21<00:00, 463.35s/it]
4. 100%|██████████| 30/30 [4:22:21<00:00, 524.72s/it]
4. *** Completed inference run.


6.  19%|█▉        | 3/16 [00:31<02:15, 10.41s/it][A


4. calculate augmented scores:   0%|          | 0/23 [00:00<?, ?it/s]


6.  25%|██▌       | 4/16 [00:41<02:04, 10.39s/it][A


4. calculate augmented scores:   4%|▍         | 1/23 [00:05<01:59,  5.43s/it]


6.  31%|███▏      | 5/16 [00:51<01:54, 10.38s/it][A


4. calculate augmented scores:   9%|▊         | 2/23 [00:20<03:51, 11.05s/it]


6.  38%|███▊      | 6/16 [01:02<01:43, 10.38s/it][A
6.  44%|████▍     | 7/16 [01:12<01:33, 10.37s/it][A


6. {'loss': 0.0018, 'grad_norm': 0.10638858377933502, 'learning_rate': 5e-05, 'epoch': 0.5}


6.  50%|█████     | 8/16 [01:23<01:22, 10.37s/it][A
6. [A


4. calculate augmented scores:  13%|█▎        | 3/23 [00:30<03:36, 10.82s/it]


6.  50%|█████     | 8/16 [01:23<01:22, 10.37s/it][A


4. calculate augmented scores:  17%|█▋        | 4/23 [00:52<04:43, 14.91s/it]


6.  56%|█████▋    | 9/16 [01:33<01:12, 10.36s/it][A


4. calculate augmented scores:  22%|██▏       | 5/23 [01:05<04:18, 14.37s/it]


6.  62%|██████▎   | 10/16 [01:43<01:02, 10.36s/it][A


4. calculate augmented scores:  26%|██▌       | 6/23 [01:09<03:05, 10.91s/it]


6.  69%|██████▉   | 11/16 [01:54<00:51, 10.36s/it][A
6.  75%|███████▌  | 12/16 [02:04<00:41, 10.36s/it][A


4. calculate augmented scores:  30%|███       | 7/23 [01:25<03:18, 12.43s/it]


6.  81%|████████▏ | 13/16 [02:14<00:31, 10.37s/it][A


4. calculate augmented scores:  35%|███▍      | 8/23 [01:41<03:25, 13.68s/it]


6.  88%|████████▊ | 14/16 [02:25<00:20, 10.36s/it][A


4. calculate augmented scores:  39%|███▉      | 9/23 [01:51<02:54, 12.43s/it]


6.  94%|█████████▍| 15/16 [02:35<00:10, 10.37s/it][A


6.  97%|█████████▋| 29/30 [4:18:37<05:53, 353.61s/it]
6. {'loss': 0.0012, 'grad_norm': 0.48776867985725403, 'learning_rate': 5e-05, 'epoch': 1.0}
6.  97%|█████████▋| 29/30 [4:20:00<05:53, 353.61s/it]
6. {'train_runtime': 165.9731, 'train_samples_per_second': 0.771, 'train_steps_per_second': 0.096, 'train_loss': 0.0015015621320344508, 'epoch': 1.0}


6. 100%|██████████| 16/16 [02:45<00:00, 10.36s/it][A
6. [A
6. 100%|██████████| 16/16 [02:45<00:00, 10.36s/it][A
6. [A
6. 100%|██████████| 16/16 [02:45<00:00, 10.36s/it][A
6. 100%|██████████| 16/16 [02:45<00:00, 10.37s/it]


4. calculate augmented scores:  43%|████▎     | 10/23 [02:06<02:53, 13.38s/it]
5.  93%|█████████▎| 28/30 [4:11:21<22:45, 682.60s/it]*** -> Training took 434.2978 seconds.
5.  97%|█████████▋| 29/30 [4:15:33<11:25, 685.18s/it]retraining model for key 'eee78d87' (retrain_dataset_size=5)
5. *** Set model state_dict...


5. Map:   0%|          | 0/128 [00:00<?, ? examples/s][A
5. Map: 100%|██████████| 128/128 [00:00<00:00, 1600.97 examples/s]


4. calculate augmented scores:  48%|████▊     | 11/23 [02:10<02:05, 10.47s/it]


5. ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
5.    \\   /|    Num examples = 128 | Num Epochs = 1
5. O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
5. \        /    Total batch size = 8 | Total steps = 16
5.  "-____-"     Number of trainable parameters = 94,044,160


5. *** Start training run...


5.   0%|          | 0/16 [00:00<?, ?it/s][A


4. calculate augmented scores:  52%|█████▏    | 12/23 [02:44<03:13, 17.63s/it]
4. calculate augmented scores:  57%|█████▋    | 13/23 [02:59<02:47, 16.71s/it]


5.   6%|▋         | 1/16 [00:10<02:37, 10.50s/it][A
5.  12%|█▎        | 2/16 [00:20<02:26, 10.47s/it][A
5.  19%|█▉        | 3/16 [00:31<02:16, 10.46s/it][A
5.  25%|██▌       | 4/16 [00:41<02:05, 10.45s/it][A
5.  31%|███▏      | 5/16 [00:52<01:54, 10.45s/it][A


4. calculate augmented scores:  61%|██████    | 14/23 [03:07<02:05, 13.99s/it]


5.  38%|███▊      | 6/16 [01:02<01:44, 10.44s/it][A


4. calculate augmented scores:  65%|██████▌   | 15/23 [03:51<03:04, 23.05s/it]


5.  44%|████▍     | 7/16 [01:13<01:33, 10.44s/it][A


5. {'loss': 0.0147, 'grad_norm': 0.06800442934036255, 'learning_rate': 5e-05, 'epoch': 0.5}


5.  50%|█████     | 8/16 [01:23<01:23, 10.45s/it][A
5. [A
5.  50%|█████     | 8/16 [01:23<01:23, 10.45s/it][A
5.  56%|█████▋    | 9/16 [01:34<01:13, 10.44s/it][A


4. calculate augmented scores:  70%|██████▉   | 16/23 [04:07<02:27, 21.02s/it]


5.  62%|██████▎   | 10/16 [01:44<01:02, 10.45s/it][A
5.  69%|██████▉   | 11/16 [01:54<00:52, 10.44s/it][A


4. calculate augmented scores:  74%|███████▍  | 17/23 [04:34<02:16, 22.77s/it]


5.  75%|███████▌  | 12/16 [02:05<00:41, 10.44s/it][A
5.  81%|████████▏ | 13/16 [02:15<00:31, 10.44s/it][A


4. calculate augmented scores:  78%|███████▊  | 18/23 [04:59<01:56, 23.40s/it]


5.  88%|████████▊ | 14/16 [02:26<00:20, 10.43s/it][A


4. calculate augmented scores:  83%|████████▎ | 19/23 [05:16<01:27, 21.75s/it]


5.  94%|█████████▍| 15/16 [02:36<00:10, 10.44s/it][A


5.  97%|█████████▋| 29/30 [4:17:00<11:25, 685.18s/it]
5. {'loss': 0.0057, 'grad_norm': 0.06220773607492447, 'learning_rate': 5e-05, 'epoch': 1.0}
5.  97%|█████████▋| 29/30 [4:18:24<11:25, 685.18s/it]
5. {'train_runtime': 167.1316, 'train_samples_per_second': 0.766, 'train_steps_per_second': 0.096, 'train_loss': 0.010196179384365678, 'epoch': 1.0}


5. 100%|██████████| 16/16 [02:47<00:00, 10.44s/it][A
5. [A
5. 100%|██████████| 16/16 [02:47<00:00, 10.44s/it][A
5. [A
5. 100%|██████████| 16/16 [02:47<00:00, 10.44s/it][A
5. 100%|██████████| 16/16 [02:47<00:00, 10.45s/it]


4. calculate augmented scores:  87%|████████▋ | 20/23 [05:28<00:55, 18.65s/it]
4. calculate augmented scores:  91%|█████████▏| 21/23 [05:39<00:32, 16.48s/it]
6.  97%|█████████▋| 29/30 [4:20:00<05:53, 353.61s/it]*** -> Training took 165.9731 seconds.
6. 100%|██████████| 30/30 [4:23:45<00:00, 365.75s/it]
6. 100%|██████████| 30/30 [4:23:45<00:00, 527.50s/it]
6. *** Completed inference run.
4. calculate augmented scores:  96%|█████████▌| 22/23 [05:49<00:14, 14.46s/it]
4. calculate augmented scores: 100%|██████████| 23/23 [05:56<00:00, 12.23s/it]
4. calculate augmented scores: 100%|██████████| 23/23 [05:56<00:00, 15.50s/it]
4. *** GPU: NVIDIA L4, used 13.6 / 22.3 GB.
6. calculate augmented scores:   0%|          | 0/30 [00:00<?, ?it/s]
6. calculate augmented scores:   3%|▎         | 1/30 [00:29<14:02, 29.07s/it]
6. calculate augmented scores:   7%|▋         | 2/30 [00:34<07:07, 15.27s/it]
6. calculate augmented scores:  10%|█         | 3/30 [00:44<05:40, 12.62s/it]
6. calculate augmented sc

In [19]:
# write submission
# Rebuild the decoder from the stored inference results, pick the final guesses
# with the configured selection algorithm and dump them to 'submission.json'.
from common_stuff import *
with RemapCudaOOM():
    # No model is loaded in this cell; it is passed through as None below.
    model = None
    formatter = MyFormatter()
    dataset = None

    decoder = Decoder(
        formatter,
        arc_test_set.split_multi_replies(),
        n_guesses=2,
        frac_score=True,
    ).from_store(infer_params['store'])

    # Augmented scores are computed on request, and always in fake test mode.
    if use_aug_score or arc_test_set.is_fake:
        decoder.calc_augmented_scores(model=model, store=score_temp_storage, **aug_score_params)

    submission = arc_test_set.get_submission(
        decoder.run_selection_algo(submission_select_algo)
    )
    with open('submission.json', 'w') as f:
        json.dump(submission, f)

    if arc_test_set.is_fake:
        decoder.benchmark_selection_algos(selection_algorithms)
        # Read the file back so the score is validated against what was actually written.
        with open('submission.json') as f:
            reload_submission = json.load(f)
        print('*** Reload score:', arc_test_set.validate_submission(reload_submission))

calculate augmented scores: 100%|██████████| 107/107 [00:03<00:00, 33.50it/s]
*** Generating submission for 107 outputs...


In [20]:
# Visualization for inference results from submission.json
# Runs only in fake test mode: plots every prediction stored in submission.json
# next to its task and prints accuracy statistics against arc_test_set.replies.
if arc_test_set.is_fake:
    from common_stuff import *
    import matplotlib.pyplot as plt
    from matplotlib import colors
    import json
    import os
    import numpy as np

    # ARC color map: one fixed color per cell value 0..9
    cmap = colors.ListedColormap(
        ['#000000', '#0074D9', '#FF4136', '#2ECC40', '#FFDC00',
         '#AAAAAA', '#F012BE', '#FF851B', '#7FDBFF', '#870C25'])
    norm = colors.Normalize(vmin=0, vmax=9)

    def is_non_trivial_prediction(pred_array):
        """Return a truthy value if the prediction contains any value > 0."""
        return np.any(np.array(pred_array) > 0)

    def _match_attempts(pred_1, pred_2, ground_truth):
        """Return (match_1, match_2) as plain Python bools.

        An attempt matches only if it equals the ground truth exactly AND is
        not all zeros; all-zero attempts are never counted as correct.
        Plain bools are returned on purpose so that callers can safely add
        them up (adding numpy bools would not yield 0/1/2).
        """
        match_1 = bool(np.array_equal(pred_1, ground_truth)) if is_non_trivial_prediction(pred_1) else False
        match_2 = bool(np.array_equal(pred_2, ground_truth)) if is_non_trivial_prediction(pred_2) else False
        return match_1, match_2

    def _lookup_ground_truth(task_id, test_idx):
        """Return the reply stored for (task_id, test_idx) in arc_test_set.replies, or None."""
        if task_id in arc_test_set.replies and test_idx < len(arc_test_set.replies[task_id]):
            return arc_test_set.replies[task_id][test_idx]
        return None

    def _draw_grid(fig, cell, grid, title):
        """Render one grid into the given GridSpec cell using the shared ARC color map."""
        ax = fig.add_subplot(cell)
        ax.imshow(grid, cmap=cmap, norm=norm)
        ax.grid(True, which='both', color='lightgrey', linewidth=0.5)
        ax.set_title(title)
        ax.set_xticks([])
        ax.set_yticks([])

    def visualize_submission_result(task_id, task_data, submission_output, test_idx, ground_truth=None):
        """Plot one test example with both predicted attempts and print its results.

        Args:
            task_id: Task key, used for titles and messages only.
            task_data: Query dict; its 'train' and 'test' lists are read here.
            submission_output: Dict with the keys 'attempt_1' and 'attempt_2'.
            test_idx: Index of the test example within the task.
            ground_truth: Optional expected output grid. An 'output' entry
                embedded in the test example takes precedence over it.

        Returns:
            False if both attempts are all zeros (nothing is plotted),
            True otherwise.
        """
        pred_1 = np.array(submission_output['attempt_1'])
        pred_2 = np.array(submission_output['attempt_2'])

        # Skip visualization if both predictions are just zeros
        if not is_non_trivial_prediction(pred_1) and not is_non_trivial_prediction(pred_2):
            print(f"  Skipping visualization for Task {task_id} - Test #{test_idx+1} (all predictions are zeros)")
            return False

        # Resolve the test example once; the submission may hold more entries
        # than the query, in which case there is no test input to show.
        test_cases = task_data['test']
        test_case = test_cases[test_idx] if test_idx < len(test_cases) else None
        if test_case is not None and 'output' in test_case:
            ground_truth = test_case['output']
        if ground_truth is not None:
            ground_truth = np.array(ground_truth)

        # Layout: column 0 = first training pair, column 1 = test input and
        # ground truth, column 2 = the two attempts.
        fig = plt.figure(figsize=(15, 8))
        grid_spec = plt.GridSpec(2, 3, width_ratios=[1, 1, 1])

        # Training examples (first one only for simplicity)
        if task_data['train']:
            _draw_grid(fig, grid_spec[0, 0], task_data['train'][0]['input'], "Training Input")
            _draw_grid(fig, grid_spec[1, 0], task_data['train'][0]['output'], "Training Output")

        if test_case is not None:
            _draw_grid(fig, grid_spec[0, 1], test_case['input'], f"Test Input (Test #{test_idx+1})")
        if ground_truth is not None:
            _draw_grid(fig, grid_spec[1, 1], ground_truth, "Ground Truth")

        _draw_grid(fig, grid_spec[0, 2], pred_1, "Model Prediction (Attempt 1)")
        _draw_grid(fig, grid_spec[1, 2], pred_2, "Model Prediction (Attempt 2)")

        plt.suptitle(f"Task {task_id} - Test Example #{test_idx+1}", fontsize=16)
        plt.tight_layout()
        plt.subplots_adjust(top=0.9)
        plt.show()
        # Release the figure so that plotting many tasks does not accumulate open figures.
        plt.close(fig)

        # Report accuracy if ground truth is available
        if ground_truth is not None:
            match_1, match_2 = _match_attempts(pred_1, pred_2, ground_truth)
            results = [
                f"Attempt 1: {'✓' if match_1 else '✗'}{' (zeros)' if not is_non_trivial_prediction(pred_1) else ''}",
                f"Attempt 2: {'✓' if match_2 else '✗'}{' (zeros)' if not is_non_trivial_prediction(pred_2) else ''}",
            ]
            print(f"  Results: {', '.join(results)}")

            # Display task statistics
            print(f"  Shape - Ground Truth: {ground_truth.shape}, Prediction 1: {pred_1.shape}, Prediction 2: {pred_2.shape}")
            print(f"  Values - Ground Truth unique values: {np.unique(ground_truth)}")
            print(f"          Prediction 1 unique values: {np.unique(pred_1)}")
            print(f"          Prediction 2 unique values: {np.unique(pred_2)}")
        print()
        return True

    print("\n" + "="*80)
    print("VISUALIZING RESULTS FROM SUBMISSION.JSON")
    print("="*80)

    # Check if submission file exists
    submission_path = 'submission.json'
    if not os.path.exists(submission_path):
        print(f"Submission file not found at {submission_path}")
    else:
        print(f"Found submission file: {submission_path}")

        # Load submission data
        with open(submission_path, 'r') as f:
            submission_data = json.load(f)

        print(f"Loaded submission with {len(submission_data)} tasks")

        # Process ALL results from submission (no limit)
        visualized_count = 0
        skipped_count = 0

        # Collect all task/test combinations as
        # (task_id, test_idx, score, has_ground_truth, has_non_zero_pred)
        all_predictions = []
        for task_id in submission_data:
            if task_id not in arc_test_set.queries:
                continue
            for test_idx, test_prediction in enumerate(submission_data[task_id]):
                truth = _lookup_ground_truth(task_id, test_idx)
                has_ground_truth = truth is not None

                pred_1 = np.array(test_prediction['attempt_1'])
                pred_2 = np.array(test_prediction['attempt_2'])
                has_non_zero_pred = bool(is_non_trivial_prediction(pred_1) or is_non_trivial_prediction(pred_2))

                # Score = number of correct attempts (0..2), only if it can be judged
                score = 0
                if has_ground_truth and has_non_zero_pred:
                    match_1, match_2 = _match_attempts(pred_1, pred_2, np.array(truth))
                    score = int(match_1) + int(match_2)

                all_predictions.append((task_id, test_idx, score, has_ground_truth, has_non_zero_pred))

        # Sort by whether they have ground truth first, then by score (both descending)
        all_predictions.sort(key=lambda x: (-int(x[3]), -x[2]))

        # Print summary before visualization
        print(f"\nFound {len(all_predictions)} total predictions to visualize")

        # Visualize all tasks
        for task_id, test_idx, score, has_ground_truth, has_non_zero_pred in all_predictions:
            task_data = arc_test_set.queries[task_id]
            submission_output = submission_data[task_id][test_idx]

            if not has_ground_truth:
                score_info = " (no ground truth)"
            elif has_non_zero_pred:
                score_info = f" (Score: {score}/2)"
            else:
                score_info = " (all zeros - no score)"
            print(f"\nTask: {task_id} - Test #{test_idx+1}{score_info}")

            # Only increment visualized_count if actually visualized
            shown = visualize_submission_result(
                task_id, task_data, submission_output, test_idx,
                ground_truth=_lookup_ground_truth(task_id, test_idx),
            )
            if shown:
                visualized_count += 1
            else:
                skipped_count += 1

        print(f"\nVisualized {visualized_count} inference results (skipped {skipped_count} with all-zero predictions)")

        # Calculate overall accuracy statistics over every test that has a reply
        total_tests = 0
        total_scored_tests = 0
        correct_attempt1 = 0
        correct_attempt2 = 0
        correct_any = 0
        zero_predictions = 0

        for task_id, test_predictions in submission_data.items():
            for test_idx, test_prediction in enumerate(test_predictions):
                truth = _lookup_ground_truth(task_id, test_idx)
                if truth is None:
                    continue
                total_tests += 1

                pred_1 = np.array(test_prediction['attempt_1'])
                pred_2 = np.array(test_prediction['attempt_2'])

                # Tests where both predictions are all zeros are excluded from accuracy
                if not is_non_trivial_prediction(pred_1) and not is_non_trivial_prediction(pred_2):
                    zero_predictions += 1
                    continue

                # Only count tests with at least one non-zero prediction
                total_scored_tests += 1

                match_1, match_2 = _match_attempts(pred_1, pred_2, np.array(truth))
                correct_attempt1 += int(match_1)
                correct_attempt2 += int(match_2)
                correct_any += int(match_1 or match_2)

        if total_tests > 0:
            print("\n" + "="*80)
            print("OVERALL ACCURACY STATISTICS")
            print("="*80)
            print(f"Total test examples: {total_tests}")
            print(f"Test examples with zero predictions (excluded from accuracy): {zero_predictions}")
            print(f"Test examples included in accuracy calculation: {total_scored_tests}")

            if total_scored_tests > 0:
                print(f"Correct on attempt 1: {correct_attempt1}/{total_scored_tests} ({correct_attempt1/total_scored_tests:.2%})")
                print(f"Correct on attempt 2: {correct_attempt2}/{total_scored_tests} ({correct_attempt2/total_scored_tests:.2%})")
                print(f"Correct on either attempt: {correct_any}/{total_scored_tests} ({correct_any/total_scored_tests:.2%})")
            else:
                print("No non-zero predictions to calculate accuracy")

            print(f"Overall completion rate: {total_scored_tests/total_tests:.2%} of tests have non-zero predictions")
            print("="*80)
else:
    print("Skipping inference visualization - not in fake test mode")

Skipping inference visualization - not in fake test mode
