In [8]:
import os 
import shutil
import random

import torch
from transformers import AutoModelForObjectDetection, TrainingArguments, Trainer, YolosImageProcessor
from datasets import load_dataset
from PIL import Image, ImageDraw, ImageFont
from datasets import Dataset
import numpy as np
import gc
from sklearn.metrics import precision_recall_fscore_support
from torchmetrics.detection.mean_ap import MeanAveragePrecision


import configparser
import torch
import gc

import matplotlib.pyplot as plt


In [9]:
# config = configparser.ConfigParser()
# config.read("config.ini")
# AUTH_TOKEN = config["auth"]["token"]

BASE_DIR = os.getcwd()
IMAGE_PROCESSOR_GLOBAL = YolosImageProcessor()


In [None]:
def clear_directory(directory):
    """
    Löscht den gesamten Inhalt eines Verzeichnisses, ohne das Verzeichnis selbst zu entfernen.

    Args:
        directory (str): Pfad zum Verzeichnis.
    """
    if os.path.exists(directory):
        for item in os.listdir(directory):
            item_path = os.path.join(directory, item)
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.unlink(item_path)  
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)  


def split_dataset(images_dir, labels_dir, output_dir, test_ratio=0.2, seed=42):
    """
    Teilt einen Datensatz (Bilder + Labels) in Training, Validierung und Test auf.

    Args:
        images_dir (str): Pfad zum Ordner mit den Bildern.
        labels_dir (str): Pfad zum Ordner mit den Labels.
        output_dir (str): Pfad zum Ordner, in dem die aufgeteilten Daten gespeichert werden sollen.
        val_ratio (float): Verhältnis der Validierungsdaten (zwischen 0 und 1). Standard: 0.1.
        test_ratio (float): Verhältnis der Testdaten (zwischen 0 und 1). Standard: 0.1.
        seed (int): Zufallssaat für Reproduzierbarkeit. Standard: 42.

    Returns:
        None
    """


    # Unterordner-Pfade
    train_images_dir = os.path.join(output_dir, "train/images")
    train_labels_dir = os.path.join(output_dir, "train/labels")
    test_images_dir = os.path.join(output_dir, "test/images")
    test_labels_dir = os.path.join(output_dir, "test/labels")
    
    # Ordner neu erstellen
    os.makedirs(train_images_dir, exist_ok=True)
    os.makedirs(train_labels_dir, exist_ok=True)
    os.makedirs(test_images_dir, exist_ok=True)
    os.makedirs(test_labels_dir, exist_ok=True)

    for subdir in [train_images_dir, train_labels_dir, test_images_dir, test_labels_dir]:
        clear_directory(subdir)
        os.makedirs(subdir, exist_ok=True)

    # Liste aller Bilder
    image_files = [f for f in os.listdir(images_dir) if f.endswith(('.jpg', '.png'))]
    
    # Shuffle und Split
    random.seed(seed)
    random.shuffle(image_files)
    
    test_split = int(len(image_files) * test_ratio)
    
    test_files = image_files[:test_split]
    train_files = image_files[test_split:]

    # Dateien kopieren
    def copy_files(file_list, dest_images_dir, dest_labels_dir):
        for file in file_list:
            shutil.copy(os.path.join(images_dir, file), os.path.join(dest_images_dir, file))
            label_file = file.rsplit('.', 1)[0] + '.txt'
            if os.path.exists(os.path.join(labels_dir, label_file)):
                shutil.copy(os.path.join(labels_dir, label_file), os.path.join(dest_labels_dir, label_file))

    copy_files(train_files, train_images_dir, train_labels_dir)
    copy_files(test_files, test_images_dir, test_labels_dir)

    print(f"Train/Validation/Test-Split abgeschlossen! Daten in '{output_dir}' gespeichert.")
    print(f"Train: {len(train_files)} | Test: {len(test_files)}")

# Beispielaufruf
yolo_images_dir = os.path.join(BASE_DIR, "Data", "Kugellager_Data", "YOLO_data", "yolo_images_dump")
yolo_labels_dir = os.path.join(BASE_DIR, "Data", "Kugellager_Data", "YOLO_data", "yolo_labels_dump")
yolo_output_dir = os.path.join(BASE_DIR, "Data", "Kugellager_Data", "YOLO_data")

split_dataset(images_dir=yolo_images_dir,
              labels_dir=yolo_labels_dir,
              output_dir=yolo_output_dir,
              test_ratio=0.2,
              seed=42)


In [4]:
# split_dataset(images_dir=os.path.join(BASE_DIR, "Data", "Oberflächen_Data", "YOLO_data", "test", "images"),
#               labels_dir=os.path.join(BASE_DIR, "Data", "Oberflächen_Data", "YOLO_data", "test", "labels"),
#               output_dir="halbe_data_oberfläche",
#               test_ratio=0.5)

In [5]:
def convert_yolo_to_custom_format(images_dir, labels_dir, categories):
    """
    Converts YOLO annotations to a custom dataset format similar to CPPE-5.

    Args:
        images_dir (str): Path to the images directory.
        labels_dir (str): Path to the YOLO labels directory.
        categories (list): List of category names.

    Returns:
        list: A dataset where each entry contains image metadata and associated objects.
    """
    dataset = []
    annotation_id = 0
    image_id = 0

    for image_file in sorted(os.listdir(images_dir)):
        if not image_file.endswith(('.jpg', '.png', '.jpeg')):
            continue

        image_path = os.path.join(images_dir, image_file)

        # Open the image as a PIL image object
        with Image.open(image_path) as img:
            width, height = img.size

            # Prepare the image entry
            image_entry = {
                'image_id': image_id,
                'image': img.copy(),  # Keep a reference to the PIL image
                'width': width,
                'height': height,
                'objects': {
                    'id': [],
                    'area': [],
                    'bbox': [],
                    'category': []
                }
            }

            # Corresponding label file in YOLO format
            label_file = os.path.join(labels_dir, image_file.rsplit('.', 1)[0] + '.txt')
            if os.path.exists(label_file):
                with open(label_file, 'r') as f:
                    for line in f:
                        parts = line.strip().split()
                        category_id = int(parts[0])
                        x_center, y_center, box_width, box_height = map(float, parts[1:])

                        # Convert YOLO to bounding box coordinates
                        x_min = (x_center - box_width / 2) * width
                        y_min = (y_center - box_height / 2) * height
                        bbox_width = box_width * width
                        bbox_height = box_height * height
                        area = bbox_width * bbox_height

                        # Append object data
                        image_entry['objects']['id'].append(annotation_id)
                        image_entry['objects']['area'].append(int(area))
                        image_entry['objects']['bbox'].append([
                            round(x_min, 1),
                            round(y_min, 1),
                            round(bbox_width, 1),
                            round(bbox_height, 1)
                        ])
                        image_entry['objects']['category'].append(category_id)

                        annotation_id += 1

            dataset.append(image_entry)
            image_id += 1

    return dataset


categories_kugellager = [
    {"id": 0, "name": "defect"},
]
images_dir_kugellager_train = os.path.join(BASE_DIR, "Data/Kugellager_Data/YOLO_Data/train/images")
labels_dir_kugellager_train = os.path.join(BASE_DIR, "Data/Kugellager_Data/YOLO_Data/train/labels")

dataset_kugellager_train = Dataset.from_list(convert_yolo_to_custom_format(images_dir_kugellager_train, labels_dir_kugellager_train, categories_kugellager))

images_dir_kugellager_test = os.path.join(BASE_DIR, "Data/Kugellager_Data/YOLO_Data/test/images")
labels_dir_kugellager_test = os.path.join(BASE_DIR, "Data/Kugellager_Data/YOLO_Data/test/labels")

dataset_kugellager_test = Dataset.from_list(convert_yolo_to_custom_format(images_dir_kugellager_test, labels_dir_kugellager_test, categories_kugellager))



images_dir_kugellager_train_halb = os.path.join(BASE_DIR, "Data/Kugellager_Data/YOLO_Data/train_halb/images")
labels_dir_kugellager_train_halb = os.path.join(BASE_DIR, "Data/Kugellager_Data/YOLO_Data/train_halb/labels")

dataset_kugellager_train_halb = Dataset.from_list(convert_yolo_to_custom_format(images_dir_kugellager_train_halb, labels_dir_kugellager_train_halb, categories_kugellager))

images_dir_kugellager_test_halb = os.path.join(BASE_DIR, "Data/Kugellager_Data/YOLO_Data/test_halb/images")
labels_dir_kugellager_test_halb = os.path.join(BASE_DIR, "Data/Kugellager_Data/YOLO_Data/test_halb/labels")

dataset_kugellager_test_halb = Dataset.from_list(convert_yolo_to_custom_format(images_dir_kugellager_test_halb, labels_dir_kugellager_test_halb, categories_kugellager))



categories_oberfläche = [
    {"id": 0, "name": "crazing"},
    {"id": 1, "name": "inclusion"},
    {"id": 2, "name": "patches"},
    {"id": 3, "name": "pitted surface"},
    {"id": 4, "name": "rolled in scale"},
    {"id": 5, "name": "scratches"}
]

images_dir_oberfläche_train = os.path.join(BASE_DIR, "Data/Oberflächen_Data/YOLO_Data/train/images")
labels_dir_oberfläche_train = os.path.join(BASE_DIR, "Data/Oberflächen_Data/YOLO_Data/train/labels")

dataset_oberfläche_train = Dataset.from_list(convert_yolo_to_custom_format(images_dir_oberfläche_train, labels_dir_oberfläche_train, categories_oberfläche))

images_dir_oberfläche_test = os.path.join(BASE_DIR, "Data/Oberflächen_Data/YOLO_Data/test/images")
labels_dir_oberfläche_test = os.path.join(BASE_DIR, "Data/Oberflächen_Data/YOLO_Data/test/labels")

dataset_oberfläche_test = Dataset.from_list(convert_yolo_to_custom_format(images_dir_oberfläche_test, labels_dir_oberfläche_test, categories_oberfläche))



images_dir_oberfläche_train_halb = os.path.join(BASE_DIR, "Data/Oberflächen_Data/YOLO_Data/train_halb/images")
labels_dir_oberfläche_train_halb = os.path.join(BASE_DIR, "Data/Oberflächen_Data/YOLO_Data/train_halb/labels")

dataset_oberfläche_train_halb = Dataset.from_list(convert_yolo_to_custom_format(images_dir_oberfläche_train_halb, labels_dir_oberfläche_train_halb, categories_oberfläche))

images_dir_oberfläche_test_halb = os.path.join(BASE_DIR, "Data/Oberflächen_Data/YOLO_Data/test_halb/images")
labels_dir_oberfläche_test_halb = os.path.join(BASE_DIR, "Data/Oberflächen_Data/YOLO_Data/test_halb/labels")

dataset_oberfläche_test_halb = Dataset.from_list(convert_yolo_to_custom_format(images_dir_oberfläche_test_halb, labels_dir_oberfläche_test_halb, categories_oberfläche))


In [6]:
print("CUDA available:", torch.cuda.is_available())
print("CUDA device name:", torch.cuda.get_device_name(0))
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
total_memory = torch.cuda.get_device_properties(0).total_memory

# Belegter Speicher (in Bytes)
allocated_memory = torch.cuda.memory_allocated(0)

# Zwischengespeicherter Speicher
cached_memory = torch.cuda.memory_reserved(0)

print(f"Gesamtspeicher: {total_memory / 1e9:.2f} GB")
print(f"Belegter Speicher: {allocated_memory / 1e9:.2f} GB")
print(f"Zwischengespeicherter Speicher: {cached_memory / 1e9:.2f} GB")

CUDA available: True
CUDA device name: NVIDIA GeForce RTX 3060 Ti
CUDA version: 12.4
cuDNN version: 90100
Gesamtspeicher: 8.59 GB
Belegter Speicher: 0.00 GB
Zwischengespeicherter Speicher: 0.00 GB


In [7]:
def formatted_anns(image_id, category, area, bbox):
    annotations = []
    for i in range(0, len(category)):

        new_ann = {
            "id": image_id,
            "category_id": category[i],  # Hier wird das richtige category ID verwendet
            "isCrowd": 0,
            "area": area[i],
            "bbox": list(bbox[i]),
        }

        annotations.append(new_ann)
    return annotations

# Create annotations such that they match the expected form by the algorithm
def transform_ann(examples, image_processor = YolosImageProcessor()):
    image_ids = examples["image_id"]
    images, bboxes, area, categories = [], [], [], []
    for image, objects in zip(examples["image"], examples["objects"]): 
        image = np.array(image.convert("RGB"))[:, :, ::-1]
        area.append(objects["area"])
        images.append(image)
        bboxes.append(objects["bbox"])
        categories.append(objects["category"])

    targets = [
    {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
    for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes)
    ]

    return image_processor(images=images, annotations=targets, return_tensors="pt") # Is applied on the whole batch

def collate_fn(batch, image_processor = YolosImageProcessor()):
    pixel_values = [item["pixel_values"] for item in batch]
    encoding = image_processor.pad(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    #batch["pixel_mask"] = encoding["pixel_mask"] # For object detection we do not need this - only needed for segmentation.
    batch["labels"] = labels
    return batch


def model_training(categories, model_name, train_data, validation_data, num_epochs=3, image_processor=YolosImageProcessor(), output_name="Kugellager"):
    # Mapping zwischen IDs und Labels
    id2label = {category['id']: category['name'] for category in categories}
    label2id = {category['name']: category['id'] for category in categories} 

    # Trainingsargumente definieren
    training_args = TrainingArguments(

        output_dir=f"trained_model/{output_name}/{model_name}", 
        remove_unused_columns=False, 
        load_best_model_at_end=False, 
        save_strategy="no", 
        eval_strategy="epoch", 
        per_device_train_batch_size=10, 
        push_to_hub=False,
        logging_steps=10,
        num_train_epochs=num_epochs
    )

    # Modell initialisieren
    model = AutoModelForObjectDetection.from_pretrained(
        model_name,
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,  
    )

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")  
    model.to(device)
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=collate_fn, 
        train_dataset=train_data,
        eval_dataset=validation_data,
        tokenizer=image_processor,
    )

    # Training durchführen
    trainer.train()

    # Evaluation durchführen
    evaluation_results = trainer.evaluate()

    # Plotten der Losses aus dem TrainerState
    train_logs = trainer.state.log_history

    # Extrahiere Training und Evaluation Losses
    train_losses = [log['loss'] for log in train_logs if 'loss' in log]
    eval_losses = [log['eval_loss'] for log in train_logs if 'eval_loss' in log]

    plt.figure(figsize=(14, 10))
    
    # Plot Training Loss
    plt.plot(train_losses, label='Training Loss', color='blue')
    
    # Plot Evaluation Loss
    # Stelle sicher, dass die Längen übereinstimmen
    if len(eval_losses) > 0:
        # Falls unterschiedliche Längen, interpoliere oder schneide
        if len(eval_losses) != len(train_losses):
            # Interpoliere eval_losses auf die Länge von train_losses
            x_train = np.linspace(0, len(train_losses)-1, len(train_losses))
            x_eval = np.linspace(0, len(train_losses)-1, len(eval_losses))
            eval_losses_interpolated = np.interp(x_train, x_eval, eval_losses)
            
            plt.plot(eval_losses_interpolated, label='Evaluation Loss', color='red')
        else:
            plt.plot(eval_losses, label='Evaluation Loss', color='red')

    plt.title(f'Loss Progression for {model_name}')
    plt.xlabel('Training Steps')
    plt.ylabel('Loss')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"trained_model/{output_name}/{model_name}/loss_plot.png")
    plt.close()

    save_path = f"trained_model/{output_name}/{model_name}/final_model"
    model.save_pretrained(save_path)

    return evaluation_results, train_losses, eval_losses
    

# Transform data such that it can be feed to the model
train_data_kugellager = dataset_kugellager_train.with_transform(transform_ann)
test_data_kugellager = dataset_kugellager_test.with_transform(transform_ann)

train_data_oberfläche = dataset_oberfläche_train.with_transform(transform_ann)
test_data_oberfläche = dataset_oberfläche_test.with_transform(transform_ann)


train_data_kugellager_halb = dataset_kugellager_train_halb.with_transform(transform_ann)
test_data_kugellager_halb = dataset_kugellager_test_halb.with_transform(transform_ann)

train_data_oberfläche_halb = dataset_oberfläche_train_halb.with_transform(transform_ann)
test_data_oberfläche_halb = dataset_oberfläche_test_halb.with_transform(transform_ann)

best_models_kugellager_dict = {}
best_models_oberfläche_dict = {}

best_models_kugellager_dict_halb = {}
best_models_oberfläche_dict_halb = {}

# "ultralytics/yolov11-m", 
# "jparedesDS/welding-defects-detection",
# "facebook/detr-resnet-50", 
# "hustvl/yolos-small",
# "hustvl/yolos-tiny"
# "jparedesDS/welding-defects-detection",
# "hustvl/yolos-small",

model_training_list = ["hustvl/yolos-tiny", "facebook/detr-resnet-50", ]

for model_name in model_training_list:
    evaluation_results, train_losses, eval_losses = model_training(
        categories=categories_kugellager, 
        model_name=model_name, 
        train_data=train_data_kugellager, 
        validation_data=test_data_kugellager, 
        num_epochs=50,
        output_name="Kugellager"
    )
    best_models_kugellager_dict[model_name] = {
        'evaluation_results': evaluation_results,
        'train_losses': train_losses,
        'eval_losses': eval_losses
    }


for model_name in model_training_list:
    evaluation_results, train_losses, eval_losses = model_training(
        categories=categories_oberfläche, 
        model_name=model_name, 
        train_data=train_data_oberfläche, 
        validation_data=test_data_oberfläche, 
        num_epochs=10,
        output_name="Oberfläche"
    )
    best_models_oberfläche_dict[model_name] = {
        'evaluation_results': evaluation_results,
        'train_losses': train_losses,
        'eval_losses': eval_losses
    }


for model_name in model_training_list:
    evaluation_results, train_losses, eval_losses = model_training(
        categories=categories_kugellager, 
        model_name=model_name, 
        train_data=train_data_kugellager_halb, 
        validation_data=test_data_kugellager_halb, 
        num_epochs=50,
        output_name="Kugellager_halb"
    )
    best_models_kugellager_dict_halb[model_name] = {
        'evaluation_results': evaluation_results,
        'train_losses': train_losses,
        'eval_losses': eval_losses
    }


for model_name in model_training_list:
    evaluation_results, train_losses, eval_losses = model_training(
        categories=categories_oberfläche, 
        model_name=model_name, 
        train_data=train_data_oberfläche_halb, 
        validation_data=test_data_oberfläche_halb, 
        num_epochs=20,
        output_name="Oberfläche_halb"
    )
    best_models_oberfläche_dict_halb[model_name] = {
        'evaluation_results': evaluation_results,
        'train_losses': train_losses,
        'eval_losses': eval_losses
    }

Some weights of YolosForObjectDetection were not initialized from the model checkpoint at hustvl/yolos-tiny and are newly initialized because the shapes did not match:
- class_labels_classifier.layers.2.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([2]) in the model instantiated
- class_labels_classifier.layers.2.weight: found shape torch.Size([92, 192]) in the checkpoint and torch.Size([2, 192]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Using device: cuda:0


  0%|          | 0/2400 [00:00<?, ?it/s]The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.
  0%|          | 10/2400 [00:12<45:39,  1.15s/it] 

{'loss': 1.5067, 'grad_norm': 88.63176727294922, 'learning_rate': 4.979166666666667e-05, 'epoch': 0.21}


  1%|          | 20/2400 [00:23<44:30,  1.12s/it]

{'loss': 1.4485, 'grad_norm': 41.573917388916016, 'learning_rate': 4.958333333333334e-05, 'epoch': 0.42}


  1%|▏         | 30/2400 [00:34<44:51,  1.14s/it]

{'loss': 1.4342, 'grad_norm': 41.531707763671875, 'learning_rate': 4.937500000000001e-05, 'epoch': 0.62}


  2%|▏         | 40/2400 [00:46<44:43,  1.14s/it]

{'loss': 1.4607, 'grad_norm': 247.10638427734375, 'learning_rate': 4.9166666666666665e-05, 'epoch': 0.83}


                                                 
  2%|▏         | 48/2400 [01:01<40:38,  1.04s/it]

{'eval_loss': 1.2795870304107666, 'eval_runtime': 6.6781, 'eval_samples_per_second': 17.969, 'eval_steps_per_second': 2.246, 'epoch': 1.0}


  2%|▏         | 50/2400 [01:04<1:40:51,  2.58s/it]

{'loss': 1.284, 'grad_norm': 109.70744323730469, 'learning_rate': 4.8958333333333335e-05, 'epoch': 1.04}


  2%|▎         | 60/2400 [01:15<45:57,  1.18s/it]  

{'loss': 1.3929, 'grad_norm': 145.8207550048828, 'learning_rate': 4.875e-05, 'epoch': 1.25}


  3%|▎         | 70/2400 [01:26<43:22,  1.12s/it]

{'loss': 1.4545, 'grad_norm': 155.82623291015625, 'learning_rate': 4.854166666666667e-05, 'epoch': 1.46}


  3%|▎         | 80/2400 [01:38<43:16,  1.12s/it]

{'loss': 1.3905, 'grad_norm': 447.83306884765625, 'learning_rate': 4.8333333333333334e-05, 'epoch': 1.67}


  4%|▍         | 90/2400 [01:49<43:23,  1.13s/it]

{'loss': 1.1351, 'grad_norm': 43.50605010986328, 'learning_rate': 4.8125000000000004e-05, 'epoch': 1.88}


                                                 
  4%|▍         | 96/2400 [02:02<39:20,  1.02s/it]

{'eval_loss': 1.0816879272460938, 'eval_runtime': 6.5432, 'eval_samples_per_second': 18.34, 'eval_steps_per_second': 2.292, 'epoch': 2.0}


  4%|▍         | 100/2400 [02:07<1:08:55,  1.80s/it]

{'loss': 1.1383, 'grad_norm': 32.2797966003418, 'learning_rate': 4.791666666666667e-05, 'epoch': 2.08}


  5%|▍         | 110/2400 [02:18<43:52,  1.15s/it]  

{'loss': 1.0903, 'grad_norm': 115.84516143798828, 'learning_rate': 4.770833333333334e-05, 'epoch': 2.29}


  5%|▌         | 120/2400 [02:29<42:04,  1.11s/it]

{'loss': 1.0716, 'grad_norm': 54.45139694213867, 'learning_rate': 4.75e-05, 'epoch': 2.5}


  5%|▌         | 130/2400 [02:40<41:58,  1.11s/it]

{'loss': 1.1924, 'grad_norm': 44.164981842041016, 'learning_rate': 4.7291666666666666e-05, 'epoch': 2.71}


  6%|▌         | 140/2400 [02:51<41:43,  1.11s/it]

{'loss': 1.0981, 'grad_norm': 50.26229476928711, 'learning_rate': 4.708333333333334e-05, 'epoch': 2.92}


                                                  
  6%|▌         | 144/2400 [03:02<38:28,  1.02s/it]

{'eval_loss': 1.1244843006134033, 'eval_runtime': 6.3572, 'eval_samples_per_second': 18.876, 'eval_steps_per_second': 2.36, 'epoch': 3.0}


  6%|▋         | 150/2400 [03:09<53:51,  1.44s/it]  

{'loss': 1.1138, 'grad_norm': 142.0801239013672, 'learning_rate': 4.6875e-05, 'epoch': 3.12}


  7%|▋         | 160/2400 [03:20<41:59,  1.12s/it]

{'loss': 1.2055, 'grad_norm': 65.8754653930664, 'learning_rate': 4.666666666666667e-05, 'epoch': 3.33}


  7%|▋         | 170/2400 [03:31<41:38,  1.12s/it]

{'loss': 1.0553, 'grad_norm': 34.00343322753906, 'learning_rate': 4.6458333333333335e-05, 'epoch': 3.54}


  8%|▊         | 180/2400 [03:42<40:56,  1.11s/it]

{'loss': 1.172, 'grad_norm': 70.9768295288086, 'learning_rate': 4.6250000000000006e-05, 'epoch': 3.75}


  8%|▊         | 190/2400 [03:53<40:59,  1.11s/it]

{'loss': 1.1041, 'grad_norm': 28.409236907958984, 'learning_rate': 4.604166666666666e-05, 'epoch': 3.96}


                                                  
  8%|▊         | 192/2400 [04:01<37:29,  1.02s/it]

{'eval_loss': 1.0918203592300415, 'eval_runtime': 6.4128, 'eval_samples_per_second': 18.712, 'eval_steps_per_second': 2.339, 'epoch': 4.0}


  8%|▊         | 200/2400 [04:11<47:55,  1.31s/it]  

{'loss': 1.2339, 'grad_norm': 34.75534439086914, 'learning_rate': 4.5833333333333334e-05, 'epoch': 4.17}


  9%|▉         | 210/2400 [04:22<42:23,  1.16s/it]

{'loss': 1.0628, 'grad_norm': 64.50273895263672, 'learning_rate': 4.5625e-05, 'epoch': 4.38}


  9%|▉         | 220/2400 [04:34<41:59,  1.16s/it]

{'loss': 1.0413, 'grad_norm': 29.636917114257812, 'learning_rate': 4.541666666666667e-05, 'epoch': 4.58}


 10%|▉         | 230/2400 [04:45<41:22,  1.14s/it]

{'loss': 1.0864, 'grad_norm': 51.34901809692383, 'learning_rate': 4.520833333333334e-05, 'epoch': 4.79}


 10%|█         | 240/2400 [04:57<38:07,  1.06s/it]

{'loss': 1.2383, 'grad_norm': 38.838104248046875, 'learning_rate': 4.5e-05, 'epoch': 5.0}


                                                  
 10%|█         | 240/2400 [05:03<38:07,  1.06s/it]

{'eval_loss': 1.1264654397964478, 'eval_runtime': 6.7379, 'eval_samples_per_second': 17.81, 'eval_steps_per_second': 2.226, 'epoch': 5.0}


 10%|█         | 250/2400 [05:15<43:52,  1.22s/it]  

{'loss': 1.2034, 'grad_norm': 31.35780143737793, 'learning_rate': 4.4791666666666673e-05, 'epoch': 5.21}


 11%|█         | 260/2400 [05:27<40:53,  1.15s/it]

{'loss': 1.029, 'grad_norm': 30.71452522277832, 'learning_rate': 4.458333333333334e-05, 'epoch': 5.42}


 11%|█▏        | 270/2400 [05:38<40:22,  1.14s/it]

{'loss': 1.1048, 'grad_norm': 42.111915588378906, 'learning_rate': 4.4375e-05, 'epoch': 5.62}


 12%|█▏        | 280/2400 [05:49<40:02,  1.13s/it]

{'loss': 1.0178, 'grad_norm': 56.57844161987305, 'learning_rate': 4.4166666666666665e-05, 'epoch': 5.83}


                                                  
 12%|█▏        | 288/2400 [06:05<37:49,  1.07s/it]

{'eval_loss': 1.0106264352798462, 'eval_runtime': 6.7244, 'eval_samples_per_second': 17.846, 'eval_steps_per_second': 2.231, 'epoch': 6.0}


 12%|█▏        | 290/2400 [06:07<1:31:34,  2.60s/it]

{'loss': 1.1243, 'grad_norm': 33.86328125, 'learning_rate': 4.3958333333333336e-05, 'epoch': 6.04}


 12%|█▎        | 300/2400 [06:19<40:54,  1.17s/it]  

{'loss': 0.9305, 'grad_norm': 42.06914520263672, 'learning_rate': 4.375e-05, 'epoch': 6.25}


 13%|█▎        | 310/2400 [06:30<39:19,  1.13s/it]

{'loss': 0.9777, 'grad_norm': 38.72967529296875, 'learning_rate': 4.354166666666667e-05, 'epoch': 6.46}


 13%|█▎        | 320/2400 [06:41<39:11,  1.13s/it]

{'loss': 1.0505, 'grad_norm': 28.14974021911621, 'learning_rate': 4.3333333333333334e-05, 'epoch': 6.67}


 14%|█▍        | 330/2400 [06:53<38:56,  1.13s/it]

{'loss': 1.0514, 'grad_norm': 47.33351135253906, 'learning_rate': 4.3125000000000005e-05, 'epoch': 6.88}


                                                  
 14%|█▍        | 336/2400 [07:05<35:31,  1.03s/it]

{'eval_loss': 1.0189310312271118, 'eval_runtime': 6.5035, 'eval_samples_per_second': 18.451, 'eval_steps_per_second': 2.306, 'epoch': 7.0}


 14%|█▍        | 340/2400 [07:10<1:01:55,  1.80s/it]

{'loss': 0.9281, 'grad_norm': 33.33854675292969, 'learning_rate': 4.291666666666667e-05, 'epoch': 7.08}


 15%|█▍        | 350/2400 [07:22<39:35,  1.16s/it]  

{'loss': 1.1046, 'grad_norm': 39.79082107543945, 'learning_rate': 4.270833333333333e-05, 'epoch': 7.29}


 15%|█▌        | 360/2400 [07:33<38:08,  1.12s/it]

{'loss': 0.8858, 'grad_norm': 17.838367462158203, 'learning_rate': 4.25e-05, 'epoch': 7.5}


 15%|█▌        | 370/2400 [07:44<37:41,  1.11s/it]

{'loss': 0.9577, 'grad_norm': 30.161083221435547, 'learning_rate': 4.229166666666667e-05, 'epoch': 7.71}


 16%|█▌        | 380/2400 [07:55<37:47,  1.12s/it]

{'loss': 0.9338, 'grad_norm': 34.92267990112305, 'learning_rate': 4.208333333333334e-05, 'epoch': 7.92}


                                                  
 16%|█▌        | 384/2400 [08:06<34:58,  1.04s/it]

{'eval_loss': 0.9801893830299377, 'eval_runtime': 6.5644, 'eval_samples_per_second': 18.28, 'eval_steps_per_second': 2.285, 'epoch': 8.0}


 16%|█▋        | 390/2400 [08:13<49:21,  1.47s/it]  

{'loss': 1.0546, 'grad_norm': 39.00990676879883, 'learning_rate': 4.1875e-05, 'epoch': 8.12}


 17%|█▋        | 400/2400 [08:25<38:58,  1.17s/it]

{'loss': 0.9085, 'grad_norm': 36.90516662597656, 'learning_rate': 4.166666666666667e-05, 'epoch': 8.33}


 17%|█▋        | 410/2400 [08:37<38:29,  1.16s/it]

{'loss': 1.0644, 'grad_norm': 19.705528259277344, 'learning_rate': 4.1458333333333336e-05, 'epoch': 8.54}


 18%|█▊        | 420/2400 [08:48<37:58,  1.15s/it]

{'loss': 0.8717, 'grad_norm': 26.13507080078125, 'learning_rate': 4.125e-05, 'epoch': 8.75}


 18%|█▊        | 430/2400 [09:00<38:04,  1.16s/it]

{'loss': 0.9581, 'grad_norm': 24.471071243286133, 'learning_rate': 4.104166666666667e-05, 'epoch': 8.96}


                                                  
 18%|█▊        | 432/2400 [09:09<34:35,  1.05s/it]

{'eval_loss': 0.9456943869590759, 'eval_runtime': 7.0261, 'eval_samples_per_second': 17.079, 'eval_steps_per_second': 2.135, 'epoch': 9.0}


 18%|█▊        | 440/2400 [09:18<43:36,  1.34s/it]  

{'loss': 1.0004, 'grad_norm': 20.151588439941406, 'learning_rate': 4.0833333333333334e-05, 'epoch': 9.17}


 19%|█▉        | 450/2400 [09:30<37:59,  1.17s/it]

{'loss': 0.805, 'grad_norm': 22.357877731323242, 'learning_rate': 4.0625000000000005e-05, 'epoch': 9.38}


 19%|█▉        | 460/2400 [09:42<37:30,  1.16s/it]

{'loss': 0.8278, 'grad_norm': 31.724388122558594, 'learning_rate': 4.041666666666667e-05, 'epoch': 9.58}


 20%|█▉        | 470/2400 [09:53<37:25,  1.16s/it]

{'loss': 0.9309, 'grad_norm': 26.805089950561523, 'learning_rate': 4.020833333333334e-05, 'epoch': 9.79}


 20%|██        | 480/2400 [10:04<33:38,  1.05s/it]

{'loss': 0.9106, 'grad_norm': 21.70477294921875, 'learning_rate': 4e-05, 'epoch': 10.0}


                                                  
 20%|██        | 480/2400 [10:11<33:38,  1.05s/it]

{'eval_loss': 0.9475752711296082, 'eval_runtime': 7.0273, 'eval_samples_per_second': 17.076, 'eval_steps_per_second': 2.135, 'epoch': 10.0}


 20%|██        | 490/2400 [10:23<39:52,  1.25s/it]  

{'loss': 0.9917, 'grad_norm': 20.511777877807617, 'learning_rate': 3.979166666666667e-05, 'epoch': 10.21}


 21%|██        | 500/2400 [10:35<36:42,  1.16s/it]

{'loss': 0.7953, 'grad_norm': 33.465087890625, 'learning_rate': 3.958333333333333e-05, 'epoch': 10.42}


 21%|██▏       | 510/2400 [10:47<36:54,  1.17s/it]

{'loss': 0.7823, 'grad_norm': 20.652135848999023, 'learning_rate': 3.9375e-05, 'epoch': 10.62}


 22%|██▏       | 520/2400 [10:58<36:27,  1.16s/it]

{'loss': 0.8673, 'grad_norm': 37.07013702392578, 'learning_rate': 3.9166666666666665e-05, 'epoch': 10.83}


                                                  
 22%|██▏       | 528/2400 [11:15<33:12,  1.06s/it]

{'eval_loss': 0.9762992262840271, 'eval_runtime': 7.168, 'eval_samples_per_second': 16.741, 'eval_steps_per_second': 2.093, 'epoch': 11.0}


 22%|██▏       | 530/2400 [11:17<1:24:30,  2.71s/it]

{'loss': 0.8355, 'grad_norm': 38.52298355102539, 'learning_rate': 3.8958333333333336e-05, 'epoch': 11.04}


 22%|██▎       | 540/2400 [11:29<37:53,  1.22s/it]  

{'loss': 0.7935, 'grad_norm': 22.860553741455078, 'learning_rate': 3.875e-05, 'epoch': 11.25}


 23%|██▎       | 550/2400 [11:41<36:05,  1.17s/it]

{'loss': 0.9048, 'grad_norm': 49.56422805786133, 'learning_rate': 3.854166666666667e-05, 'epoch': 11.46}


 23%|██▎       | 560/2400 [11:52<35:25,  1.16s/it]

{'loss': 0.9294, 'grad_norm': 37.14400100708008, 'learning_rate': 3.8333333333333334e-05, 'epoch': 11.67}


 24%|██▍       | 570/2400 [12:04<35:14,  1.16s/it]

{'loss': 0.8812, 'grad_norm': 58.756134033203125, 'learning_rate': 3.8125e-05, 'epoch': 11.88}


                                                  
 24%|██▍       | 576/2400 [12:18<32:06,  1.06s/it]

{'eval_loss': 0.9050614833831787, 'eval_runtime': 7.1928, 'eval_samples_per_second': 16.683, 'eval_steps_per_second': 2.085, 'epoch': 12.0}


 24%|██▍       | 580/2400 [12:23<58:01,  1.91s/it]  

{'loss': 0.7537, 'grad_norm': 31.558780670166016, 'learning_rate': 3.791666666666667e-05, 'epoch': 12.08}


 25%|██▍       | 590/2400 [12:34<35:27,  1.18s/it]

{'loss': 0.7664, 'grad_norm': 21.949560165405273, 'learning_rate': 3.770833333333333e-05, 'epoch': 12.29}


 25%|██▌       | 600/2400 [12:46<35:55,  1.20s/it]

{'loss': 0.7679, 'grad_norm': 38.59206008911133, 'learning_rate': 3.7500000000000003e-05, 'epoch': 12.5}


 25%|██▌       | 610/2400 [12:58<35:50,  1.20s/it]

{'loss': 0.8445, 'grad_norm': 32.191707611083984, 'learning_rate': 3.729166666666667e-05, 'epoch': 12.71}


 26%|██▌       | 620/2400 [13:10<33:55,  1.14s/it]

{'loss': 0.859, 'grad_norm': 30.999120712280273, 'learning_rate': 3.708333333333334e-05, 'epoch': 12.92}


                                                  
 26%|██▌       | 624/2400 [13:22<30:49,  1.04s/it]

{'eval_loss': 0.8829768300056458, 'eval_runtime': 7.2422, 'eval_samples_per_second': 16.569, 'eval_steps_per_second': 2.071, 'epoch': 13.0}


 26%|██▋       | 630/2400 [13:29<45:23,  1.54s/it]  

{'loss': 0.7487, 'grad_norm': 28.98098373413086, 'learning_rate': 3.6875e-05, 'epoch': 13.12}


 27%|██▋       | 640/2400 [13:40<34:08,  1.16s/it]

{'loss': 0.7627, 'grad_norm': 22.897563934326172, 'learning_rate': 3.6666666666666666e-05, 'epoch': 13.33}


 27%|██▋       | 650/2400 [13:52<33:12,  1.14s/it]

{'loss': 0.7736, 'grad_norm': 41.831642150878906, 'learning_rate': 3.6458333333333336e-05, 'epoch': 13.54}


 28%|██▊       | 660/2400 [14:03<33:03,  1.14s/it]

{'loss': 0.8344, 'grad_norm': 30.944446563720703, 'learning_rate': 3.625e-05, 'epoch': 13.75}


 28%|██▊       | 670/2400 [14:15<33:14,  1.15s/it]

{'loss': 0.75, 'grad_norm': 27.086530685424805, 'learning_rate': 3.604166666666667e-05, 'epoch': 13.96}


                                                  
 28%|██▊       | 672/2400 [14:24<30:12,  1.05s/it]

{'eval_loss': 0.8407334685325623, 'eval_runtime': 7.0908, 'eval_samples_per_second': 16.923, 'eval_steps_per_second': 2.115, 'epoch': 14.0}


 28%|██▊       | 680/2400 [14:33<37:53,  1.32s/it]  

{'loss': 0.7366, 'grad_norm': 36.10009002685547, 'learning_rate': 3.5833333333333335e-05, 'epoch': 14.17}


 29%|██▉       | 690/2400 [14:45<33:07,  1.16s/it]

{'loss': 0.7507, 'grad_norm': 22.451326370239258, 'learning_rate': 3.5625000000000005e-05, 'epoch': 14.38}


 29%|██▉       | 700/2400 [14:56<32:30,  1.15s/it]

{'loss': 0.7377, 'grad_norm': 30.422399520874023, 'learning_rate': 3.541666666666667e-05, 'epoch': 14.58}


 30%|██▉       | 710/2400 [15:08<32:05,  1.14s/it]

{'loss': 0.7474, 'grad_norm': 32.25687789916992, 'learning_rate': 3.520833333333334e-05, 'epoch': 14.79}


 30%|███       | 720/2400 [15:19<29:02,  1.04s/it]

{'loss': 0.8361, 'grad_norm': 35.552589416503906, 'learning_rate': 3.5e-05, 'epoch': 15.0}


                                                  
 30%|███       | 720/2400 [15:25<29:02,  1.04s/it]

{'eval_loss': 0.9121769070625305, 'eval_runtime': 6.6213, 'eval_samples_per_second': 18.123, 'eval_steps_per_second': 2.265, 'epoch': 15.0}


 30%|███       | 730/2400 [15:37<33:41,  1.21s/it]  

{'loss': 0.7541, 'grad_norm': 45.71516799926758, 'learning_rate': 3.479166666666667e-05, 'epoch': 15.21}


 31%|███       | 740/2400 [15:49<31:50,  1.15s/it]

{'loss': 0.7824, 'grad_norm': 66.95647430419922, 'learning_rate': 3.458333333333333e-05, 'epoch': 15.42}


 31%|███▏      | 750/2400 [16:00<31:05,  1.13s/it]

{'loss': 0.7275, 'grad_norm': 21.606178283691406, 'learning_rate': 3.4375e-05, 'epoch': 15.62}


 32%|███▏      | 760/2400 [16:11<30:34,  1.12s/it]

{'loss': 0.7287, 'grad_norm': 33.990966796875, 'learning_rate': 3.4166666666666666e-05, 'epoch': 15.83}


                                                  
 32%|███▏      | 768/2400 [16:28<31:02,  1.14s/it]

{'eval_loss': 0.9200717806816101, 'eval_runtime': 7.2428, 'eval_samples_per_second': 16.568, 'eval_steps_per_second': 2.071, 'epoch': 16.0}


 32%|███▏      | 770/2400 [16:30<1:15:15,  2.77s/it]

{'loss': 0.7313, 'grad_norm': 22.379175186157227, 'learning_rate': 3.3958333333333337e-05, 'epoch': 16.04}


 32%|███▎      | 780/2400 [16:42<32:57,  1.22s/it]  

{'loss': 0.7245, 'grad_norm': 16.898488998413086, 'learning_rate': 3.375000000000001e-05, 'epoch': 16.25}


 33%|███▎      | 790/2400 [16:54<30:24,  1.13s/it]

{'loss': 0.6979, 'grad_norm': 105.71133422851562, 'learning_rate': 3.3541666666666664e-05, 'epoch': 16.46}


 33%|███▎      | 800/2400 [17:05<30:22,  1.14s/it]

{'loss': 0.7181, 'grad_norm': 34.566829681396484, 'learning_rate': 3.3333333333333335e-05, 'epoch': 16.67}


 34%|███▍      | 810/2400 [17:17<30:41,  1.16s/it]

{'loss': 0.7293, 'grad_norm': 35.211299896240234, 'learning_rate': 3.3125e-05, 'epoch': 16.88}


                                                  
 34%|███▍      | 816/2400 [17:30<27:14,  1.03s/it]

{'eval_loss': 0.9152175784111023, 'eval_runtime': 7.0877, 'eval_samples_per_second': 16.931, 'eval_steps_per_second': 2.116, 'epoch': 17.0}


 34%|███▍      | 820/2400 [17:35<49:30,  1.88s/it]  

{'loss': 0.7661, 'grad_norm': 36.150917053222656, 'learning_rate': 3.291666666666667e-05, 'epoch': 17.08}


 35%|███▍      | 830/2400 [17:46<29:47,  1.14s/it]

{'loss': 0.7143, 'grad_norm': 20.36416244506836, 'learning_rate': 3.270833333333333e-05, 'epoch': 17.29}


 35%|███▌      | 840/2400 [17:58<29:40,  1.14s/it]

{'loss': 0.7119, 'grad_norm': 27.072040557861328, 'learning_rate': 3.2500000000000004e-05, 'epoch': 17.5}


 35%|███▌      | 850/2400 [18:09<28:55,  1.12s/it]

{'loss': 0.6411, 'grad_norm': 31.273075103759766, 'learning_rate': 3.229166666666667e-05, 'epoch': 17.71}


 36%|███▌      | 860/2400 [18:21<29:22,  1.14s/it]

{'loss': 0.7051, 'grad_norm': 27.65541648864746, 'learning_rate': 3.208333333333334e-05, 'epoch': 17.92}


                                                  
 36%|███▌      | 864/2400 [18:32<26:26,  1.03s/it]

{'eval_loss': 0.9047257900238037, 'eval_runtime': 7.11, 'eval_samples_per_second': 16.878, 'eval_steps_per_second': 2.11, 'epoch': 18.0}


 36%|███▋      | 870/2400 [18:39<38:04,  1.49s/it]  

{'loss': 0.6923, 'grad_norm': 30.278486251831055, 'learning_rate': 3.1875e-05, 'epoch': 18.12}


 37%|███▋      | 880/2400 [18:50<28:50,  1.14s/it]

{'loss': 0.5733, 'grad_norm': 30.15094566345215, 'learning_rate': 3.1666666666666666e-05, 'epoch': 18.33}


 37%|███▋      | 890/2400 [19:02<28:34,  1.14s/it]

{'loss': 0.6888, 'grad_norm': 30.810665130615234, 'learning_rate': 3.145833333333334e-05, 'epoch': 18.54}


 38%|███▊      | 900/2400 [19:13<28:07,  1.12s/it]

{'loss': 0.7091, 'grad_norm': 18.489578247070312, 'learning_rate': 3.125e-05, 'epoch': 18.75}


 38%|███▊      | 910/2400 [19:25<28:16,  1.14s/it]

{'loss': 0.6471, 'grad_norm': 38.628578186035156, 'learning_rate': 3.104166666666667e-05, 'epoch': 18.96}


                                                  
 38%|███▊      | 912/2400 [19:34<25:36,  1.03s/it]

{'eval_loss': 0.8946374654769897, 'eval_runtime': 7.1808, 'eval_samples_per_second': 16.711, 'eval_steps_per_second': 2.089, 'epoch': 19.0}


 38%|███▊      | 920/2400 [19:43<32:47,  1.33s/it]  

{'loss': 0.6356, 'grad_norm': 23.415224075317383, 'learning_rate': 3.0833333333333335e-05, 'epoch': 19.17}


 39%|███▉      | 930/2400 [19:55<28:19,  1.16s/it]

{'loss': 0.6484, 'grad_norm': 32.516334533691406, 'learning_rate': 3.0625000000000006e-05, 'epoch': 19.38}


 39%|███▉      | 940/2400 [20:06<27:54,  1.15s/it]

{'loss': 0.6838, 'grad_norm': 25.866037368774414, 'learning_rate': 3.0416666666666666e-05, 'epoch': 19.58}


 40%|███▉      | 950/2400 [20:18<27:44,  1.15s/it]

{'loss': 0.6705, 'grad_norm': 24.277267456054688, 'learning_rate': 3.0208333333333334e-05, 'epoch': 19.79}


 40%|████      | 960/2400 [20:29<25:16,  1.05s/it]

{'loss': 0.6314, 'grad_norm': 42.624202728271484, 'learning_rate': 3e-05, 'epoch': 20.0}


                                                  
 40%|████      | 960/2400 [20:36<25:16,  1.05s/it]

{'eval_loss': 0.892613410949707, 'eval_runtime': 7.2156, 'eval_samples_per_second': 16.631, 'eval_steps_per_second': 2.079, 'epoch': 20.0}


 40%|████      | 970/2400 [20:48<28:58,  1.22s/it]  

{'loss': 0.5689, 'grad_norm': 18.18920135498047, 'learning_rate': 2.9791666666666668e-05, 'epoch': 20.21}


 41%|████      | 980/2400 [20:59<26:38,  1.13s/it]

{'loss': 0.5407, 'grad_norm': 21.317564010620117, 'learning_rate': 2.9583333333333335e-05, 'epoch': 20.42}


 41%|████▏     | 990/2400 [21:11<26:26,  1.12s/it]

{'loss': 0.7276, 'grad_norm': 27.131948471069336, 'learning_rate': 2.9375000000000003e-05, 'epoch': 20.62}


 42%|████▏     | 1000/2400 [21:22<26:41,  1.14s/it]

{'loss': 0.6524, 'grad_norm': 24.700809478759766, 'learning_rate': 2.916666666666667e-05, 'epoch': 20.83}


                                                   
 42%|████▏     | 1008/2400 [21:38<23:55,  1.03s/it]

{'eval_loss': 0.9016342759132385, 'eval_runtime': 6.9986, 'eval_samples_per_second': 17.146, 'eval_steps_per_second': 2.143, 'epoch': 21.0}


 42%|████▏     | 1010/2400 [21:40<1:01:04,  2.64s/it]

{'loss': 0.617, 'grad_norm': 23.986202239990234, 'learning_rate': 2.8958333333333337e-05, 'epoch': 21.04}


 42%|████▎     | 1020/2400 [21:52<27:49,  1.21s/it]  

{'loss': 0.6662, 'grad_norm': 21.592077255249023, 'learning_rate': 2.8749999999999997e-05, 'epoch': 21.25}


 43%|████▎     | 1030/2400 [22:03<25:51,  1.13s/it]

{'loss': 0.5788, 'grad_norm': 29.007612228393555, 'learning_rate': 2.8541666666666668e-05, 'epoch': 21.46}


 43%|████▎     | 1040/2400 [22:15<26:01,  1.15s/it]

{'loss': 0.589, 'grad_norm': 32.57634353637695, 'learning_rate': 2.8333333333333335e-05, 'epoch': 21.67}


 44%|████▍     | 1050/2400 [22:26<25:32,  1.14s/it]

{'loss': 0.5951, 'grad_norm': 21.834774017333984, 'learning_rate': 2.8125000000000003e-05, 'epoch': 21.88}


                                                   
 44%|████▍     | 1056/2400 [22:40<23:09,  1.03s/it]

{'eval_loss': 0.9630633592605591, 'eval_runtime': 7.034, 'eval_samples_per_second': 17.06, 'eval_steps_per_second': 2.133, 'epoch': 22.0}


 44%|████▍     | 1060/2400 [22:44<41:57,  1.88s/it]  

{'loss': 0.6305, 'grad_norm': 26.188024520874023, 'learning_rate': 2.791666666666667e-05, 'epoch': 22.08}


 45%|████▍     | 1070/2400 [22:56<26:05,  1.18s/it]

{'loss': 0.6068, 'grad_norm': 21.766454696655273, 'learning_rate': 2.7708333333333337e-05, 'epoch': 22.29}


 45%|████▌     | 1080/2400 [23:07<24:44,  1.12s/it]

{'loss': 0.5583, 'grad_norm': 33.449031829833984, 'learning_rate': 2.7500000000000004e-05, 'epoch': 22.5}


 45%|████▌     | 1090/2400 [23:18<24:34,  1.13s/it]

{'loss': 0.6441, 'grad_norm': 36.39936065673828, 'learning_rate': 2.7291666666666665e-05, 'epoch': 22.71}


 46%|████▌     | 1100/2400 [23:30<24:30,  1.13s/it]

{'loss': 0.5859, 'grad_norm': 32.975929260253906, 'learning_rate': 2.7083333333333332e-05, 'epoch': 22.92}


                                                   
 46%|████▌     | 1104/2400 [23:41<22:04,  1.02s/it]

{'eval_loss': 0.8752363920211792, 'eval_runtime': 6.7294, 'eval_samples_per_second': 17.832, 'eval_steps_per_second': 2.229, 'epoch': 23.0}


 46%|████▋     | 1110/2400 [23:48<31:39,  1.47s/it]  

{'loss': 0.6542, 'grad_norm': 40.48882293701172, 'learning_rate': 2.6875e-05, 'epoch': 23.12}


 47%|████▋     | 1120/2400 [23:59<24:24,  1.14s/it]

{'loss': 0.5761, 'grad_norm': 28.19754409790039, 'learning_rate': 2.6666666666666667e-05, 'epoch': 23.33}


 47%|████▋     | 1130/2400 [24:10<23:49,  1.13s/it]

{'loss': 0.5369, 'grad_norm': 36.807640075683594, 'learning_rate': 2.6458333333333334e-05, 'epoch': 23.54}


 48%|████▊     | 1140/2400 [24:22<23:47,  1.13s/it]

{'loss': 0.5883, 'grad_norm': 25.554582595825195, 'learning_rate': 2.625e-05, 'epoch': 23.75}


 48%|████▊     | 1150/2400 [24:33<23:35,  1.13s/it]

{'loss': 0.6144, 'grad_norm': 38.58452606201172, 'learning_rate': 2.604166666666667e-05, 'epoch': 23.96}


                                                   
 48%|████▊     | 1152/2400 [24:42<21:25,  1.03s/it]

{'eval_loss': 0.9015560150146484, 'eval_runtime': 6.9449, 'eval_samples_per_second': 17.279, 'eval_steps_per_second': 2.16, 'epoch': 24.0}


 48%|████▊     | 1160/2400 [24:51<27:21,  1.32s/it]  

{'loss': 0.5297, 'grad_norm': 25.57680320739746, 'learning_rate': 2.5833333333333336e-05, 'epoch': 24.17}


 49%|████▉     | 1170/2400 [25:03<22:59,  1.12s/it]

{'loss': 0.538, 'grad_norm': 22.008989334106445, 'learning_rate': 2.5625e-05, 'epoch': 24.38}


 49%|████▉     | 1180/2400 [25:14<22:52,  1.13s/it]

{'loss': 0.5511, 'grad_norm': 41.056427001953125, 'learning_rate': 2.5416666666666667e-05, 'epoch': 24.58}


 50%|████▉     | 1190/2400 [25:25<22:33,  1.12s/it]

{'loss': 0.5133, 'grad_norm': 30.482133865356445, 'learning_rate': 2.5208333333333334e-05, 'epoch': 24.79}


 50%|█████     | 1200/2400 [25:36<20:19,  1.02s/it]

{'loss': 0.5512, 'grad_norm': 35.863311767578125, 'learning_rate': 2.5e-05, 'epoch': 25.0}


                                                   
 50%|█████     | 1200/2400 [25:43<20:19,  1.02s/it]

{'eval_loss': 0.882563591003418, 'eval_runtime': 6.7039, 'eval_samples_per_second': 17.9, 'eval_steps_per_second': 2.238, 'epoch': 25.0}


 50%|█████     | 1210/2400 [25:54<24:14,  1.22s/it]  

{'loss': 0.4416, 'grad_norm': 18.176172256469727, 'learning_rate': 2.479166666666667e-05, 'epoch': 25.21}


 51%|█████     | 1220/2400 [26:05<21:57,  1.12s/it]

{'loss': 0.5227, 'grad_norm': 43.54343795776367, 'learning_rate': 2.4583333333333332e-05, 'epoch': 25.42}


 51%|█████▏    | 1230/2400 [26:17<22:14,  1.14s/it]

{'loss': 0.5254, 'grad_norm': 29.939495086669922, 'learning_rate': 2.4375e-05, 'epoch': 25.62}


 52%|█████▏    | 1240/2400 [26:28<22:10,  1.15s/it]

{'loss': 0.4943, 'grad_norm': 36.7228889465332, 'learning_rate': 2.4166666666666667e-05, 'epoch': 25.83}


                                                   
 52%|█████▏    | 1248/2400 [26:44<19:36,  1.02s/it]

{'eval_loss': 0.8977911472320557, 'eval_runtime': 6.9111, 'eval_samples_per_second': 17.363, 'eval_steps_per_second': 2.17, 'epoch': 26.0}


 52%|█████▏    | 1250/2400 [26:47<49:59,  2.61s/it]  

{'loss': 0.5568, 'grad_norm': 35.35559844970703, 'learning_rate': 2.3958333333333334e-05, 'epoch': 26.04}


 52%|█████▎    | 1260/2400 [26:58<22:21,  1.18s/it]

{'loss': 0.5102, 'grad_norm': 26.269790649414062, 'learning_rate': 2.375e-05, 'epoch': 26.25}


 53%|█████▎    | 1270/2400 [27:09<21:02,  1.12s/it]

{'loss': 0.4431, 'grad_norm': 35.87321090698242, 'learning_rate': 2.354166666666667e-05, 'epoch': 26.46}


 53%|█████▎    | 1280/2400 [27:21<21:22,  1.15s/it]

{'loss': 0.4932, 'grad_norm': 28.17375946044922, 'learning_rate': 2.3333333333333336e-05, 'epoch': 26.67}


 54%|█████▍    | 1290/2400 [27:32<20:55,  1.13s/it]

{'loss': 0.5394, 'grad_norm': 31.138105392456055, 'learning_rate': 2.3125000000000003e-05, 'epoch': 26.88}


                                                   
 54%|█████▍    | 1296/2400 [27:45<18:57,  1.03s/it]

{'eval_loss': 0.8818529844284058, 'eval_runtime': 7.008, 'eval_samples_per_second': 17.123, 'eval_steps_per_second': 2.14, 'epoch': 27.0}


 54%|█████▍    | 1300/2400 [27:50<34:06,  1.86s/it]  

{'loss': 0.5069, 'grad_norm': 25.852651596069336, 'learning_rate': 2.2916666666666667e-05, 'epoch': 27.08}


 55%|█████▍    | 1310/2400 [28:02<20:55,  1.15s/it]

{'loss': 0.4443, 'grad_norm': 38.70936584472656, 'learning_rate': 2.2708333333333334e-05, 'epoch': 27.29}


 55%|█████▌    | 1320/2400 [28:13<20:21,  1.13s/it]

{'loss': 0.482, 'grad_norm': 38.31462097167969, 'learning_rate': 2.25e-05, 'epoch': 27.5}


 55%|█████▌    | 1330/2400 [28:24<20:26,  1.15s/it]

{'loss': 0.4215, 'grad_norm': 22.23291015625, 'learning_rate': 2.229166666666667e-05, 'epoch': 27.71}


 56%|█████▌    | 1340/2400 [28:36<21:34,  1.22s/it]

{'loss': 0.5421, 'grad_norm': 17.122028350830078, 'learning_rate': 2.2083333333333333e-05, 'epoch': 27.92}


                                                   
 56%|█████▌    | 1344/2400 [28:49<20:22,  1.16s/it]

{'eval_loss': 0.8933327794075012, 'eval_runtime': 7.5211, 'eval_samples_per_second': 15.955, 'eval_steps_per_second': 1.994, 'epoch': 28.0}


 56%|█████▋    | 1350/2400 [28:57<29:19,  1.68s/it]  

{'loss': 0.4334, 'grad_norm': 35.52229690551758, 'learning_rate': 2.1875e-05, 'epoch': 28.12}


 57%|█████▋    | 1360/2400 [29:09<21:55,  1.26s/it]

{'loss': 0.4566, 'grad_norm': 35.37481689453125, 'learning_rate': 2.1666666666666667e-05, 'epoch': 28.33}


 57%|█████▋    | 1370/2400 [29:22<21:44,  1.27s/it]

{'loss': 0.3565, 'grad_norm': 31.877601623535156, 'learning_rate': 2.1458333333333334e-05, 'epoch': 28.54}


 57%|█████▊    | 1380/2400 [29:34<20:28,  1.20s/it]

{'loss': 0.4895, 'grad_norm': 54.35128402709961, 'learning_rate': 2.125e-05, 'epoch': 28.75}


 58%|█████▊    | 1390/2400 [29:46<19:07,  1.14s/it]

{'loss': 0.4782, 'grad_norm': 23.1397762298584, 'learning_rate': 2.104166666666667e-05, 'epoch': 28.96}


                                                   
 58%|█████▊    | 1392/2400 [29:55<17:20,  1.03s/it]

{'eval_loss': 0.8655139803886414, 'eval_runtime': 7.0419, 'eval_samples_per_second': 17.041, 'eval_steps_per_second': 2.13, 'epoch': 29.0}


 58%|█████▊    | 1400/2400 [30:04<21:40,  1.30s/it]

{'loss': 0.3909, 'grad_norm': 35.2596321105957, 'learning_rate': 2.0833333333333336e-05, 'epoch': 29.17}


 59%|█████▉    | 1410/2400 [30:16<18:40,  1.13s/it]

{'loss': 0.4173, 'grad_norm': 26.134607315063477, 'learning_rate': 2.0625e-05, 'epoch': 29.38}


 59%|█████▉    | 1420/2400 [30:27<18:21,  1.12s/it]

{'loss': 0.3883, 'grad_norm': 42.2251091003418, 'learning_rate': 2.0416666666666667e-05, 'epoch': 29.58}


 60%|█████▉    | 1430/2400 [30:38<17:45,  1.10s/it]

{'loss': 0.3898, 'grad_norm': 37.4979248046875, 'learning_rate': 2.0208333333333334e-05, 'epoch': 29.79}


 60%|██████    | 1440/2400 [30:49<16:10,  1.01s/it]

{'loss': 0.4303, 'grad_norm': 20.101318359375, 'learning_rate': 2e-05, 'epoch': 30.0}


                                                   
 60%|██████    | 1440/2400 [30:55<16:10,  1.01s/it]

{'eval_loss': 0.9181323647499084, 'eval_runtime': 6.5484, 'eval_samples_per_second': 18.325, 'eval_steps_per_second': 2.291, 'epoch': 30.0}


 60%|██████    | 1450/2400 [31:07<18:54,  1.19s/it]

{'loss': 0.4329, 'grad_norm': 24.29295539855957, 'learning_rate': 1.9791666666666665e-05, 'epoch': 30.21}


 61%|██████    | 1460/2400 [31:19<19:03,  1.22s/it]

{'loss': 0.4229, 'grad_norm': 31.666645050048828, 'learning_rate': 1.9583333333333333e-05, 'epoch': 30.42}


 61%|██████▏   | 1470/2400 [31:30<17:35,  1.13s/it]

{'loss': 0.4658, 'grad_norm': 34.96866226196289, 'learning_rate': 1.9375e-05, 'epoch': 30.62}


 62%|██████▏   | 1480/2400 [31:41<16:55,  1.10s/it]

{'loss': 0.3966, 'grad_norm': 46.27988815307617, 'learning_rate': 1.9166666666666667e-05, 'epoch': 30.83}


                                                   
 62%|██████▏   | 1488/2400 [31:56<15:20,  1.01s/it]

{'eval_loss': 0.9002007246017456, 'eval_runtime': 6.503, 'eval_samples_per_second': 18.453, 'eval_steps_per_second': 2.307, 'epoch': 31.0}


 62%|██████▏   | 1490/2400 [31:59<37:46,  2.49s/it]

{'loss': 0.4466, 'grad_norm': 37.32483673095703, 'learning_rate': 1.8958333333333334e-05, 'epoch': 31.04}


 62%|██████▎   | 1500/2400 [32:10<17:15,  1.15s/it]

{'loss': 0.4129, 'grad_norm': 21.847993850708008, 'learning_rate': 1.8750000000000002e-05, 'epoch': 31.25}


 63%|██████▎   | 1510/2400 [32:21<16:41,  1.13s/it]

{'loss': 0.3866, 'grad_norm': 23.705455780029297, 'learning_rate': 1.854166666666667e-05, 'epoch': 31.46}


 63%|██████▎   | 1520/2400 [32:32<16:24,  1.12s/it]

{'loss': 0.3643, 'grad_norm': 20.29074478149414, 'learning_rate': 1.8333333333333333e-05, 'epoch': 31.67}


 64%|██████▍   | 1530/2400 [32:44<16:11,  1.12s/it]

{'loss': 0.4725, 'grad_norm': 26.09242057800293, 'learning_rate': 1.8125e-05, 'epoch': 31.88}


                                                   
 64%|██████▍   | 1536/2400 [32:57<14:33,  1.01s/it]

{'eval_loss': 0.9008398056030273, 'eval_runtime': 6.7805, 'eval_samples_per_second': 17.698, 'eval_steps_per_second': 2.212, 'epoch': 32.0}


 64%|██████▍   | 1540/2400 [33:02<26:03,  1.82s/it]

{'loss': 0.3667, 'grad_norm': 29.14394187927246, 'learning_rate': 1.7916666666666667e-05, 'epoch': 32.08}


 65%|██████▍   | 1550/2400 [33:13<16:06,  1.14s/it]

{'loss': 0.3871, 'grad_norm': 43.06050109863281, 'learning_rate': 1.7708333333333335e-05, 'epoch': 32.29}


 65%|██████▌   | 1560/2400 [33:24<15:48,  1.13s/it]

{'loss': 0.4211, 'grad_norm': 20.420589447021484, 'learning_rate': 1.75e-05, 'epoch': 32.5}


 65%|██████▌   | 1570/2400 [33:35<15:34,  1.13s/it]

{'loss': 0.3719, 'grad_norm': 26.388532638549805, 'learning_rate': 1.7291666666666666e-05, 'epoch': 32.71}


 66%|██████▌   | 1580/2400 [33:47<15:22,  1.13s/it]

{'loss': 0.3457, 'grad_norm': 25.19268035888672, 'learning_rate': 1.7083333333333333e-05, 'epoch': 32.92}


                                                   
 66%|██████▌   | 1584/2400 [33:58<14:01,  1.03s/it]

{'eval_loss': 0.9204034209251404, 'eval_runtime': 6.9754, 'eval_samples_per_second': 17.203, 'eval_steps_per_second': 2.15, 'epoch': 33.0}


 66%|██████▋   | 1590/2400 [34:05<19:58,  1.48s/it]

{'loss': 0.4057, 'grad_norm': 23.99127197265625, 'learning_rate': 1.6875000000000004e-05, 'epoch': 33.12}


 67%|██████▋   | 1600/2400 [34:16<15:02,  1.13s/it]

{'loss': 0.3111, 'grad_norm': 21.351696014404297, 'learning_rate': 1.6666666666666667e-05, 'epoch': 33.33}


 67%|██████▋   | 1610/2400 [34:27<14:43,  1.12s/it]

{'loss': 0.3322, 'grad_norm': 36.5560188293457, 'learning_rate': 1.6458333333333335e-05, 'epoch': 33.54}


 68%|██████▊   | 1620/2400 [34:39<14:33,  1.12s/it]

{'loss': 0.3468, 'grad_norm': 45.1473274230957, 'learning_rate': 1.6250000000000002e-05, 'epoch': 33.75}


 68%|██████▊   | 1630/2400 [34:50<14:24,  1.12s/it]

{'loss': 0.3688, 'grad_norm': 28.355510711669922, 'learning_rate': 1.604166666666667e-05, 'epoch': 33.96}


                                                   
 68%|██████▊   | 1632/2400 [34:59<13:11,  1.03s/it]

{'eval_loss': 0.9243371486663818, 'eval_runtime': 7.0405, 'eval_samples_per_second': 17.044, 'eval_steps_per_second': 2.131, 'epoch': 34.0}


 68%|██████▊   | 1640/2400 [35:08<16:34,  1.31s/it]

{'loss': 0.3377, 'grad_norm': 37.56144714355469, 'learning_rate': 1.5833333333333333e-05, 'epoch': 34.17}


 69%|██████▉   | 1650/2400 [35:20<14:09,  1.13s/it]

{'loss': 0.3483, 'grad_norm': 36.927955627441406, 'learning_rate': 1.5625e-05, 'epoch': 34.38}


 69%|██████▉   | 1660/2400 [35:31<13:55,  1.13s/it]

{'loss': 0.3127, 'grad_norm': 33.2502326965332, 'learning_rate': 1.5416666666666668e-05, 'epoch': 34.58}


 70%|██████▉   | 1670/2400 [35:42<13:32,  1.11s/it]

{'loss': 0.3331, 'grad_norm': 22.68816375732422, 'learning_rate': 1.5208333333333333e-05, 'epoch': 34.79}


 70%|███████   | 1680/2400 [35:53<12:19,  1.03s/it]

{'loss': 0.3556, 'grad_norm': 21.557479858398438, 'learning_rate': 1.5e-05, 'epoch': 35.0}


                                                   
 70%|███████   | 1680/2400 [36:00<12:19,  1.03s/it]

{'eval_loss': 0.9468952417373657, 'eval_runtime': 7.105, 'eval_samples_per_second': 16.889, 'eval_steps_per_second': 2.111, 'epoch': 35.0}


 70%|███████   | 1690/2400 [36:12<14:26,  1.22s/it]

{'loss': 0.3282, 'grad_norm': 32.77067565917969, 'learning_rate': 1.4791666666666668e-05, 'epoch': 35.21}


 71%|███████   | 1700/2400 [36:23<13:12,  1.13s/it]

{'loss': 0.3424, 'grad_norm': 45.557376861572266, 'learning_rate': 1.4583333333333335e-05, 'epoch': 35.42}


 71%|███████▏  | 1710/2400 [36:35<13:14,  1.15s/it]

{'loss': 0.2881, 'grad_norm': 23.968908309936523, 'learning_rate': 1.4374999999999999e-05, 'epoch': 35.62}


 72%|███████▏  | 1720/2400 [36:46<12:47,  1.13s/it]

{'loss': 0.3066, 'grad_norm': 29.49365234375, 'learning_rate': 1.4166666666666668e-05, 'epoch': 35.83}


                                                   
 72%|███████▏  | 1728/2400 [37:02<11:40,  1.04s/it]

{'eval_loss': 0.9055318832397461, 'eval_runtime': 7.0112, 'eval_samples_per_second': 17.116, 'eval_steps_per_second': 2.139, 'epoch': 36.0}


 72%|███████▏  | 1730/2400 [37:05<29:35,  2.65s/it]

{'loss': 0.315, 'grad_norm': 30.03326988220215, 'learning_rate': 1.3958333333333335e-05, 'epoch': 36.04}


 72%|███████▎  | 1740/2400 [37:16<13:10,  1.20s/it]

{'loss': 0.34, 'grad_norm': 38.254737854003906, 'learning_rate': 1.3750000000000002e-05, 'epoch': 36.25}


 73%|███████▎  | 1750/2400 [37:28<12:29,  1.15s/it]

{'loss': 0.3223, 'grad_norm': 25.30882453918457, 'learning_rate': 1.3541666666666666e-05, 'epoch': 36.46}


 73%|███████▎  | 1760/2400 [37:39<12:33,  1.18s/it]

{'loss': 0.2862, 'grad_norm': 54.161006927490234, 'learning_rate': 1.3333333333333333e-05, 'epoch': 36.67}


 74%|███████▍  | 1770/2400 [37:51<12:00,  1.14s/it]

{'loss': 0.2462, 'grad_norm': 48.69904708862305, 'learning_rate': 1.3125e-05, 'epoch': 36.88}


                                                   
 74%|███████▍  | 1776/2400 [38:04<10:49,  1.04s/it]

{'eval_loss': 0.9314776659011841, 'eval_runtime': 7.1134, 'eval_samples_per_second': 16.869, 'eval_steps_per_second': 2.109, 'epoch': 37.0}


 74%|███████▍  | 1780/2400 [38:09<19:23,  1.88s/it]

{'loss': 0.3018, 'grad_norm': 33.72697067260742, 'learning_rate': 1.2916666666666668e-05, 'epoch': 37.08}


 75%|███████▍  | 1790/2400 [38:21<11:52,  1.17s/it]

{'loss': 0.3016, 'grad_norm': 34.3427848815918, 'learning_rate': 1.2708333333333333e-05, 'epoch': 37.29}


 75%|███████▌  | 1800/2400 [38:33<11:44,  1.17s/it]

{'loss': 0.2985, 'grad_norm': 26.406972885131836, 'learning_rate': 1.25e-05, 'epoch': 37.5}


 75%|███████▌  | 1810/2400 [38:44<11:16,  1.15s/it]

{'loss': 0.2732, 'grad_norm': 28.58176040649414, 'learning_rate': 1.2291666666666666e-05, 'epoch': 37.71}


 76%|███████▌  | 1820/2400 [38:56<11:17,  1.17s/it]

{'loss': 0.2537, 'grad_norm': 35.65806198120117, 'learning_rate': 1.2083333333333333e-05, 'epoch': 37.92}


                                                   
 76%|███████▌  | 1824/2400 [39:07<10:13,  1.06s/it]

{'eval_loss': 0.9481073021888733, 'eval_runtime': 7.2687, 'eval_samples_per_second': 16.509, 'eval_steps_per_second': 2.064, 'epoch': 38.0}


 76%|███████▋  | 1830/2400 [39:15<14:52,  1.57s/it]

{'loss': 0.2817, 'grad_norm': 20.211307525634766, 'learning_rate': 1.1875e-05, 'epoch': 38.12}


 77%|███████▋  | 1840/2400 [39:27<11:06,  1.19s/it]

{'loss': 0.2323, 'grad_norm': 49.9239616394043, 'learning_rate': 1.1666666666666668e-05, 'epoch': 38.33}


 77%|███████▋  | 1850/2400 [39:38<10:30,  1.15s/it]

{'loss': 0.2651, 'grad_norm': 31.185880661010742, 'learning_rate': 1.1458333333333333e-05, 'epoch': 38.54}


 78%|███████▊  | 1860/2400 [39:50<10:31,  1.17s/it]

{'loss': 0.2935, 'grad_norm': 24.590755462646484, 'learning_rate': 1.125e-05, 'epoch': 38.75}


 78%|███████▊  | 1870/2400 [40:02<10:14,  1.16s/it]

{'loss': 0.3194, 'grad_norm': 21.341054916381836, 'learning_rate': 1.1041666666666666e-05, 'epoch': 38.96}


                                                   
 78%|███████▊  | 1872/2400 [40:11<09:15,  1.05s/it]

{'eval_loss': 0.9296230673789978, 'eval_runtime': 7.028, 'eval_samples_per_second': 17.074, 'eval_steps_per_second': 2.134, 'epoch': 39.0}


 78%|███████▊  | 1880/2400 [40:20<11:21,  1.31s/it]

{'loss': 0.2451, 'grad_norm': 35.74227523803711, 'learning_rate': 1.0833333333333334e-05, 'epoch': 39.17}


 79%|███████▉  | 1890/2400 [40:32<09:38,  1.13s/it]

{'loss': 0.2445, 'grad_norm': 25.836048126220703, 'learning_rate': 1.0625e-05, 'epoch': 39.38}


 79%|███████▉  | 1900/2400 [40:43<09:23,  1.13s/it]

{'loss': 0.2364, 'grad_norm': 29.52741813659668, 'learning_rate': 1.0416666666666668e-05, 'epoch': 39.58}


 80%|███████▉  | 1910/2400 [40:54<09:12,  1.13s/it]

{'loss': 0.2942, 'grad_norm': 24.769454956054688, 'learning_rate': 1.0208333333333334e-05, 'epoch': 39.79}


 80%|████████  | 1920/2400 [41:05<08:23,  1.05s/it]

{'loss': 0.2594, 'grad_norm': 30.718280792236328, 'learning_rate': 1e-05, 'epoch': 40.0}


                                                   
 80%|████████  | 1920/2400 [41:12<08:23,  1.05s/it]

{'eval_loss': 0.9347819089889526, 'eval_runtime': 7.1597, 'eval_samples_per_second': 16.761, 'eval_steps_per_second': 2.095, 'epoch': 40.0}


 80%|████████  | 1930/2400 [41:24<09:37,  1.23s/it]

{'loss': 0.2533, 'grad_norm': 31.076778411865234, 'learning_rate': 9.791666666666666e-06, 'epoch': 40.21}


 81%|████████  | 1940/2400 [41:35<08:33,  1.12s/it]

{'loss': 0.2112, 'grad_norm': 40.104190826416016, 'learning_rate': 9.583333333333334e-06, 'epoch': 40.42}


 81%|████████▏ | 1950/2400 [41:46<08:22,  1.12s/it]

{'loss': 0.2373, 'grad_norm': 24.19012451171875, 'learning_rate': 9.375000000000001e-06, 'epoch': 40.62}


 82%|████████▏ | 1960/2400 [41:58<08:13,  1.12s/it]

{'loss': 0.2279, 'grad_norm': 38.943580627441406, 'learning_rate': 9.166666666666666e-06, 'epoch': 40.83}


                                                   
 82%|████████▏ | 1968/2400 [42:13<07:28,  1.04s/it]

{'eval_loss': 0.9559744596481323, 'eval_runtime': 7.0141, 'eval_samples_per_second': 17.108, 'eval_steps_per_second': 2.139, 'epoch': 41.0}


 82%|████████▏ | 1970/2400 [42:16<18:58,  2.65s/it]

{'loss': 0.1974, 'grad_norm': 36.369014739990234, 'learning_rate': 8.958333333333334e-06, 'epoch': 41.04}


 82%|████████▎ | 1980/2400 [42:27<08:12,  1.17s/it]

{'loss': 0.1937, 'grad_norm': 23.297861099243164, 'learning_rate': 8.75e-06, 'epoch': 41.25}


 83%|████████▎ | 1990/2400 [42:39<07:37,  1.11s/it]

{'loss': 0.2435, 'grad_norm': 33.47713851928711, 'learning_rate': 8.541666666666666e-06, 'epoch': 41.46}


 83%|████████▎ | 2000/2400 [42:50<07:28,  1.12s/it]

{'loss': 0.2153, 'grad_norm': 25.547161102294922, 'learning_rate': 8.333333333333334e-06, 'epoch': 41.67}


 84%|████████▍ | 2010/2400 [43:01<07:25,  1.14s/it]

{'loss': 0.2247, 'grad_norm': 28.244171142578125, 'learning_rate': 8.125000000000001e-06, 'epoch': 41.88}


                                                   
 84%|████████▍ | 2016/2400 [43:15<06:36,  1.03s/it]

{'eval_loss': 0.9344708323478699, 'eval_runtime': 6.9864, 'eval_samples_per_second': 17.176, 'eval_steps_per_second': 2.147, 'epoch': 42.0}


 84%|████████▍ | 2020/2400 [43:20<11:56,  1.88s/it]

{'loss': 0.2422, 'grad_norm': 44.68596267700195, 'learning_rate': 7.916666666666667e-06, 'epoch': 42.08}


 85%|████████▍ | 2030/2400 [43:31<07:06,  1.15s/it]

{'loss': 0.1945, 'grad_norm': 29.118715286254883, 'learning_rate': 7.708333333333334e-06, 'epoch': 42.29}


 85%|████████▌ | 2040/2400 [43:43<07:25,  1.24s/it]

{'loss': 0.2467, 'grad_norm': 26.13985824584961, 'learning_rate': 7.5e-06, 'epoch': 42.5}


 85%|████████▌ | 2050/2400 [43:55<06:59,  1.20s/it]

{'loss': 0.2176, 'grad_norm': 28.834259033203125, 'learning_rate': 7.2916666666666674e-06, 'epoch': 42.71}


 86%|████████▌ | 2060/2400 [44:07<06:22,  1.12s/it]

{'loss': 0.2376, 'grad_norm': 30.014572143554688, 'learning_rate': 7.083333333333334e-06, 'epoch': 42.92}


                                                   
 86%|████████▌ | 2064/2400 [44:18<05:44,  1.03s/it]

{'eval_loss': 0.9398954510688782, 'eval_runtime': 6.8326, 'eval_samples_per_second': 17.563, 'eval_steps_per_second': 2.195, 'epoch': 43.0}


 86%|████████▋ | 2070/2400 [44:25<08:10,  1.49s/it]

{'loss': 0.1836, 'grad_norm': 28.0694637298584, 'learning_rate': 6.875000000000001e-06, 'epoch': 43.12}


 87%|████████▋ | 2080/2400 [44:36<06:04,  1.14s/it]

{'loss': 0.1943, 'grad_norm': 25.309249877929688, 'learning_rate': 6.666666666666667e-06, 'epoch': 43.33}


 87%|████████▋ | 2090/2400 [44:47<05:47,  1.12s/it]

{'loss': 0.1847, 'grad_norm': 33.857215881347656, 'learning_rate': 6.458333333333334e-06, 'epoch': 43.54}


 88%|████████▊ | 2100/2400 [44:59<05:39,  1.13s/it]

{'loss': 0.2004, 'grad_norm': 37.35911178588867, 'learning_rate': 6.25e-06, 'epoch': 43.75}


 88%|████████▊ | 2110/2400 [45:10<05:29,  1.14s/it]

{'loss': 0.2175, 'grad_norm': 47.56922912597656, 'learning_rate': 6.041666666666667e-06, 'epoch': 43.96}


                                                   
 88%|████████▊ | 2112/2400 [45:19<04:57,  1.03s/it]

{'eval_loss': 0.9425292611122131, 'eval_runtime': 6.9297, 'eval_samples_per_second': 17.317, 'eval_steps_per_second': 2.165, 'epoch': 44.0}


 88%|████████▊ | 2120/2400 [45:28<06:11,  1.33s/it]

{'loss': 0.1787, 'grad_norm': 34.89528274536133, 'learning_rate': 5.833333333333334e-06, 'epoch': 44.17}


 89%|████████▉ | 2130/2400 [45:40<05:05,  1.13s/it]

{'loss': 0.189, 'grad_norm': 27.419979095458984, 'learning_rate': 5.625e-06, 'epoch': 44.38}


 89%|████████▉ | 2140/2400 [45:51<04:51,  1.12s/it]

{'loss': 0.2229, 'grad_norm': 35.75274658203125, 'learning_rate': 5.416666666666667e-06, 'epoch': 44.58}


 90%|████████▉ | 2150/2400 [46:02<04:43,  1.14s/it]

{'loss': 0.1846, 'grad_norm': 28.787189483642578, 'learning_rate': 5.208333333333334e-06, 'epoch': 44.79}


 90%|█████████ | 2160/2400 [46:13<04:06,  1.03s/it]

{'loss': 0.1617, 'grad_norm': 29.950586318969727, 'learning_rate': 5e-06, 'epoch': 45.0}


                                                   
 90%|█████████ | 2160/2400 [46:20<04:06,  1.03s/it]

{'eval_loss': 0.9388759136199951, 'eval_runtime': 6.9199, 'eval_samples_per_second': 17.341, 'eval_steps_per_second': 2.168, 'epoch': 45.0}


 90%|█████████ | 2170/2400 [46:32<04:42,  1.23s/it]

{'loss': 0.1713, 'grad_norm': 21.549373626708984, 'learning_rate': 4.791666666666667e-06, 'epoch': 45.21}


 91%|█████████ | 2180/2400 [46:44<04:26,  1.21s/it]

{'loss': 0.1934, 'grad_norm': 44.96153259277344, 'learning_rate': 4.583333333333333e-06, 'epoch': 45.42}


 91%|█████████▏| 2190/2400 [46:55<03:57,  1.13s/it]

{'loss': 0.1816, 'grad_norm': 29.226911544799805, 'learning_rate': 4.375e-06, 'epoch': 45.62}


 92%|█████████▏| 2200/2400 [47:07<03:46,  1.13s/it]

{'loss': 0.1465, 'grad_norm': 40.05644226074219, 'learning_rate': 4.166666666666667e-06, 'epoch': 45.83}


                                                   
 92%|█████████▏| 2208/2400 [47:22<03:19,  1.04s/it]

{'eval_loss': 0.9485101103782654, 'eval_runtime': 6.87, 'eval_samples_per_second': 17.467, 'eval_steps_per_second': 2.183, 'epoch': 46.0}


 92%|█████████▏| 2210/2400 [47:25<08:18,  2.63s/it]

{'loss': 0.1212, 'grad_norm': 35.514320373535156, 'learning_rate': 3.958333333333333e-06, 'epoch': 46.04}


 92%|█████████▎| 2220/2400 [47:37<03:32,  1.18s/it]

{'loss': 0.1633, 'grad_norm': 28.848323822021484, 'learning_rate': 3.75e-06, 'epoch': 46.25}


 93%|█████████▎| 2230/2400 [47:48<03:09,  1.11s/it]

{'loss': 0.1755, 'grad_norm': 29.150388717651367, 'learning_rate': 3.541666666666667e-06, 'epoch': 46.46}


 93%|█████████▎| 2240/2400 [47:59<02:58,  1.11s/it]

{'loss': 0.1663, 'grad_norm': 41.344486236572266, 'learning_rate': 3.3333333333333333e-06, 'epoch': 46.67}


 94%|█████████▍| 2250/2400 [48:10<02:47,  1.11s/it]

{'loss': 0.1639, 'grad_norm': 32.293392181396484, 'learning_rate': 3.125e-06, 'epoch': 46.88}


                                                   
 94%|█████████▍| 2256/2400 [48:23<02:26,  1.01s/it]

{'eval_loss': 0.9385384321212769, 'eval_runtime': 6.9814, 'eval_samples_per_second': 17.188, 'eval_steps_per_second': 2.149, 'epoch': 47.0}


 94%|█████████▍| 2260/2400 [48:28<04:23,  1.88s/it]

{'loss': 0.167, 'grad_norm': 29.694005966186523, 'learning_rate': 2.916666666666667e-06, 'epoch': 47.08}


 95%|█████████▍| 2270/2400 [48:40<02:29,  1.15s/it]

{'loss': 0.1141, 'grad_norm': 32.72202682495117, 'learning_rate': 2.7083333333333334e-06, 'epoch': 47.29}


 95%|█████████▌| 2280/2400 [48:51<02:15,  1.13s/it]

{'loss': 0.1863, 'grad_norm': 21.975406646728516, 'learning_rate': 2.5e-06, 'epoch': 47.5}


 95%|█████████▌| 2290/2400 [49:02<02:01,  1.10s/it]

{'loss': 0.1273, 'grad_norm': 29.737415313720703, 'learning_rate': 2.2916666666666666e-06, 'epoch': 47.71}


 96%|█████████▌| 2300/2400 [49:13<01:50,  1.10s/it]

{'loss': 0.1507, 'grad_norm': 26.102453231811523, 'learning_rate': 2.0833333333333334e-06, 'epoch': 47.92}


                                                   
 96%|█████████▌| 2304/2400 [49:24<01:36,  1.01s/it]

{'eval_loss': 0.9499157667160034, 'eval_runtime': 6.8721, 'eval_samples_per_second': 17.462, 'eval_steps_per_second': 2.183, 'epoch': 48.0}


 96%|█████████▋| 2310/2400 [49:31<02:12,  1.47s/it]

{'loss': 0.1745, 'grad_norm': 16.61427116394043, 'learning_rate': 1.875e-06, 'epoch': 48.12}


 97%|█████████▋| 2320/2400 [49:42<01:32,  1.16s/it]

{'loss': 0.1582, 'grad_norm': 31.606468200683594, 'learning_rate': 1.6666666666666667e-06, 'epoch': 48.33}


 97%|█████████▋| 2330/2400 [49:54<01:19,  1.13s/it]

{'loss': 0.1584, 'grad_norm': 30.60253143310547, 'learning_rate': 1.4583333333333335e-06, 'epoch': 48.54}


 98%|█████████▊| 2340/2400 [50:05<01:05,  1.10s/it]

{'loss': 0.1455, 'grad_norm': 36.39582824707031, 'learning_rate': 1.25e-06, 'epoch': 48.75}


 98%|█████████▊| 2350/2400 [50:16<00:54,  1.09s/it]

{'loss': 0.1182, 'grad_norm': 21.667299270629883, 'learning_rate': 1.0416666666666667e-06, 'epoch': 48.96}


                                                   
 98%|█████████▊| 2352/2400 [50:24<00:48,  1.00s/it]

{'eval_loss': 0.9432058930397034, 'eval_runtime': 6.796, 'eval_samples_per_second': 17.657, 'eval_steps_per_second': 2.207, 'epoch': 49.0}


 98%|█████████▊| 2360/2400 [50:34<00:50,  1.27s/it]

{'loss': 0.1368, 'grad_norm': 21.043956756591797, 'learning_rate': 8.333333333333333e-07, 'epoch': 49.17}


 99%|█████████▉| 2370/2400 [50:45<00:33,  1.11s/it]

{'loss': 0.1426, 'grad_norm': 25.23729133605957, 'learning_rate': 6.25e-07, 'epoch': 49.38}


 99%|█████████▉| 2380/2400 [50:56<00:21,  1.10s/it]

{'loss': 0.1068, 'grad_norm': 24.69630241394043, 'learning_rate': 4.1666666666666667e-07, 'epoch': 49.58}


100%|█████████▉| 2390/2400 [51:07<00:10,  1.09s/it]

{'loss': 0.1954, 'grad_norm': 25.97264289855957, 'learning_rate': 2.0833333333333333e-07, 'epoch': 49.79}


100%|██████████| 2400/2400 [51:17<00:00,  1.01s/it]

{'loss': 0.1392, 'grad_norm': 22.109771728515625, 'learning_rate': 0.0, 'epoch': 50.0}


                                                   
100%|██████████| 2400/2400 [51:24<00:00,  1.29s/it]


{'eval_loss': 0.947696328163147, 'eval_runtime': 6.7448, 'eval_samples_per_second': 17.791, 'eval_steps_per_second': 2.224, 'epoch': 50.0}
{'train_runtime': 3084.495, 'train_samples_per_second': 7.781, 'train_steps_per_second': 0.778, 'train_loss': 0.5806682108342648, 'epoch': 50.0}


100%|██████████| 15/15 [00:05<00:00,  2.51it/s]
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetecti

Using device: cuda:0


  0%|          | 10/2400 [00:09<36:44,  1.08it/s] 

{'loss': 2.5301, 'grad_norm': 294.0908508300781, 'learning_rate': 4.979166666666667e-05, 'epoch': 0.21}


  1%|          | 20/2400 [00:19<36:04,  1.10it/s]

{'loss': 2.1859, 'grad_norm': 148.22686767578125, 'learning_rate': 4.958333333333334e-05, 'epoch': 0.42}


  1%|▏         | 30/2400 [00:28<35:35,  1.11it/s]

{'loss': 2.5357, 'grad_norm': 59.04261779785156, 'learning_rate': 4.937500000000001e-05, 'epoch': 0.62}


  2%|▏         | 40/2400 [00:37<36:13,  1.09it/s]

{'loss': 1.7692, 'grad_norm': 81.17992401123047, 'learning_rate': 4.9166666666666665e-05, 'epoch': 0.83}


  2%|▏         | 48/2400 [00:44<32:06,  1.22it/s]
  2%|▏         | 48/2400 [00:51<32:06,  1.22it/s]

{'eval_loss': 1.5786865949630737, 'eval_runtime': 7.7023, 'eval_samples_per_second': 15.58, 'eval_steps_per_second': 1.947, 'epoch': 1.0}


  2%|▏         | 50/2400 [00:54<1:40:16,  2.56s/it]

{'loss': 1.7603, 'grad_norm': 60.770328521728516, 'learning_rate': 4.8958333333333335e-05, 'epoch': 1.04}


  2%|▎         | 60/2400 [01:03<39:10,  1.00s/it]  

{'loss': 1.3443, 'grad_norm': 36.38862609863281, 'learning_rate': 4.875e-05, 'epoch': 1.25}


  3%|▎         | 70/2400 [01:12<34:52,  1.11it/s]

{'loss': 1.3158, 'grad_norm': 59.92686462402344, 'learning_rate': 4.854166666666667e-05, 'epoch': 1.46}


  3%|▎         | 80/2400 [01:22<36:18,  1.06it/s]

{'loss': 1.4355, 'grad_norm': 47.52374267578125, 'learning_rate': 4.8333333333333334e-05, 'epoch': 1.67}


  4%|▍         | 90/2400 [01:31<34:51,  1.10it/s]

{'loss': 1.3314, 'grad_norm': 36.580177307128906, 'learning_rate': 4.8125000000000004e-05, 'epoch': 1.88}


  4%|▍         | 96/2400 [01:36<30:42,  1.25it/s]
  4%|▍         | 96/2400 [01:43<30:42,  1.25it/s]

{'eval_loss': 1.346444010734558, 'eval_runtime': 7.6458, 'eval_samples_per_second': 15.695, 'eval_steps_per_second': 1.962, 'epoch': 2.0}


  4%|▍         | 100/2400 [01:47<1:05:14,  1.70s/it]

{'loss': 1.3999, 'grad_norm': 45.79840850830078, 'learning_rate': 4.791666666666667e-05, 'epoch': 2.08}


  5%|▍         | 110/2400 [01:56<35:06,  1.09it/s]  

{'loss': 1.281, 'grad_norm': 80.77574920654297, 'learning_rate': 4.770833333333334e-05, 'epoch': 2.29}


  5%|▌         | 120/2400 [02:05<34:07,  1.11it/s]

{'loss': 1.3623, 'grad_norm': 34.345401763916016, 'learning_rate': 4.75e-05, 'epoch': 2.5}


  5%|▌         | 130/2400 [02:14<34:00,  1.11it/s]

{'loss': 1.4103, 'grad_norm': 29.348041534423828, 'learning_rate': 4.7291666666666666e-05, 'epoch': 2.71}


  6%|▌         | 140/2400 [02:23<33:59,  1.11it/s]

{'loss': 1.5116, 'grad_norm': 33.68116760253906, 'learning_rate': 4.708333333333334e-05, 'epoch': 2.92}


  6%|▌         | 144/2400 [02:27<29:44,  1.26it/s]
  6%|▌         | 144/2400 [02:34<29:44,  1.26it/s]

{'eval_loss': 1.2775428295135498, 'eval_runtime': 7.5227, 'eval_samples_per_second': 15.952, 'eval_steps_per_second': 1.994, 'epoch': 3.0}


  6%|▋         | 150/2400 [02:40<47:40,  1.27s/it]  

{'loss': 1.4502, 'grad_norm': 87.45465850830078, 'learning_rate': 4.6875e-05, 'epoch': 3.12}


  7%|▋         | 160/2400 [02:49<34:49,  1.07it/s]

{'loss': 1.4572, 'grad_norm': 32.53771209716797, 'learning_rate': 4.666666666666667e-05, 'epoch': 3.33}


  7%|▋         | 170/2400 [02:59<35:39,  1.04it/s]

{'loss': 1.1292, 'grad_norm': 108.24874114990234, 'learning_rate': 4.6458333333333335e-05, 'epoch': 3.54}


  8%|▊         | 180/2400 [03:09<35:56,  1.03it/s]

{'loss': 1.5179, 'grad_norm': 62.15097427368164, 'learning_rate': 4.6250000000000006e-05, 'epoch': 3.75}


  8%|▊         | 190/2400 [03:18<35:57,  1.02it/s]

{'loss': 1.3616, 'grad_norm': 57.69001388549805, 'learning_rate': 4.604166666666666e-05, 'epoch': 3.96}


  8%|▊         | 192/2400 [03:20<30:46,  1.20it/s]
  8%|▊         | 192/2400 [03:27<30:46,  1.20it/s]

{'eval_loss': 1.4091777801513672, 'eval_runtime': 7.5832, 'eval_samples_per_second': 15.824, 'eval_steps_per_second': 1.978, 'epoch': 4.0}


  8%|▊         | 200/2400 [03:35<39:46,  1.08s/it]  

{'loss': 1.4459, 'grad_norm': 33.53784942626953, 'learning_rate': 4.5833333333333334e-05, 'epoch': 4.17}


  9%|▉         | 210/2400 [03:44<32:58,  1.11it/s]

{'loss': 1.1644, 'grad_norm': 168.3345184326172, 'learning_rate': 4.5625e-05, 'epoch': 4.38}


  9%|▉         | 220/2400 [03:53<34:54,  1.04it/s]

{'loss': 1.3945, 'grad_norm': 88.47871398925781, 'learning_rate': 4.541666666666667e-05, 'epoch': 4.58}


 10%|▉         | 230/2400 [04:03<36:02,  1.00it/s]

{'loss': 1.2265, 'grad_norm': 40.190181732177734, 'learning_rate': 4.520833333333334e-05, 'epoch': 4.79}


 10%|█         | 240/2400 [04:13<31:18,  1.15it/s]

{'loss': 1.3133, 'grad_norm': 192.2684783935547, 'learning_rate': 4.5e-05, 'epoch': 5.0}



 10%|█         | 240/2400 [04:21<31:18,  1.15it/s]

{'eval_loss': 1.2718377113342285, 'eval_runtime': 8.0763, 'eval_samples_per_second': 14.858, 'eval_steps_per_second': 1.857, 'epoch': 5.0}


 10%|█         | 250/2400 [04:31<38:14,  1.07s/it]  

{'loss': 1.1999, 'grad_norm': 40.84565734863281, 'learning_rate': 4.4791666666666673e-05, 'epoch': 5.21}


 11%|█         | 260/2400 [04:40<32:26,  1.10it/s]

{'loss': 1.5128, 'grad_norm': 29.41852569580078, 'learning_rate': 4.458333333333334e-05, 'epoch': 5.42}


 11%|█▏        | 270/2400 [04:49<31:34,  1.12it/s]

{'loss': 1.3845, 'grad_norm': 33.898834228515625, 'learning_rate': 4.4375e-05, 'epoch': 5.62}


 12%|█▏        | 280/2400 [04:58<31:25,  1.12it/s]

{'loss': 1.1722, 'grad_norm': 38.75770568847656, 'learning_rate': 4.4166666666666665e-05, 'epoch': 5.83}


 12%|█▏        | 288/2400 [05:05<27:41,  1.27it/s]
 12%|█▏        | 288/2400 [05:12<27:41,  1.27it/s]

{'eval_loss': 1.2897549867630005, 'eval_runtime': 7.4943, 'eval_samples_per_second': 16.012, 'eval_steps_per_second': 2.002, 'epoch': 6.0}


 12%|█▏        | 290/2400 [05:15<1:27:37,  2.49s/it]

{'loss': 1.2485, 'grad_norm': 30.821353912353516, 'learning_rate': 4.3958333333333336e-05, 'epoch': 6.04}


 12%|█▎        | 300/2400 [05:23<33:00,  1.06it/s]  

{'loss': 1.1342, 'grad_norm': 33.9752197265625, 'learning_rate': 4.375e-05, 'epoch': 6.25}


 13%|█▎        | 310/2400 [05:32<30:26,  1.14it/s]

{'loss': 1.1146, 'grad_norm': 39.09653854370117, 'learning_rate': 4.354166666666667e-05, 'epoch': 6.46}


 13%|█▎        | 320/2400 [05:41<30:37,  1.13it/s]

{'loss': 1.3425, 'grad_norm': 29.59172821044922, 'learning_rate': 4.3333333333333334e-05, 'epoch': 6.67}


 14%|█▍        | 330/2400 [05:50<30:30,  1.13it/s]

{'loss': 1.1921, 'grad_norm': 36.791648864746094, 'learning_rate': 4.3125000000000005e-05, 'epoch': 6.88}


 14%|█▍        | 336/2400 [05:55<27:03,  1.27it/s]
 14%|█▍        | 336/2400 [06:02<27:03,  1.27it/s]

{'eval_loss': 1.133249044418335, 'eval_runtime': 7.4425, 'eval_samples_per_second': 16.124, 'eval_steps_per_second': 2.015, 'epoch': 7.0}


 14%|█▍        | 340/2400 [06:06<57:08,  1.66s/it]  

{'loss': 1.0885, 'grad_norm': 32.74349594116211, 'learning_rate': 4.291666666666667e-05, 'epoch': 7.08}


 15%|█▍        | 350/2400 [06:15<30:26,  1.12it/s]

{'loss': 1.3126, 'grad_norm': 40.48517608642578, 'learning_rate': 4.270833333333333e-05, 'epoch': 7.29}


 15%|█▌        | 360/2400 [06:24<30:09,  1.13it/s]

{'loss': 1.1966, 'grad_norm': 72.59716033935547, 'learning_rate': 4.25e-05, 'epoch': 7.5}


 15%|█▌        | 370/2400 [06:33<29:56,  1.13it/s]

{'loss': 1.1351, 'grad_norm': 22.454700469970703, 'learning_rate': 4.229166666666667e-05, 'epoch': 7.71}


 16%|█▌        | 380/2400 [06:42<30:06,  1.12it/s]

{'loss': 1.1609, 'grad_norm': 37.758358001708984, 'learning_rate': 4.208333333333334e-05, 'epoch': 7.92}


 16%|█▌        | 384/2400 [06:45<26:26,  1.27it/s]
 16%|█▌        | 384/2400 [06:53<26:26,  1.27it/s]

{'eval_loss': 1.0838223695755005, 'eval_runtime': 7.4182, 'eval_samples_per_second': 16.176, 'eval_steps_per_second': 2.022, 'epoch': 8.0}


 16%|█▋        | 390/2400 [06:58<42:19,  1.26s/it]  

{'loss': 1.076, 'grad_norm': 31.79969596862793, 'learning_rate': 4.1875e-05, 'epoch': 8.12}


 17%|█▋        | 400/2400 [07:07<29:58,  1.11it/s]

{'loss': 0.9674, 'grad_norm': 34.63935470581055, 'learning_rate': 4.166666666666667e-05, 'epoch': 8.33}


 17%|█▋        | 410/2400 [07:16<29:10,  1.14it/s]

{'loss': 1.3035, 'grad_norm': 78.68498229980469, 'learning_rate': 4.1458333333333336e-05, 'epoch': 8.54}


 18%|█▊        | 420/2400 [07:25<30:09,  1.09it/s]

{'loss': 1.0846, 'grad_norm': 31.815893173217773, 'learning_rate': 4.125e-05, 'epoch': 8.75}


 18%|█▊        | 430/2400 [07:34<29:48,  1.10it/s]

{'loss': 1.2317, 'grad_norm': 37.07801055908203, 'learning_rate': 4.104166666666667e-05, 'epoch': 8.96}


 18%|█▊        | 432/2400 [07:36<26:07,  1.26it/s]
 18%|█▊        | 432/2400 [07:43<26:07,  1.26it/s]

{'eval_loss': 1.3445078134536743, 'eval_runtime': 7.545, 'eval_samples_per_second': 15.905, 'eval_steps_per_second': 1.988, 'epoch': 9.0}


 18%|█▊        | 440/2400 [07:51<35:09,  1.08s/it]  

{'loss': 1.4232, 'grad_norm': 36.97389602661133, 'learning_rate': 4.0833333333333334e-05, 'epoch': 9.17}


 19%|█▉        | 450/2400 [07:59<28:22,  1.15it/s]

{'loss': 1.2486, 'grad_norm': 51.466609954833984, 'learning_rate': 4.0625000000000005e-05, 'epoch': 9.38}


 19%|█▉        | 460/2400 [08:08<27:55,  1.16it/s]

{'loss': 1.0766, 'grad_norm': 41.07866287231445, 'learning_rate': 4.041666666666667e-05, 'epoch': 9.58}


 20%|█▉        | 470/2400 [08:17<28:21,  1.13it/s]

{'loss': 1.1971, 'grad_norm': 28.823606491088867, 'learning_rate': 4.020833333333334e-05, 'epoch': 9.79}


 20%|██        | 480/2400 [08:26<25:02,  1.28it/s]

{'loss': 1.3823, 'grad_norm': 77.8322982788086, 'learning_rate': 4e-05, 'epoch': 10.0}



 20%|██        | 480/2400 [08:33<25:02,  1.28it/s]

{'eval_loss': 1.1592308282852173, 'eval_runtime': 7.8749, 'eval_samples_per_second': 15.238, 'eval_steps_per_second': 1.905, 'epoch': 10.0}


 20%|██        | 490/2400 [08:43<32:58,  1.04s/it]  

{'loss': 1.3156, 'grad_norm': 60.50212097167969, 'learning_rate': 3.979166666666667e-05, 'epoch': 10.21}


 21%|██        | 500/2400 [08:53<30:38,  1.03it/s]

{'loss': 1.0003, 'grad_norm': 32.658721923828125, 'learning_rate': 3.958333333333333e-05, 'epoch': 10.42}


 21%|██▏       | 510/2400 [09:02<28:42,  1.10it/s]

{'loss': 1.1727, 'grad_norm': 21.22163963317871, 'learning_rate': 3.9375e-05, 'epoch': 10.62}


 22%|██▏       | 520/2400 [09:11<28:28,  1.10it/s]

{'loss': 1.0362, 'grad_norm': 243.30870056152344, 'learning_rate': 3.9166666666666665e-05, 'epoch': 10.83}


 22%|██▏       | 528/2400 [09:18<25:43,  1.21it/s]
 22%|██▏       | 528/2400 [09:26<25:43,  1.21it/s]

{'eval_loss': 1.1613600254058838, 'eval_runtime': 7.6736, 'eval_samples_per_second': 15.638, 'eval_steps_per_second': 1.955, 'epoch': 11.0}


 22%|██▏       | 530/2400 [09:28<1:19:58,  2.57s/it]

{'loss': 1.0309, 'grad_norm': 40.510047912597656, 'learning_rate': 3.8958333333333336e-05, 'epoch': 11.04}


 22%|██▎       | 540/2400 [09:37<30:05,  1.03it/s]  

{'loss': 0.9554, 'grad_norm': 29.56011199951172, 'learning_rate': 3.875e-05, 'epoch': 11.25}


 23%|██▎       | 550/2400 [09:46<28:09,  1.09it/s]

{'loss': 1.0805, 'grad_norm': 36.25480651855469, 'learning_rate': 3.854166666666667e-05, 'epoch': 11.46}


 23%|██▎       | 560/2400 [09:55<27:36,  1.11it/s]

{'loss': 1.1534, 'grad_norm': 116.7842788696289, 'learning_rate': 3.8333333333333334e-05, 'epoch': 11.67}


 24%|██▍       | 570/2400 [10:04<27:19,  1.12it/s]

{'loss': 1.1821, 'grad_norm': 122.49872589111328, 'learning_rate': 3.8125e-05, 'epoch': 11.88}


 24%|██▍       | 576/2400 [10:09<24:35,  1.24it/s]
 24%|██▍       | 576/2400 [10:17<24:35,  1.24it/s]

{'eval_loss': 1.0644527673721313, 'eval_runtime': 7.7149, 'eval_samples_per_second': 15.554, 'eval_steps_per_second': 1.944, 'epoch': 12.0}


 24%|██▍       | 580/2400 [10:21<53:05,  1.75s/it]  

{'loss': 0.9084, 'grad_norm': 32.40636444091797, 'learning_rate': 3.791666666666667e-05, 'epoch': 12.08}


 25%|██▍       | 590/2400 [10:31<31:13,  1.04s/it]

{'loss': 1.0883, 'grad_norm': 40.28506088256836, 'learning_rate': 3.770833333333333e-05, 'epoch': 12.29}


 25%|██▌       | 600/2400 [10:41<29:24,  1.02it/s]

{'loss': 1.0719, 'grad_norm': 83.20631408691406, 'learning_rate': 3.7500000000000003e-05, 'epoch': 12.5}


 25%|██▌       | 610/2400 [10:51<28:42,  1.04it/s]

{'loss': 1.263, 'grad_norm': 25.394922256469727, 'learning_rate': 3.729166666666667e-05, 'epoch': 12.71}


 26%|██▌       | 620/2400 [11:00<27:23,  1.08it/s]

{'loss': 1.3111, 'grad_norm': 33.31489562988281, 'learning_rate': 3.708333333333334e-05, 'epoch': 12.92}


 26%|██▌       | 624/2400 [11:03<24:15,  1.22it/s]
 26%|██▌       | 624/2400 [11:11<24:15,  1.22it/s]

{'eval_loss': 1.090072751045227, 'eval_runtime': 7.893, 'eval_samples_per_second': 15.203, 'eval_steps_per_second': 1.9, 'epoch': 13.0}


 26%|██▋       | 630/2400 [11:17<39:15,  1.33s/it]  

{'loss': 1.1163, 'grad_norm': 26.961244583129883, 'learning_rate': 3.6875e-05, 'epoch': 13.12}


 27%|██▋       | 640/2400 [11:26<27:20,  1.07it/s]

{'loss': 1.039, 'grad_norm': 26.830785751342773, 'learning_rate': 3.6666666666666666e-05, 'epoch': 13.33}


 27%|██▋       | 650/2400 [11:36<26:30,  1.10it/s]

{'loss': 1.1161, 'grad_norm': 54.41392517089844, 'learning_rate': 3.6458333333333336e-05, 'epoch': 13.54}


 28%|██▊       | 660/2400 [11:45<26:15,  1.10it/s]

{'loss': 1.0225, 'grad_norm': 20.18340301513672, 'learning_rate': 3.625e-05, 'epoch': 13.75}


 28%|██▊       | 670/2400 [11:54<26:01,  1.11it/s]

{'loss': 1.1435, 'grad_norm': 32.119319915771484, 'learning_rate': 3.604166666666667e-05, 'epoch': 13.96}


 28%|██▊       | 672/2400 [11:55<23:06,  1.25it/s]
 28%|██▊       | 672/2400 [12:03<23:06,  1.25it/s]

{'eval_loss': 1.129323124885559, 'eval_runtime': 7.5939, 'eval_samples_per_second': 15.802, 'eval_steps_per_second': 1.975, 'epoch': 14.0}


 28%|██▊       | 680/2400 [12:10<31:15,  1.09s/it]  

{'loss': 1.0406, 'grad_norm': 82.39964294433594, 'learning_rate': 3.5833333333333335e-05, 'epoch': 14.17}


 29%|██▉       | 690/2400 [12:20<26:25,  1.08it/s]

{'loss': 0.9829, 'grad_norm': 45.299285888671875, 'learning_rate': 3.5625000000000005e-05, 'epoch': 14.38}


 29%|██▉       | 700/2400 [12:29<25:46,  1.10it/s]

{'loss': 1.0269, 'grad_norm': 30.286773681640625, 'learning_rate': 3.541666666666667e-05, 'epoch': 14.58}


 30%|██▉       | 710/2400 [12:38<25:39,  1.10it/s]

{'loss': 1.1471, 'grad_norm': 73.08600616455078, 'learning_rate': 3.520833333333334e-05, 'epoch': 14.79}


 30%|███       | 720/2400 [12:46<22:28,  1.25it/s]

{'loss': 1.1275, 'grad_norm': 23.007568359375, 'learning_rate': 3.5e-05, 'epoch': 15.0}



 30%|███       | 720/2400 [12:54<22:28,  1.25it/s]

{'eval_loss': 1.0948556661605835, 'eval_runtime': 7.4946, 'eval_samples_per_second': 16.011, 'eval_steps_per_second': 2.001, 'epoch': 15.0}


 30%|███       | 730/2400 [13:03<27:22,  1.02it/s]  

{'loss': 1.035, 'grad_norm': 105.97747039794922, 'learning_rate': 3.479166666666667e-05, 'epoch': 15.21}


 31%|███       | 740/2400 [13:12<24:55,  1.11it/s]

{'loss': 1.0831, 'grad_norm': 29.097274780273438, 'learning_rate': 3.458333333333333e-05, 'epoch': 15.42}


 31%|███▏      | 750/2400 [13:21<24:51,  1.11it/s]

{'loss': 0.9519, 'grad_norm': 40.21162033081055, 'learning_rate': 3.4375e-05, 'epoch': 15.62}


 32%|███▏      | 760/2400 [13:31<25:28,  1.07it/s]

{'loss': 0.9445, 'grad_norm': 39.250946044921875, 'learning_rate': 3.4166666666666666e-05, 'epoch': 15.83}


 32%|███▏      | 768/2400 [13:38<22:05,  1.23it/s]
 32%|███▏      | 768/2400 [13:45<22:05,  1.23it/s]

{'eval_loss': 1.1108765602111816, 'eval_runtime': 7.6734, 'eval_samples_per_second': 15.639, 'eval_steps_per_second': 1.955, 'epoch': 16.0}


 32%|███▏      | 770/2400 [13:47<1:09:52,  2.57s/it]

{'loss': 1.1324, 'grad_norm': 25.009910583496094, 'learning_rate': 3.3958333333333337e-05, 'epoch': 16.04}


 32%|███▎      | 780/2400 [13:57<26:22,  1.02it/s]  

{'loss': 0.9731, 'grad_norm': 70.3175277709961, 'learning_rate': 3.375000000000001e-05, 'epoch': 16.25}


 33%|███▎      | 790/2400 [14:06<25:01,  1.07it/s]

{'loss': 1.0294, 'grad_norm': 28.149673461914062, 'learning_rate': 3.3541666666666664e-05, 'epoch': 16.46}


 33%|███▎      | 800/2400 [14:16<24:40,  1.08it/s]

{'loss': 0.9729, 'grad_norm': 47.39876174926758, 'learning_rate': 3.3333333333333335e-05, 'epoch': 16.67}


 34%|███▍      | 810/2400 [14:25<24:53,  1.06it/s]

{'loss': 1.0009, 'grad_norm': 32.11473846435547, 'learning_rate': 3.3125e-05, 'epoch': 16.88}


 34%|███▍      | 816/2400 [14:30<22:02,  1.20it/s]
 34%|███▍      | 816/2400 [14:38<22:02,  1.20it/s]

{'eval_loss': 1.0273692607879639, 'eval_runtime': 7.7081, 'eval_samples_per_second': 15.568, 'eval_steps_per_second': 1.946, 'epoch': 17.0}


 34%|███▍      | 820/2400 [14:42<45:37,  1.73s/it]  

{'loss': 0.9678, 'grad_norm': 51.90059280395508, 'learning_rate': 3.291666666666667e-05, 'epoch': 17.08}


 35%|███▍      | 830/2400 [14:51<24:30,  1.07it/s]

{'loss': 1.0539, 'grad_norm': 31.270610809326172, 'learning_rate': 3.270833333333333e-05, 'epoch': 17.29}


 35%|███▌      | 840/2400 [15:00<23:35,  1.10it/s]

{'loss': 1.0884, 'grad_norm': 42.32679748535156, 'learning_rate': 3.2500000000000004e-05, 'epoch': 17.5}


 35%|███▌      | 850/2400 [15:09<23:24,  1.10it/s]

{'loss': 0.9414, 'grad_norm': 26.86811065673828, 'learning_rate': 3.229166666666667e-05, 'epoch': 17.71}


 36%|███▌      | 860/2400 [15:19<23:32,  1.09it/s]

{'loss': 0.9221, 'grad_norm': 29.679351806640625, 'learning_rate': 3.208333333333334e-05, 'epoch': 17.92}


 36%|███▌      | 864/2400 [15:22<20:34,  1.24it/s]
 36%|███▌      | 864/2400 [15:30<20:34,  1.24it/s]

{'eval_loss': 1.057030200958252, 'eval_runtime': 7.9458, 'eval_samples_per_second': 15.102, 'eval_steps_per_second': 1.888, 'epoch': 18.0}


 36%|███▋      | 870/2400 [15:36<35:16,  1.38s/it]  

{'loss': 0.9344, 'grad_norm': 22.404325485229492, 'learning_rate': 3.1875e-05, 'epoch': 18.12}


 37%|███▋      | 880/2400 [15:46<25:01,  1.01it/s]

{'loss': 0.8233, 'grad_norm': 83.99898529052734, 'learning_rate': 3.1666666666666666e-05, 'epoch': 18.33}


 37%|███▋      | 890/2400 [15:56<23:26,  1.07it/s]

{'loss': 0.9595, 'grad_norm': 49.12004089355469, 'learning_rate': 3.145833333333334e-05, 'epoch': 18.54}


 38%|███▊      | 900/2400 [16:04<21:24,  1.17it/s]

{'loss': 1.1121, 'grad_norm': 22.836502075195312, 'learning_rate': 3.125e-05, 'epoch': 18.75}


 38%|███▊      | 910/2400 [16:13<21:13,  1.17it/s]

{'loss': 0.978, 'grad_norm': 161.64463806152344, 'learning_rate': 3.104166666666667e-05, 'epoch': 18.96}


 38%|███▊      | 912/2400 [16:14<19:03,  1.30it/s]
 38%|███▊      | 912/2400 [16:21<19:03,  1.30it/s]

{'eval_loss': 1.0791633129119873, 'eval_runtime': 6.8014, 'eval_samples_per_second': 17.643, 'eval_steps_per_second': 2.205, 'epoch': 19.0}


 38%|███▊      | 920/2400 [16:28<25:13,  1.02s/it]  

{'loss': 0.9285, 'grad_norm': 88.0418701171875, 'learning_rate': 3.0833333333333335e-05, 'epoch': 19.17}


 39%|███▉      | 930/2400 [16:37<21:08,  1.16it/s]

{'loss': 1.0781, 'grad_norm': 32.23320007324219, 'learning_rate': 3.0625000000000006e-05, 'epoch': 19.38}


 39%|███▉      | 940/2400 [16:45<20:46,  1.17it/s]

{'loss': 1.0687, 'grad_norm': 57.615596771240234, 'learning_rate': 3.0416666666666666e-05, 'epoch': 19.58}


 40%|███▉      | 950/2400 [16:54<21:00,  1.15it/s]

{'loss': 0.9091, 'grad_norm': 24.200685501098633, 'learning_rate': 3.0208333333333334e-05, 'epoch': 19.79}


 40%|████      | 960/2400 [17:03<19:15,  1.25it/s]

{'loss': 0.9241, 'grad_norm': 108.42658233642578, 'learning_rate': 3e-05, 'epoch': 20.0}



 40%|████      | 960/2400 [17:10<19:15,  1.25it/s]

{'eval_loss': 1.1535149812698364, 'eval_runtime': 7.4747, 'eval_samples_per_second': 16.054, 'eval_steps_per_second': 2.007, 'epoch': 20.0}


 40%|████      | 970/2400 [17:20<23:46,  1.00it/s]  

{'loss': 1.1471, 'grad_norm': 26.337453842163086, 'learning_rate': 2.9791666666666668e-05, 'epoch': 20.21}


 41%|████      | 980/2400 [17:28<21:12,  1.12it/s]

{'loss': 1.0161, 'grad_norm': 32.216522216796875, 'learning_rate': 2.9583333333333335e-05, 'epoch': 20.42}


 41%|████▏     | 990/2400 [17:37<20:28,  1.15it/s]

{'loss': 1.182, 'grad_norm': 42.119632720947266, 'learning_rate': 2.9375000000000003e-05, 'epoch': 20.62}


 42%|████▏     | 1000/2400 [17:46<20:13,  1.15it/s]

{'loss': 0.8809, 'grad_norm': 48.39412307739258, 'learning_rate': 2.916666666666667e-05, 'epoch': 20.83}


 42%|████▏     | 1008/2400 [17:53<18:00,  1.29it/s]
 42%|████▏     | 1008/2400 [18:00<18:00,  1.29it/s]

{'eval_loss': 1.0570813417434692, 'eval_runtime': 6.95, 'eval_samples_per_second': 17.266, 'eval_steps_per_second': 2.158, 'epoch': 21.0}


 42%|████▏     | 1010/2400 [18:02<54:25,  2.35s/it]  

{'loss': 0.8684, 'grad_norm': 30.915220260620117, 'learning_rate': 2.8958333333333337e-05, 'epoch': 21.04}


 42%|████▎     | 1020/2400 [18:10<20:55,  1.10it/s]

{'loss': 1.118, 'grad_norm': 23.511945724487305, 'learning_rate': 2.8749999999999997e-05, 'epoch': 21.25}


 43%|████▎     | 1030/2400 [18:19<20:14,  1.13it/s]

{'loss': 0.9383, 'grad_norm': 27.63957405090332, 'learning_rate': 2.8541666666666668e-05, 'epoch': 21.46}


 43%|████▎     | 1040/2400 [18:28<19:38,  1.15it/s]

{'loss': 1.0361, 'grad_norm': 106.59748077392578, 'learning_rate': 2.8333333333333335e-05, 'epoch': 21.67}


 44%|████▍     | 1050/2400 [18:36<19:26,  1.16it/s]

{'loss': 1.1638, 'grad_norm': 41.230308532714844, 'learning_rate': 2.8125000000000003e-05, 'epoch': 21.88}


 44%|████▍     | 1056/2400 [18:41<17:25,  1.29it/s]
 44%|████▍     | 1056/2400 [18:48<17:25,  1.29it/s]

{'eval_loss': 1.10440194606781, 'eval_runtime': 6.919, 'eval_samples_per_second': 17.343, 'eval_steps_per_second': 2.168, 'epoch': 22.0}


 44%|████▍     | 1060/2400 [18:52<35:33,  1.59s/it]  

{'loss': 0.9949, 'grad_norm': 28.213722229003906, 'learning_rate': 2.791666666666667e-05, 'epoch': 22.08}


 45%|████▍     | 1070/2400 [19:01<19:31,  1.14it/s]

{'loss': 0.9827, 'grad_norm': 43.59398651123047, 'learning_rate': 2.7708333333333337e-05, 'epoch': 22.29}


 45%|████▌     | 1080/2400 [19:09<19:05,  1.15it/s]

{'loss': 1.0022, 'grad_norm': 113.77378845214844, 'learning_rate': 2.7500000000000004e-05, 'epoch': 22.5}


 45%|████▌     | 1090/2400 [19:18<19:01,  1.15it/s]

{'loss': 1.0233, 'grad_norm': 58.11576461791992, 'learning_rate': 2.7291666666666665e-05, 'epoch': 22.71}


 46%|████▌     | 1100/2400 [19:27<18:49,  1.15it/s]

{'loss': 0.8841, 'grad_norm': 37.867034912109375, 'learning_rate': 2.7083333333333332e-05, 'epoch': 22.92}


 46%|████▌     | 1104/2400 [19:30<16:54,  1.28it/s]
 46%|████▌     | 1104/2400 [19:37<16:54,  1.28it/s]

{'eval_loss': 1.0481760501861572, 'eval_runtime': 6.957, 'eval_samples_per_second': 17.249, 'eval_steps_per_second': 2.156, 'epoch': 23.0}


 46%|████▋     | 1110/2400 [19:43<26:44,  1.24s/it]  

{'loss': 1.04, 'grad_norm': 22.754070281982422, 'learning_rate': 2.6875e-05, 'epoch': 23.12}


 47%|████▋     | 1120/2400 [19:51<18:23,  1.16it/s]

{'loss': 0.9459, 'grad_norm': 56.23242950439453, 'learning_rate': 2.6666666666666667e-05, 'epoch': 23.33}


 47%|████▋     | 1130/2400 [19:59<17:31,  1.21it/s]

{'loss': 0.8416, 'grad_norm': 27.99053192138672, 'learning_rate': 2.6458333333333334e-05, 'epoch': 23.54}


 48%|████▊     | 1140/2400 [20:08<17:38,  1.19it/s]

{'loss': 0.843, 'grad_norm': 31.419612884521484, 'learning_rate': 2.625e-05, 'epoch': 23.75}


 48%|████▊     | 1150/2400 [20:16<17:31,  1.19it/s]

{'loss': 0.9602, 'grad_norm': 22.56062889099121, 'learning_rate': 2.604166666666667e-05, 'epoch': 23.96}


 48%|████▊     | 1152/2400 [20:18<15:30,  1.34it/s]
 48%|████▊     | 1152/2400 [20:24<15:30,  1.34it/s]

{'eval_loss': 1.0062872171401978, 'eval_runtime': 6.8228, 'eval_samples_per_second': 17.588, 'eval_steps_per_second': 2.199, 'epoch': 24.0}


 48%|████▊     | 1160/2400 [20:32<21:01,  1.02s/it]  

{'loss': 0.9115, 'grad_norm': 23.140174865722656, 'learning_rate': 2.5833333333333336e-05, 'epoch': 24.17}


 49%|████▉     | 1170/2400 [20:40<17:21,  1.18it/s]

{'loss': 0.9789, 'grad_norm': 64.42611694335938, 'learning_rate': 2.5625e-05, 'epoch': 24.38}


 49%|████▉     | 1180/2400 [20:49<17:19,  1.17it/s]

{'loss': 0.9383, 'grad_norm': 24.787822723388672, 'learning_rate': 2.5416666666666667e-05, 'epoch': 24.58}


 50%|████▉     | 1190/2400 [20:57<16:55,  1.19it/s]

{'loss': 0.814, 'grad_norm': 25.059274673461914, 'learning_rate': 2.5208333333333334e-05, 'epoch': 24.79}


 50%|█████     | 1200/2400 [21:05<14:42,  1.36it/s]

{'loss': 0.9692, 'grad_norm': 45.734825134277344, 'learning_rate': 2.5e-05, 'epoch': 25.0}



 50%|█████     | 1200/2400 [21:12<14:42,  1.36it/s]

{'eval_loss': 0.9682490229606628, 'eval_runtime': 6.7997, 'eval_samples_per_second': 17.648, 'eval_steps_per_second': 2.206, 'epoch': 25.0}


 50%|█████     | 1210/2400 [21:21<18:23,  1.08it/s]

{'loss': 0.8322, 'grad_norm': 38.76671600341797, 'learning_rate': 2.479166666666667e-05, 'epoch': 25.21}


 51%|█████     | 1220/2400 [21:29<16:35,  1.19it/s]

{'loss': 0.9637, 'grad_norm': 18.131437301635742, 'learning_rate': 2.4583333333333332e-05, 'epoch': 25.42}


 51%|█████▏    | 1230/2400 [21:37<16:06,  1.21it/s]

{'loss': 0.9666, 'grad_norm': 29.628347396850586, 'learning_rate': 2.4375e-05, 'epoch': 25.62}


 52%|█████▏    | 1240/2400 [21:46<15:56,  1.21it/s]

{'loss': 0.7648, 'grad_norm': 34.86143112182617, 'learning_rate': 2.4166666666666667e-05, 'epoch': 25.83}


 52%|█████▏    | 1248/2400 [21:52<14:05,  1.36it/s]
 52%|█████▏    | 1248/2400 [21:59<14:05,  1.36it/s]

{'eval_loss': 1.0070383548736572, 'eval_runtime': 6.7096, 'eval_samples_per_second': 17.885, 'eval_steps_per_second': 2.236, 'epoch': 26.0}


 52%|█████▏    | 1250/2400 [22:01<43:12,  2.25s/it]

{'loss': 0.8491, 'grad_norm': 30.16522216796875, 'learning_rate': 2.3958333333333334e-05, 'epoch': 26.04}


 52%|█████▎    | 1260/2400 [22:09<16:26,  1.16it/s]

{'loss': 0.8666, 'grad_norm': 35.15849304199219, 'learning_rate': 2.375e-05, 'epoch': 26.25}


 53%|█████▎    | 1270/2400 [22:17<15:45,  1.20it/s]

{'loss': 0.8178, 'grad_norm': 35.83002853393555, 'learning_rate': 2.354166666666667e-05, 'epoch': 26.46}


 53%|█████▎    | 1280/2400 [22:25<15:26,  1.21it/s]

{'loss': 0.8649, 'grad_norm': 23.937034606933594, 'learning_rate': 2.3333333333333336e-05, 'epoch': 26.67}


 54%|█████▍    | 1290/2400 [22:34<15:21,  1.20it/s]

{'loss': 0.87, 'grad_norm': 27.523393630981445, 'learning_rate': 2.3125000000000003e-05, 'epoch': 26.88}


 54%|█████▍    | 1296/2400 [22:38<13:32,  1.36it/s]
 54%|█████▍    | 1296/2400 [22:45<13:32,  1.36it/s]

{'eval_loss': 0.9936547875404358, 'eval_runtime': 6.7432, 'eval_samples_per_second': 17.796, 'eval_steps_per_second': 2.224, 'epoch': 27.0}


 54%|█████▍    | 1300/2400 [22:49<28:23,  1.55s/it]

{'loss': 0.918, 'grad_norm': 82.55977630615234, 'learning_rate': 2.2916666666666667e-05, 'epoch': 27.08}


 55%|█████▍    | 1310/2400 [22:57<16:06,  1.13it/s]

{'loss': 0.9146, 'grad_norm': 42.490882873535156, 'learning_rate': 2.2708333333333334e-05, 'epoch': 27.29}


 55%|█████▌    | 1320/2400 [23:06<15:16,  1.18it/s]

{'loss': 0.8363, 'grad_norm': 29.16877555847168, 'learning_rate': 2.25e-05, 'epoch': 27.5}


 55%|█████▌    | 1330/2400 [23:15<15:09,  1.18it/s]

{'loss': 0.9498, 'grad_norm': 67.65799713134766, 'learning_rate': 2.229166666666667e-05, 'epoch': 27.71}


 56%|█████▌    | 1340/2400 [23:23<14:58,  1.18it/s]

{'loss': 0.829, 'grad_norm': 17.103740692138672, 'learning_rate': 2.2083333333333333e-05, 'epoch': 27.92}


 56%|█████▌    | 1344/2400 [23:26<13:22,  1.32it/s]
 56%|█████▌    | 1344/2400 [23:33<13:22,  1.32it/s]

{'eval_loss': 0.9556922316551208, 'eval_runtime': 6.7582, 'eval_samples_per_second': 17.756, 'eval_steps_per_second': 2.22, 'epoch': 28.0}


 56%|█████▋    | 1350/2400 [23:38<20:37,  1.18s/it]

{'loss': 0.8384, 'grad_norm': 13.839825630187988, 'learning_rate': 2.1875e-05, 'epoch': 28.12}


 57%|█████▋    | 1360/2400 [23:47<14:59,  1.16it/s]

{'loss': 0.8094, 'grad_norm': 18.793893814086914, 'learning_rate': 2.1666666666666667e-05, 'epoch': 28.33}


 57%|█████▋    | 1370/2400 [23:55<14:38,  1.17it/s]

{'loss': 0.777, 'grad_norm': 16.689109802246094, 'learning_rate': 2.1458333333333334e-05, 'epoch': 28.54}


 57%|█████▊    | 1380/2400 [24:04<14:11,  1.20it/s]

{'loss': 0.9658, 'grad_norm': 21.2336483001709, 'learning_rate': 2.125e-05, 'epoch': 28.75}


 58%|█████▊    | 1390/2400 [24:12<14:28,  1.16it/s]

{'loss': 0.8436, 'grad_norm': 31.336483001708984, 'learning_rate': 2.104166666666667e-05, 'epoch': 28.96}


 58%|█████▊    | 1392/2400 [24:14<12:40,  1.33it/s]
 58%|█████▊    | 1392/2400 [24:21<12:40,  1.33it/s]

{'eval_loss': 0.9791632890701294, 'eval_runtime': 6.9496, 'eval_samples_per_second': 17.267, 'eval_steps_per_second': 2.158, 'epoch': 29.0}


 58%|█████▊    | 1400/2400 [24:28<16:47,  1.01s/it]

{'loss': 0.8389, 'grad_norm': 26.212554931640625, 'learning_rate': 2.0833333333333336e-05, 'epoch': 29.17}


 59%|█████▉    | 1410/2400 [24:36<13:42,  1.20it/s]

{'loss': 0.8817, 'grad_norm': 42.836727142333984, 'learning_rate': 2.0625e-05, 'epoch': 29.38}


 59%|█████▉    | 1420/2400 [24:45<13:44,  1.19it/s]

{'loss': 0.6913, 'grad_norm': 13.32696533203125, 'learning_rate': 2.0416666666666667e-05, 'epoch': 29.58}


 60%|█████▉    | 1430/2400 [24:53<13:23,  1.21it/s]

{'loss': 0.8781, 'grad_norm': 18.57514762878418, 'learning_rate': 2.0208333333333334e-05, 'epoch': 29.79}


 60%|██████    | 1440/2400 [25:01<11:40,  1.37it/s]

{'loss': 0.7677, 'grad_norm': 16.977893829345703, 'learning_rate': 2e-05, 'epoch': 30.0}



 60%|██████    | 1440/2400 [25:08<11:40,  1.37it/s]

{'eval_loss': 0.9594339728355408, 'eval_runtime': 6.7195, 'eval_samples_per_second': 17.858, 'eval_steps_per_second': 2.232, 'epoch': 30.0}


 60%|██████    | 1450/2400 [25:16<14:23,  1.10it/s]

{'loss': 0.809, 'grad_norm': 25.696365356445312, 'learning_rate': 1.9791666666666665e-05, 'epoch': 30.21}


 61%|██████    | 1460/2400 [25:24<12:59,  1.21it/s]

{'loss': 0.7571, 'grad_norm': 27.026260375976562, 'learning_rate': 1.9583333333333333e-05, 'epoch': 30.42}


 61%|██████▏   | 1470/2400 [25:33<12:49,  1.21it/s]

{'loss': 0.8471, 'grad_norm': 20.388317108154297, 'learning_rate': 1.9375e-05, 'epoch': 30.62}


 62%|██████▏   | 1480/2400 [25:41<12:40,  1.21it/s]

{'loss': 0.7311, 'grad_norm': 37.202213287353516, 'learning_rate': 1.9166666666666667e-05, 'epoch': 30.83}


 62%|██████▏   | 1488/2400 [25:48<11:27,  1.33it/s]
 62%|██████▏   | 1488/2400 [25:55<11:27,  1.33it/s]

{'eval_loss': 0.9633182287216187, 'eval_runtime': 6.9035, 'eval_samples_per_second': 17.383, 'eval_steps_per_second': 2.173, 'epoch': 31.0}


 62%|██████▏   | 1490/2400 [25:57<35:01,  2.31s/it]

{'loss': 0.8987, 'grad_norm': 42.24226760864258, 'learning_rate': 1.8958333333333334e-05, 'epoch': 31.04}


 62%|██████▎   | 1500/2400 [26:05<13:22,  1.12it/s]

{'loss': 0.8766, 'grad_norm': 37.44671630859375, 'learning_rate': 1.8750000000000002e-05, 'epoch': 31.25}


 63%|██████▎   | 1510/2400 [26:13<12:31,  1.18it/s]

{'loss': 0.866, 'grad_norm': 19.245059967041016, 'learning_rate': 1.854166666666667e-05, 'epoch': 31.46}


 63%|██████▎   | 1520/2400 [26:22<12:18,  1.19it/s]

{'loss': 0.7736, 'grad_norm': 60.58026885986328, 'learning_rate': 1.8333333333333333e-05, 'epoch': 31.67}


 64%|██████▍   | 1530/2400 [26:30<12:09,  1.19it/s]

{'loss': 0.7776, 'grad_norm': 29.476032257080078, 'learning_rate': 1.8125e-05, 'epoch': 31.88}


 64%|██████▍   | 1536/2400 [26:35<10:38,  1.35it/s]
 64%|██████▍   | 1536/2400 [26:42<10:38,  1.35it/s]

{'eval_loss': 1.029414415359497, 'eval_runtime': 6.8889, 'eval_samples_per_second': 17.419, 'eval_steps_per_second': 2.177, 'epoch': 32.0}


 64%|██████▍   | 1540/2400 [26:46<22:20,  1.56s/it]

{'loss': 0.6894, 'grad_norm': 52.12467956542969, 'learning_rate': 1.7916666666666667e-05, 'epoch': 32.08}


 65%|██████▍   | 1550/2400 [26:54<11:57,  1.18it/s]

{'loss': 0.8394, 'grad_norm': 39.7009391784668, 'learning_rate': 1.7708333333333335e-05, 'epoch': 32.29}


 65%|██████▌   | 1560/2400 [27:02<11:35,  1.21it/s]

{'loss': 0.8695, 'grad_norm': 14.690096855163574, 'learning_rate': 1.75e-05, 'epoch': 32.5}


 65%|██████▌   | 1570/2400 [27:10<11:28,  1.21it/s]

{'loss': 0.7049, 'grad_norm': 33.20947265625, 'learning_rate': 1.7291666666666666e-05, 'epoch': 32.71}


 66%|██████▌   | 1580/2400 [27:19<11:38,  1.17it/s]

{'loss': 0.7298, 'grad_norm': 19.31843376159668, 'learning_rate': 1.7083333333333333e-05, 'epoch': 32.92}


 66%|██████▌   | 1584/2400 [27:22<10:20,  1.32it/s]
 66%|██████▌   | 1584/2400 [27:29<10:20,  1.32it/s]

{'eval_loss': 0.9605532288551331, 'eval_runtime': 6.7602, 'eval_samples_per_second': 17.751, 'eval_steps_per_second': 2.219, 'epoch': 33.0}


 66%|██████▋   | 1590/2400 [27:34<16:03,  1.19s/it]

{'loss': 0.8329, 'grad_norm': 18.186426162719727, 'learning_rate': 1.6875000000000004e-05, 'epoch': 33.12}


 67%|██████▋   | 1600/2400 [27:43<11:26,  1.17it/s]

{'loss': 0.7092, 'grad_norm': 21.90953826904297, 'learning_rate': 1.6666666666666667e-05, 'epoch': 33.33}


 67%|██████▋   | 1610/2400 [27:51<11:05,  1.19it/s]

{'loss': 0.6844, 'grad_norm': 21.282886505126953, 'learning_rate': 1.6458333333333335e-05, 'epoch': 33.54}


 68%|██████▊   | 1620/2400 [28:00<10:58,  1.19it/s]

{'loss': 0.7404, 'grad_norm': 20.623796463012695, 'learning_rate': 1.6250000000000002e-05, 'epoch': 33.75}


 68%|██████▊   | 1630/2400 [28:08<11:01,  1.16it/s]

{'loss': 0.7581, 'grad_norm': 19.080974578857422, 'learning_rate': 1.604166666666667e-05, 'epoch': 33.96}


 68%|██████▊   | 1632/2400 [28:10<09:55,  1.29it/s]
 68%|██████▊   | 1632/2400 [28:16<09:55,  1.29it/s]

{'eval_loss': 0.9660792946815491, 'eval_runtime': 6.8703, 'eval_samples_per_second': 17.467, 'eval_steps_per_second': 2.183, 'epoch': 34.0}


 68%|██████▊   | 1640/2400 [28:24<12:56,  1.02s/it]

{'loss': 0.7186, 'grad_norm': 20.915624618530273, 'learning_rate': 1.5833333333333333e-05, 'epoch': 34.17}


 69%|██████▉   | 1650/2400 [28:32<10:35,  1.18it/s]

{'loss': 0.7974, 'grad_norm': 42.35422897338867, 'learning_rate': 1.5625e-05, 'epoch': 34.38}


 69%|██████▉   | 1660/2400 [28:40<10:19,  1.19it/s]

{'loss': 0.6519, 'grad_norm': 16.155502319335938, 'learning_rate': 1.5416666666666668e-05, 'epoch': 34.58}


 70%|██████▉   | 1670/2400 [28:49<10:17,  1.18it/s]

{'loss': 0.6858, 'grad_norm': 43.425315856933594, 'learning_rate': 1.5208333333333333e-05, 'epoch': 34.79}


 70%|███████   | 1680/2400 [28:57<09:03,  1.33it/s]

{'loss': 0.7461, 'grad_norm': 19.506193161010742, 'learning_rate': 1.5e-05, 'epoch': 35.0}



 70%|███████   | 1680/2400 [29:04<09:03,  1.33it/s]

{'eval_loss': 0.9901058673858643, 'eval_runtime': 6.835, 'eval_samples_per_second': 17.557, 'eval_steps_per_second': 2.195, 'epoch': 35.0}


 70%|███████   | 1690/2400 [29:13<10:58,  1.08it/s]

{'loss': 0.7065, 'grad_norm': 23.462949752807617, 'learning_rate': 1.4791666666666668e-05, 'epoch': 35.21}


 71%|███████   | 1700/2400 [29:21<09:41,  1.20it/s]

{'loss': 0.7614, 'grad_norm': 41.90000534057617, 'learning_rate': 1.4583333333333335e-05, 'epoch': 35.42}


 71%|███████▏  | 1710/2400 [29:29<09:31,  1.21it/s]

{'loss': 0.636, 'grad_norm': 22.148080825805664, 'learning_rate': 1.4374999999999999e-05, 'epoch': 35.62}


 72%|███████▏  | 1720/2400 [29:37<09:25,  1.20it/s]

{'loss': 0.7491, 'grad_norm': 27.75782585144043, 'learning_rate': 1.4166666666666668e-05, 'epoch': 35.83}


 72%|███████▏  | 1728/2400 [29:44<08:14,  1.36it/s]
 72%|███████▏  | 1728/2400 [29:51<08:14,  1.36it/s]

{'eval_loss': 0.9812434911727905, 'eval_runtime': 6.7902, 'eval_samples_per_second': 17.673, 'eval_steps_per_second': 2.209, 'epoch': 36.0}


 72%|███████▏  | 1730/2400 [29:53<25:29,  2.28s/it]

{'loss': 0.6566, 'grad_norm': 32.20362854003906, 'learning_rate': 1.3958333333333335e-05, 'epoch': 36.04}


 72%|███████▎  | 1740/2400 [30:01<09:32,  1.15it/s]

{'loss': 0.7248, 'grad_norm': 31.772146224975586, 'learning_rate': 1.3750000000000002e-05, 'epoch': 36.25}


 73%|███████▎  | 1750/2400 [30:09<08:58,  1.21it/s]

{'loss': 0.7243, 'grad_norm': 41.474674224853516, 'learning_rate': 1.3541666666666666e-05, 'epoch': 36.46}


 73%|███████▎  | 1760/2400 [30:18<09:03,  1.18it/s]

{'loss': 0.6294, 'grad_norm': 26.41596221923828, 'learning_rate': 1.3333333333333333e-05, 'epoch': 36.67}


 74%|███████▍  | 1770/2400 [30:26<08:47,  1.19it/s]

{'loss': 0.6067, 'grad_norm': 41.15189743041992, 'learning_rate': 1.3125e-05, 'epoch': 36.88}


 74%|███████▍  | 1776/2400 [30:31<07:49,  1.33it/s]
 74%|███████▍  | 1776/2400 [30:38<07:49,  1.33it/s]

{'eval_loss': 1.0304828882217407, 'eval_runtime': 6.8535, 'eval_samples_per_second': 17.509, 'eval_steps_per_second': 2.189, 'epoch': 37.0}


 74%|███████▍  | 1780/2400 [30:41<16:21,  1.58s/it]

{'loss': 0.7493, 'grad_norm': 32.610652923583984, 'learning_rate': 1.2916666666666668e-05, 'epoch': 37.08}


 75%|███████▍  | 1790/2400 [30:50<08:41,  1.17it/s]

{'loss': 0.7356, 'grad_norm': 81.96285247802734, 'learning_rate': 1.2708333333333333e-05, 'epoch': 37.29}


 75%|███████▌  | 1800/2400 [30:58<08:26,  1.19it/s]

{'loss': 0.6946, 'grad_norm': 27.976701736450195, 'learning_rate': 1.25e-05, 'epoch': 37.5}


 75%|███████▌  | 1810/2400 [31:07<08:11,  1.20it/s]

{'loss': 0.7198, 'grad_norm': 23.274747848510742, 'learning_rate': 1.2291666666666666e-05, 'epoch': 37.71}


 76%|███████▌  | 1820/2400 [31:15<08:28,  1.14it/s]

{'loss': 0.6444, 'grad_norm': 15.679999351501465, 'learning_rate': 1.2083333333333333e-05, 'epoch': 37.92}


 76%|███████▌  | 1824/2400 [31:18<07:34,  1.27it/s]
 76%|███████▌  | 1824/2400 [31:25<07:34,  1.27it/s]

{'eval_loss': 0.9435226321220398, 'eval_runtime': 7.1242, 'eval_samples_per_second': 16.844, 'eval_steps_per_second': 2.105, 'epoch': 38.0}


 76%|███████▋  | 1830/2400 [31:31<11:35,  1.22s/it]

{'loss': 0.6486, 'grad_norm': 31.362686157226562, 'learning_rate': 1.1875e-05, 'epoch': 38.12}


 77%|███████▋  | 1840/2400 [31:39<07:56,  1.18it/s]

{'loss': 0.6362, 'grad_norm': 23.878469467163086, 'learning_rate': 1.1666666666666668e-05, 'epoch': 38.33}


 77%|███████▋  | 1850/2400 [31:48<07:42,  1.19it/s]

{'loss': 0.6907, 'grad_norm': 23.48320770263672, 'learning_rate': 1.1458333333333333e-05, 'epoch': 38.54}


 78%|███████▊  | 1860/2400 [31:56<07:32,  1.19it/s]

{'loss': 0.7235, 'grad_norm': 26.459884643554688, 'learning_rate': 1.125e-05, 'epoch': 38.75}


 78%|███████▊  | 1870/2400 [32:05<07:21,  1.20it/s]

{'loss': 0.7295, 'grad_norm': 21.668725967407227, 'learning_rate': 1.1041666666666666e-05, 'epoch': 38.96}


 78%|███████▊  | 1872/2400 [32:06<06:29,  1.35it/s]
 78%|███████▊  | 1872/2400 [32:13<06:29,  1.35it/s]

{'eval_loss': 1.0190924406051636, 'eval_runtime': 6.7863, 'eval_samples_per_second': 17.683, 'eval_steps_per_second': 2.21, 'epoch': 39.0}


 78%|███████▊  | 1880/2400 [32:20<08:54,  1.03s/it]

{'loss': 0.6056, 'grad_norm': 36.925960540771484, 'learning_rate': 1.0833333333333334e-05, 'epoch': 39.17}


 79%|███████▉  | 1890/2400 [32:28<07:12,  1.18it/s]

{'loss': 0.5656, 'grad_norm': 30.523723602294922, 'learning_rate': 1.0625e-05, 'epoch': 39.38}


 79%|███████▉  | 1900/2400 [32:37<06:55,  1.20it/s]

{'loss': 0.6727, 'grad_norm': 29.973426818847656, 'learning_rate': 1.0416666666666668e-05, 'epoch': 39.58}


 80%|███████▉  | 1910/2400 [32:45<06:47,  1.20it/s]

{'loss': 0.8058, 'grad_norm': 25.149967193603516, 'learning_rate': 1.0208333333333334e-05, 'epoch': 39.79}


 80%|████████  | 1920/2400 [32:53<05:51,  1.36it/s]

{'loss': 0.6679, 'grad_norm': 20.771991729736328, 'learning_rate': 1e-05, 'epoch': 40.0}



 80%|████████  | 1920/2400 [33:00<05:51,  1.36it/s]

{'eval_loss': 1.006837010383606, 'eval_runtime': 6.8161, 'eval_samples_per_second': 17.605, 'eval_steps_per_second': 2.201, 'epoch': 40.0}


 80%|████████  | 1930/2400 [33:08<07:09,  1.09it/s]

{'loss': 0.7351, 'grad_norm': 28.069414138793945, 'learning_rate': 9.791666666666666e-06, 'epoch': 40.21}


 81%|████████  | 1940/2400 [33:17<06:25,  1.19it/s]

{'loss': 0.5943, 'grad_norm': 41.77630615234375, 'learning_rate': 9.583333333333334e-06, 'epoch': 40.42}


 81%|████████▏ | 1950/2400 [33:25<06:22,  1.18it/s]

{'loss': 0.7161, 'grad_norm': 15.106553077697754, 'learning_rate': 9.375000000000001e-06, 'epoch': 40.62}


 82%|████████▏ | 1960/2400 [33:34<06:08,  1.19it/s]

{'loss': 0.6585, 'grad_norm': 16.974267959594727, 'learning_rate': 9.166666666666666e-06, 'epoch': 40.83}


 82%|████████▏ | 1968/2400 [33:40<05:25,  1.33it/s]
 82%|████████▏ | 1968/2400 [33:47<05:25,  1.33it/s]

{'eval_loss': 0.9817866086959839, 'eval_runtime': 7.049, 'eval_samples_per_second': 17.024, 'eval_steps_per_second': 2.128, 'epoch': 41.0}


 82%|████████▏ | 1970/2400 [33:49<16:47,  2.34s/it]

{'loss': 0.5285, 'grad_norm': 116.59762573242188, 'learning_rate': 8.958333333333334e-06, 'epoch': 41.04}


 82%|████████▎ | 1980/2400 [33:58<06:10,  1.13it/s]

{'loss': 0.5371, 'grad_norm': 22.04189682006836, 'learning_rate': 8.75e-06, 'epoch': 41.25}


 83%|████████▎ | 1990/2400 [34:06<05:49,  1.17it/s]

{'loss': 0.7398, 'grad_norm': 27.395954132080078, 'learning_rate': 8.541666666666666e-06, 'epoch': 41.46}


 83%|████████▎ | 2000/2400 [34:15<05:41,  1.17it/s]

{'loss': 0.6856, 'grad_norm': 27.07850456237793, 'learning_rate': 8.333333333333334e-06, 'epoch': 41.67}


 84%|████████▍ | 2010/2400 [34:23<05:45,  1.13it/s]

{'loss': 0.6197, 'grad_norm': 17.89019203186035, 'learning_rate': 8.125000000000001e-06, 'epoch': 41.88}


 84%|████████▍ | 2016/2400 [34:28<04:53,  1.31it/s]
 84%|████████▍ | 2016/2400 [34:35<04:53,  1.31it/s]

{'eval_loss': 0.9380567669868469, 'eval_runtime': 7.0056, 'eval_samples_per_second': 17.129, 'eval_steps_per_second': 2.141, 'epoch': 42.0}


 84%|████████▍ | 2020/2400 [34:39<09:59,  1.58s/it]

{'loss': 0.7079, 'grad_norm': 19.346328735351562, 'learning_rate': 7.916666666666667e-06, 'epoch': 42.08}


 85%|████████▍ | 2030/2400 [34:47<05:21,  1.15it/s]

{'loss': 0.6765, 'grad_norm': 25.687288284301758, 'learning_rate': 7.708333333333334e-06, 'epoch': 42.29}


 85%|████████▌ | 2040/2400 [34:56<05:11,  1.16it/s]

{'loss': 0.6361, 'grad_norm': 15.0850248336792, 'learning_rate': 7.5e-06, 'epoch': 42.5}


 85%|████████▌ | 2050/2400 [35:04<04:56,  1.18it/s]

{'loss': 0.5772, 'grad_norm': 19.836816787719727, 'learning_rate': 7.2916666666666674e-06, 'epoch': 42.71}


 86%|████████▌ | 2060/2400 [35:13<04:57,  1.14it/s]

{'loss': 0.7288, 'grad_norm': 19.648847579956055, 'learning_rate': 7.083333333333334e-06, 'epoch': 42.92}


 86%|████████▌ | 2064/2400 [35:16<04:32,  1.24it/s]
 86%|████████▌ | 2064/2400 [35:24<04:32,  1.24it/s]

{'eval_loss': 0.9435597658157349, 'eval_runtime': 7.1857, 'eval_samples_per_second': 16.7, 'eval_steps_per_second': 2.087, 'epoch': 43.0}


 86%|████████▋ | 2070/2400 [35:29<06:42,  1.22s/it]

{'loss': 0.6255, 'grad_norm': 33.9465446472168, 'learning_rate': 6.875000000000001e-06, 'epoch': 43.12}


 87%|████████▋ | 2080/2400 [35:38<04:38,  1.15it/s]

{'loss': 0.5969, 'grad_norm': 31.511493682861328, 'learning_rate': 6.666666666666667e-06, 'epoch': 43.33}


 87%|████████▋ | 2090/2400 [35:46<04:19,  1.19it/s]

{'loss': 0.5641, 'grad_norm': 32.30062484741211, 'learning_rate': 6.458333333333334e-06, 'epoch': 43.54}


 88%|████████▊ | 2100/2400 [35:55<04:10,  1.20it/s]

{'loss': 0.637, 'grad_norm': 45.120479583740234, 'learning_rate': 6.25e-06, 'epoch': 43.75}


 88%|████████▊ | 2110/2400 [36:03<04:02,  1.20it/s]

{'loss': 0.6106, 'grad_norm': 36.37472915649414, 'learning_rate': 6.041666666666667e-06, 'epoch': 43.96}


 88%|████████▊ | 2112/2400 [36:04<03:32,  1.35it/s]
 88%|████████▊ | 2112/2400 [36:11<03:32,  1.35it/s]

{'eval_loss': 0.9426575303077698, 'eval_runtime': 6.9819, 'eval_samples_per_second': 17.187, 'eval_steps_per_second': 2.148, 'epoch': 44.0}


 88%|████████▊ | 2120/2400 [36:18<04:48,  1.03s/it]

{'loss': 0.6309, 'grad_norm': 94.66561126708984, 'learning_rate': 5.833333333333334e-06, 'epoch': 44.17}


 89%|████████▉ | 2130/2400 [36:27<03:55,  1.15it/s]

{'loss': 0.6112, 'grad_norm': 48.15950012207031, 'learning_rate': 5.625e-06, 'epoch': 44.38}


 89%|████████▉ | 2140/2400 [36:36<03:41,  1.17it/s]

{'loss': 0.6721, 'grad_norm': 29.6013126373291, 'learning_rate': 5.416666666666667e-06, 'epoch': 44.58}


 90%|████████▉ | 2150/2400 [36:44<03:28,  1.20it/s]

{'loss': 0.6197, 'grad_norm': 23.605493545532227, 'learning_rate': 5.208333333333334e-06, 'epoch': 44.79}


 90%|█████████ | 2160/2400 [36:52<03:04,  1.30it/s]

{'loss': 0.598, 'grad_norm': 19.95598030090332, 'learning_rate': 5e-06, 'epoch': 45.0}



 90%|█████████ | 2160/2400 [36:59<03:04,  1.30it/s]

{'eval_loss': 0.9855318665504456, 'eval_runtime': 7.099, 'eval_samples_per_second': 16.904, 'eval_steps_per_second': 2.113, 'epoch': 45.0}


 90%|█████████ | 2170/2400 [37:08<03:32,  1.08it/s]

{'loss': 0.549, 'grad_norm': 20.985706329345703, 'learning_rate': 4.791666666666667e-06, 'epoch': 45.21}


 91%|█████████ | 2180/2400 [37:16<03:05,  1.18it/s]

{'loss': 0.6828, 'grad_norm': 27.816730499267578, 'learning_rate': 4.583333333333333e-06, 'epoch': 45.42}


 91%|█████████▏| 2190/2400 [37:25<03:00,  1.17it/s]

{'loss': 0.5538, 'grad_norm': 22.692617416381836, 'learning_rate': 4.375e-06, 'epoch': 45.62}


 92%|█████████▏| 2200/2400 [37:33<02:47,  1.20it/s]

{'loss': 0.5893, 'grad_norm': 50.74863052368164, 'learning_rate': 4.166666666666667e-06, 'epoch': 45.83}


 92%|█████████▏| 2208/2400 [37:40<02:22,  1.35it/s]
 92%|█████████▏| 2208/2400 [37:47<02:22,  1.35it/s]

{'eval_loss': 0.9923914670944214, 'eval_runtime': 7.0779, 'eval_samples_per_second': 16.954, 'eval_steps_per_second': 2.119, 'epoch': 46.0}


 92%|█████████▏| 2210/2400 [37:49<07:27,  2.35s/it]

{'loss': 0.5276, 'grad_norm': 28.181968688964844, 'learning_rate': 3.958333333333333e-06, 'epoch': 46.04}


 92%|█████████▎| 2220/2400 [37:57<02:38,  1.13it/s]

{'loss': 0.6307, 'grad_norm': 30.90213394165039, 'learning_rate': 3.75e-06, 'epoch': 46.25}


 93%|█████████▎| 2230/2400 [38:06<02:22,  1.19it/s]

{'loss': 0.6057, 'grad_norm': 36.42060470581055, 'learning_rate': 3.541666666666667e-06, 'epoch': 46.46}


 93%|█████████▎| 2240/2400 [38:14<02:14,  1.19it/s]

{'loss': 0.5748, 'grad_norm': 19.757253646850586, 'learning_rate': 3.3333333333333333e-06, 'epoch': 46.67}


 94%|█████████▍| 2250/2400 [38:23<02:07,  1.18it/s]

{'loss': 0.5452, 'grad_norm': 32.732391357421875, 'learning_rate': 3.125e-06, 'epoch': 46.88}


 94%|█████████▍| 2256/2400 [38:27<01:46,  1.35it/s]
 94%|█████████▍| 2256/2400 [38:34<01:46,  1.35it/s]

{'eval_loss': 0.988380491733551, 'eval_runtime': 6.9609, 'eval_samples_per_second': 17.239, 'eval_steps_per_second': 2.155, 'epoch': 47.0}


 94%|█████████▍| 2260/2400 [38:38<03:38,  1.56s/it]

{'loss': 0.5613, 'grad_norm': 20.3570499420166, 'learning_rate': 2.916666666666667e-06, 'epoch': 47.08}


 95%|█████████▍| 2270/2400 [38:46<01:52,  1.16it/s]

{'loss': 0.4838, 'grad_norm': 35.78824996948242, 'learning_rate': 2.7083333333333334e-06, 'epoch': 47.29}


 95%|█████████▌| 2280/2400 [38:55<01:40,  1.19it/s]

{'loss': 0.6538, 'grad_norm': 16.033466339111328, 'learning_rate': 2.5e-06, 'epoch': 47.5}


 95%|█████████▌| 2290/2400 [39:03<01:31,  1.20it/s]

{'loss': 0.575, 'grad_norm': 32.48210525512695, 'learning_rate': 2.2916666666666666e-06, 'epoch': 47.71}


 96%|█████████▌| 2300/2400 [39:12<01:26,  1.16it/s]

{'loss': 0.5914, 'grad_norm': 17.866474151611328, 'learning_rate': 2.0833333333333334e-06, 'epoch': 47.92}


 96%|█████████▌| 2304/2400 [39:15<01:17,  1.23it/s]
 96%|█████████▌| 2304/2400 [39:22<01:17,  1.23it/s]

{'eval_loss': 0.9858497977256775, 'eval_runtime': 7.3338, 'eval_samples_per_second': 16.363, 'eval_steps_per_second': 2.045, 'epoch': 48.0}


 96%|█████████▋| 2310/2400 [39:28<01:51,  1.24s/it]

{'loss': 0.5858, 'grad_norm': 32.34412384033203, 'learning_rate': 1.875e-06, 'epoch': 48.12}


 97%|█████████▋| 2320/2400 [39:36<01:08,  1.17it/s]

{'loss': 0.605, 'grad_norm': 20.327680587768555, 'learning_rate': 1.6666666666666667e-06, 'epoch': 48.33}


 97%|█████████▋| 2330/2400 [39:45<00:59,  1.18it/s]

{'loss': 0.6403, 'grad_norm': 28.372346878051758, 'learning_rate': 1.4583333333333335e-06, 'epoch': 48.54}


 98%|█████████▊| 2340/2400 [39:53<00:50,  1.19it/s]

{'loss': 0.5407, 'grad_norm': 19.778850555419922, 'learning_rate': 1.25e-06, 'epoch': 48.75}


 98%|█████████▊| 2350/2400 [40:02<00:45,  1.11it/s]

{'loss': 0.5399, 'grad_norm': 22.098169326782227, 'learning_rate': 1.0416666666666667e-06, 'epoch': 48.96}


 98%|█████████▊| 2352/2400 [40:04<00:39,  1.23it/s]
 98%|█████████▊| 2352/2400 [40:11<00:39,  1.23it/s]

{'eval_loss': 0.9816278219223022, 'eval_runtime': 7.3217, 'eval_samples_per_second': 16.39, 'eval_steps_per_second': 2.049, 'epoch': 49.0}


 98%|█████████▊| 2360/2400 [40:18<00:41,  1.03s/it]

{'loss': 0.5909, 'grad_norm': 39.49052810668945, 'learning_rate': 8.333333333333333e-07, 'epoch': 49.17}


 99%|█████████▉| 2370/2400 [40:27<00:26,  1.12it/s]

{'loss': 0.6025, 'grad_norm': 18.97241973876953, 'learning_rate': 6.25e-07, 'epoch': 49.38}


 99%|█████████▉| 2380/2400 [40:36<00:17,  1.17it/s]

{'loss': 0.5581, 'grad_norm': 19.7707576751709, 'learning_rate': 4.1666666666666667e-07, 'epoch': 49.58}


100%|█████████▉| 2390/2400 [40:44<00:08,  1.19it/s]

{'loss': 0.637, 'grad_norm': 55.67551803588867, 'learning_rate': 2.0833333333333333e-07, 'epoch': 49.79}


100%|██████████| 2400/2400 [40:52<00:00,  1.31it/s]

{'loss': 0.6196, 'grad_norm': 14.181977272033691, 'learning_rate': 0.0, 'epoch': 50.0}



100%|██████████| 2400/2400 [40:59<00:00,  1.02s/it]


{'eval_loss': 0.9769041538238525, 'eval_runtime': 7.1851, 'eval_samples_per_second': 16.701, 'eval_steps_per_second': 2.088, 'epoch': 50.0}
{'train_runtime': 2459.9301, 'train_samples_per_second': 9.756, 'train_steps_per_second': 0.976, 'train_loss': 0.9346286817391714, 'epoch': 50.0}


100%|██████████| 15/15 [00:06<00:00,  2.17it/s]
Some weights of YolosForObjectDetection were not initialized from the model checkpoint at hustvl/yolos-tiny and are newly initialized because the shapes did not match:
- class_labels_classifier.layers.2.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([7]) in the model instantiated
- class_labels_classifier.layers.2.weight: found shape torch.Size([92, 192]) in the checkpoint and torch.Size([7, 192]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda:0


  3%|▎         | 10/300 [00:11<05:12,  1.08s/it]

{'loss': 3.1714, 'grad_norm': 70.78014373779297, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.33}


  7%|▋         | 20/300 [00:22<04:59,  1.07s/it]

{'loss': 2.7615, 'grad_norm': 36.312564849853516, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.67}


 10%|█         | 30/300 [00:32<04:31,  1.01s/it]

{'loss': 2.3576, 'grad_norm': 33.613468170166016, 'learning_rate': 4.5e-05, 'epoch': 1.0}



 10%|█         | 30/300 [00:39<04:31,  1.01s/it]

{'eval_loss': 2.9309518337249756, 'eval_runtime': 6.5161, 'eval_samples_per_second': 18.416, 'eval_steps_per_second': 2.302, 'epoch': 1.0}


 13%|█▎        | 40/300 [00:50<05:09,  1.19s/it]

{'loss': 2.2187, 'grad_norm': 44.86357116699219, 'learning_rate': 4.3333333333333334e-05, 'epoch': 1.33}


 17%|█▋        | 50/300 [01:01<04:41,  1.13s/it]

{'loss': 2.152, 'grad_norm': 33.903648376464844, 'learning_rate': 4.166666666666667e-05, 'epoch': 1.67}


 20%|██        | 60/300 [01:12<03:53,  1.03it/s]

{'loss': 2.0848, 'grad_norm': 30.29164695739746, 'learning_rate': 4e-05, 'epoch': 2.0}



 20%|██        | 60/300 [01:19<03:53,  1.03it/s]

{'eval_loss': 2.8232834339141846, 'eval_runtime': 6.8076, 'eval_samples_per_second': 17.627, 'eval_steps_per_second': 2.203, 'epoch': 2.0}


 23%|██▎       | 70/300 [01:30<04:23,  1.15s/it]

{'loss': 1.9213, 'grad_norm': 33.67599868774414, 'learning_rate': 3.8333333333333334e-05, 'epoch': 2.33}


 27%|██▋       | 80/300 [01:40<03:53,  1.06s/it]

{'loss': 2.061, 'grad_norm': 25.799541473388672, 'learning_rate': 3.6666666666666666e-05, 'epoch': 2.67}


 30%|███       | 90/300 [01:51<03:23,  1.03it/s]

{'loss': 1.9892, 'grad_norm': 64.37162017822266, 'learning_rate': 3.5e-05, 'epoch': 3.0}



 30%|███       | 90/300 [01:57<03:23,  1.03it/s]

{'eval_loss': 2.7535080909729004, 'eval_runtime': 6.43, 'eval_samples_per_second': 18.662, 'eval_steps_per_second': 2.333, 'epoch': 3.0}


 33%|███▎      | 100/300 [02:08<03:49,  1.15s/it]

{'loss': 1.8322, 'grad_norm': 39.305397033691406, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.33}


 37%|███▋      | 110/300 [02:19<03:22,  1.06s/it]

{'loss': 1.6622, 'grad_norm': 22.928571701049805, 'learning_rate': 3.1666666666666666e-05, 'epoch': 3.67}


 40%|████      | 120/300 [02:30<03:04,  1.03s/it]

{'loss': 1.8258, 'grad_norm': 29.03209114074707, 'learning_rate': 3e-05, 'epoch': 4.0}



 40%|████      | 120/300 [02:36<03:04,  1.03s/it]

{'eval_loss': 2.549314022064209, 'eval_runtime': 6.488, 'eval_samples_per_second': 18.496, 'eval_steps_per_second': 2.312, 'epoch': 4.0}


 43%|████▎     | 130/300 [02:47<03:19,  1.17s/it]

{'loss': 1.6539, 'grad_norm': 131.1112060546875, 'learning_rate': 2.8333333333333335e-05, 'epoch': 4.33}


 47%|████▋     | 140/300 [02:58<02:52,  1.08s/it]

{'loss': 1.6202, 'grad_norm': 31.073400497436523, 'learning_rate': 2.6666666666666667e-05, 'epoch': 4.67}


 50%|█████     | 150/300 [03:09<02:25,  1.03it/s]

{'loss': 1.6948, 'grad_norm': 25.3666934967041, 'learning_rate': 2.5e-05, 'epoch': 5.0}



 50%|█████     | 150/300 [03:15<02:25,  1.03it/s]

{'eval_loss': 2.720458984375, 'eval_runtime': 6.8781, 'eval_samples_per_second': 17.447, 'eval_steps_per_second': 2.181, 'epoch': 5.0}


 53%|█████▎    | 160/300 [03:26<02:40,  1.14s/it]

{'loss': 1.616, 'grad_norm': 28.47684097290039, 'learning_rate': 2.3333333333333336e-05, 'epoch': 5.33}


 57%|█████▋    | 170/300 [03:37<02:18,  1.06s/it]

{'loss': 1.6944, 'grad_norm': 28.174272537231445, 'learning_rate': 2.1666666666666667e-05, 'epoch': 5.67}


 60%|██████    | 180/300 [03:47<01:56,  1.03it/s]

{'loss': 1.5948, 'grad_norm': 22.262239456176758, 'learning_rate': 2e-05, 'epoch': 6.0}



 60%|██████    | 180/300 [03:54<01:56,  1.03it/s]

{'eval_loss': 2.573776960372925, 'eval_runtime': 6.6992, 'eval_samples_per_second': 17.913, 'eval_steps_per_second': 2.239, 'epoch': 6.0}


 63%|██████▎   | 190/300 [04:05<02:05,  1.14s/it]

{'loss': 1.4757, 'grad_norm': 20.361379623413086, 'learning_rate': 1.8333333333333333e-05, 'epoch': 6.33}


 67%|██████▋   | 200/300 [04:16<01:47,  1.07s/it]

{'loss': 1.5273, 'grad_norm': 31.9565486907959, 'learning_rate': 1.6666666666666667e-05, 'epoch': 6.67}


 70%|███████   | 210/300 [04:26<01:26,  1.04it/s]

{'loss': 1.5374, 'grad_norm': 26.899555206298828, 'learning_rate': 1.5e-05, 'epoch': 7.0}



 70%|███████   | 210/300 [04:33<01:26,  1.04it/s]

{'eval_loss': 2.5422022342681885, 'eval_runtime': 6.6485, 'eval_samples_per_second': 18.049, 'eval_steps_per_second': 2.256, 'epoch': 7.0}


 73%|███████▎  | 220/300 [04:44<01:32,  1.15s/it]

{'loss': 1.4767, 'grad_norm': 22.525907516479492, 'learning_rate': 1.3333333333333333e-05, 'epoch': 7.33}


 77%|███████▋  | 230/300 [04:54<01:15,  1.07s/it]

{'loss': 1.4233, 'grad_norm': 23.474853515625, 'learning_rate': 1.1666666666666668e-05, 'epoch': 7.67}


 80%|████████  | 240/300 [05:05<00:57,  1.04it/s]

{'loss': 1.3876, 'grad_norm': 41.45762252807617, 'learning_rate': 1e-05, 'epoch': 8.0}



 80%|████████  | 240/300 [05:12<00:57,  1.04it/s]

{'eval_loss': 2.4967851638793945, 'eval_runtime': 6.8612, 'eval_samples_per_second': 17.49, 'eval_steps_per_second': 2.186, 'epoch': 8.0}


 83%|████████▎ | 250/300 [05:23<01:01,  1.24s/it]

{'loss': 1.3406, 'grad_norm': 27.235563278198242, 'learning_rate': 8.333333333333334e-06, 'epoch': 8.33}


 87%|████████▋ | 260/300 [05:35<00:48,  1.21s/it]

{'loss': 1.4289, 'grad_norm': 19.98151397705078, 'learning_rate': 6.666666666666667e-06, 'epoch': 8.67}


 90%|█████████ | 270/300 [05:47<00:33,  1.11s/it]

{'loss': 1.4095, 'grad_norm': 20.374225616455078, 'learning_rate': 5e-06, 'epoch': 9.0}



 90%|█████████ | 270/300 [05:54<00:33,  1.11s/it]

{'eval_loss': 2.432713508605957, 'eval_runtime': 6.881, 'eval_samples_per_second': 17.439, 'eval_steps_per_second': 2.18, 'epoch': 9.0}


 93%|█████████▎| 280/300 [06:06<00:24,  1.25s/it]

{'loss': 1.334, 'grad_norm': 16.399263381958008, 'learning_rate': 3.3333333333333333e-06, 'epoch': 9.33}


 97%|█████████▋| 290/300 [06:18<00:11,  1.14s/it]

{'loss': 1.2613, 'grad_norm': 26.53651237487793, 'learning_rate': 1.6666666666666667e-06, 'epoch': 9.67}


100%|██████████| 300/300 [06:29<00:00,  1.04s/it]

{'loss': 1.367, 'grad_norm': 18.546369552612305, 'learning_rate': 0.0, 'epoch': 10.0}



100%|██████████| 300/300 [06:36<00:00,  1.32s/it]


{'eval_loss': 2.46610689163208, 'eval_runtime': 7.2456, 'eval_samples_per_second': 16.562, 'eval_steps_per_second': 2.07, 'epoch': 10.0}
{'train_runtime': 396.413, 'train_samples_per_second': 7.568, 'train_steps_per_second': 0.757, 'train_loss': 1.762710173924764, 'epoch': 10.0}


100%|██████████| 15/15 [00:06<00:00,  2.42it/s]
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetecti

Using device: cuda:0


  3%|▎         | 10/300 [00:09<04:26,  1.09it/s]

{'loss': 5.1472, 'grad_norm': 256.7642517089844, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.33}


  7%|▋         | 20/300 [00:18<04:15,  1.10it/s]

{'loss': 4.3557, 'grad_norm': 56.46696090698242, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.67}


 10%|█         | 30/300 [00:27<03:41,  1.22it/s]

{'loss': 3.9679, 'grad_norm': 218.04197692871094, 'learning_rate': 4.5e-05, 'epoch': 1.0}



 10%|█         | 30/300 [00:35<03:41,  1.22it/s]

{'eval_loss': 4.599515438079834, 'eval_runtime': 7.5971, 'eval_samples_per_second': 15.795, 'eval_steps_per_second': 1.974, 'epoch': 1.0}


 13%|█▎        | 40/300 [00:44<04:16,  1.01it/s]

{'loss': 3.3562, 'grad_norm': 43.49403762817383, 'learning_rate': 4.3333333333333334e-05, 'epoch': 1.33}


 17%|█▋        | 50/300 [00:53<03:43,  1.12it/s]

{'loss': 3.1018, 'grad_norm': 239.92384338378906, 'learning_rate': 4.166666666666667e-05, 'epoch': 1.67}


 20%|██        | 60/300 [01:02<03:10,  1.26it/s]

{'loss': 3.1036, 'grad_norm': 125.65767669677734, 'learning_rate': 4e-05, 'epoch': 2.0}



 20%|██        | 60/300 [01:09<03:10,  1.26it/s]

{'eval_loss': 4.062942981719971, 'eval_runtime': 7.5098, 'eval_samples_per_second': 15.979, 'eval_steps_per_second': 1.997, 'epoch': 2.0}


 23%|██▎       | 70/300 [01:19<03:48,  1.00it/s]

{'loss': 2.9073, 'grad_norm': 48.91495895385742, 'learning_rate': 3.8333333333333334e-05, 'epoch': 2.33}


 27%|██▋       | 80/300 [01:28<03:18,  1.11it/s]

{'loss': 2.8937, 'grad_norm': 47.364463806152344, 'learning_rate': 3.6666666666666666e-05, 'epoch': 2.67}


 30%|███       | 90/300 [01:36<02:47,  1.26it/s]

{'loss': 2.7114, 'grad_norm': 40.60143280029297, 'learning_rate': 3.5e-05, 'epoch': 3.0}



 30%|███       | 90/300 [01:44<02:47,  1.26it/s]

{'eval_loss': 3.707181215286255, 'eval_runtime': 7.9032, 'eval_samples_per_second': 15.184, 'eval_steps_per_second': 1.898, 'epoch': 3.0}


 33%|███▎      | 100/300 [01:54<03:28,  1.04s/it]

{'loss': 2.4727, 'grad_norm': 62.742156982421875, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.33}


 37%|███▋      | 110/300 [02:04<02:57,  1.07it/s]

{'loss': 2.463, 'grad_norm': 38.32307815551758, 'learning_rate': 3.1666666666666666e-05, 'epoch': 3.67}


 40%|████      | 120/300 [02:12<02:24,  1.25it/s]

{'loss': 2.504, 'grad_norm': 82.25381469726562, 'learning_rate': 3e-05, 'epoch': 4.0}



 40%|████      | 120/300 [02:20<02:24,  1.25it/s]

{'eval_loss': 3.2190816402435303, 'eval_runtime': 7.6259, 'eval_samples_per_second': 15.736, 'eval_steps_per_second': 1.967, 'epoch': 4.0}


 43%|████▎     | 130/300 [02:29<02:45,  1.02it/s]

{'loss': 2.6147, 'grad_norm': 47.52312469482422, 'learning_rate': 2.8333333333333335e-05, 'epoch': 4.33}


 47%|████▋     | 140/300 [02:38<02:22,  1.12it/s]

{'loss': 2.3048, 'grad_norm': 66.79940795898438, 'learning_rate': 2.6666666666666667e-05, 'epoch': 4.67}


 50%|█████     | 150/300 [02:47<01:57,  1.27it/s]

{'loss': 2.5865, 'grad_norm': 614.2724609375, 'learning_rate': 2.5e-05, 'epoch': 5.0}



 50%|█████     | 150/300 [02:54<01:57,  1.27it/s]

{'eval_loss': 3.1404430866241455, 'eval_runtime': 7.3055, 'eval_samples_per_second': 16.426, 'eval_steps_per_second': 2.053, 'epoch': 5.0}


 53%|█████▎    | 160/300 [03:03<02:19,  1.01it/s]

{'loss': 2.1794, 'grad_norm': 53.9801139831543, 'learning_rate': 2.3333333333333336e-05, 'epoch': 5.33}


 57%|█████▋    | 170/300 [03:12<01:57,  1.11it/s]

{'loss': 2.2497, 'grad_norm': 102.662353515625, 'learning_rate': 2.1666666666666667e-05, 'epoch': 5.67}


 60%|██████    | 180/300 [03:21<01:36,  1.25it/s]

{'loss': 2.3301, 'grad_norm': 35.23703384399414, 'learning_rate': 2e-05, 'epoch': 6.0}



 60%|██████    | 180/300 [03:29<01:36,  1.25it/s]

{'eval_loss': 2.865917444229126, 'eval_runtime': 7.5011, 'eval_samples_per_second': 15.998, 'eval_steps_per_second': 2.0, 'epoch': 6.0}


 63%|██████▎   | 190/300 [03:38<01:50,  1.00s/it]

{'loss': 2.1704, 'grad_norm': 61.36478042602539, 'learning_rate': 1.8333333333333333e-05, 'epoch': 6.33}


 67%|██████▋   | 200/300 [03:47<01:30,  1.11it/s]

{'loss': 2.179, 'grad_norm': 45.7247314453125, 'learning_rate': 1.6666666666666667e-05, 'epoch': 6.67}


 70%|███████   | 210/300 [03:56<01:10,  1.27it/s]

{'loss': 2.5547, 'grad_norm': 91.46826171875, 'learning_rate': 1.5e-05, 'epoch': 7.0}



 70%|███████   | 210/300 [04:03<01:10,  1.27it/s]

{'eval_loss': 3.3879573345184326, 'eval_runtime': 7.3555, 'eval_samples_per_second': 16.314, 'eval_steps_per_second': 2.039, 'epoch': 7.0}


 73%|███████▎  | 220/300 [04:12<01:17,  1.03it/s]

{'loss': 2.3535, 'grad_norm': 1590.615478515625, 'learning_rate': 1.3333333333333333e-05, 'epoch': 7.33}


 77%|███████▋  | 230/300 [04:21<01:01,  1.13it/s]

{'loss': 2.0791, 'grad_norm': 60.17585372924805, 'learning_rate': 1.1666666666666668e-05, 'epoch': 7.67}


 80%|████████  | 240/300 [04:30<00:49,  1.22it/s]

{'loss': 2.0399, 'grad_norm': 109.01530456542969, 'learning_rate': 1e-05, 'epoch': 8.0}



 80%|████████  | 240/300 [04:37<00:49,  1.22it/s]

{'eval_loss': 2.73077130317688, 'eval_runtime': 7.2068, 'eval_samples_per_second': 16.651, 'eval_steps_per_second': 2.081, 'epoch': 8.0}


 83%|████████▎ | 250/300 [04:47<00:49,  1.02it/s]

{'loss': 1.9557, 'grad_norm': 40.40715026855469, 'learning_rate': 8.333333333333334e-06, 'epoch': 8.33}


 87%|████████▋ | 260/300 [04:56<00:37,  1.06it/s]

{'loss': 2.1451, 'grad_norm': 36.5137939453125, 'learning_rate': 6.666666666666667e-06, 'epoch': 8.67}


 90%|█████████ | 270/300 [05:04<00:23,  1.26it/s]

{'loss': 2.0162, 'grad_norm': 83.4551010131836, 'learning_rate': 5e-06, 'epoch': 9.0}



 90%|█████████ | 270/300 [05:12<00:23,  1.26it/s]

{'eval_loss': 2.7870638370513916, 'eval_runtime': 7.315, 'eval_samples_per_second': 16.405, 'eval_steps_per_second': 2.051, 'epoch': 9.0}


 93%|█████████▎| 280/300 [05:21<00:19,  1.01it/s]

{'loss': 1.9604, 'grad_norm': 91.37383270263672, 'learning_rate': 3.3333333333333333e-06, 'epoch': 9.33}


 97%|█████████▋| 290/300 [05:30<00:08,  1.11it/s]

{'loss': 1.9493, 'grad_norm': 37.27324676513672, 'learning_rate': 1.6666666666666667e-06, 'epoch': 9.67}


100%|██████████| 300/300 [05:39<00:00,  1.25it/s]

{'loss': 1.9988, 'grad_norm': 97.90109252929688, 'learning_rate': 0.0, 'epoch': 10.0}



100%|██████████| 300/300 [05:46<00:00,  1.15s/it]


{'eval_loss': 2.8095669746398926, 'eval_runtime': 7.1177, 'eval_samples_per_second': 16.859, 'eval_steps_per_second': 2.107, 'epoch': 10.0}
{'train_runtime': 346.3445, 'train_samples_per_second': 8.662, 'train_steps_per_second': 0.866, 'train_loss': 2.621730702718099, 'epoch': 10.0}


100%|██████████| 15/15 [00:06<00:00,  2.32it/s]
Some weights of YolosForObjectDetection were not initialized from the model checkpoint at hustvl/yolos-tiny and are newly initialized because the shapes did not match:
- class_labels_classifier.layers.2.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([2]) in the model instantiated
- class_labels_classifier.layers.2.weight: found shape torch.Size([92, 192]) in the checkpoint and torch.Size([2, 192]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda:0


  1%|          | 10/1200 [00:12<23:29,  1.18s/it]

{'loss': 1.782, 'grad_norm': 44.13979721069336, 'learning_rate': 4.958333333333334e-05, 'epoch': 0.42}


  2%|▏         | 20/1200 [00:23<23:41,  1.20s/it]

{'loss': 1.3475, 'grad_norm': 38.25126647949219, 'learning_rate': 4.9166666666666665e-05, 'epoch': 0.83}


  2%|▏         | 24/1200 [00:28<20:48,  1.06s/it]
  2%|▏         | 24/1200 [00:31<20:48,  1.06s/it]

{'eval_loss': 1.271316409111023, 'eval_runtime': 3.5937, 'eval_samples_per_second': 16.696, 'eval_steps_per_second': 2.226, 'epoch': 1.0}


  2%|▎         | 30/1200 [00:39<26:47,  1.37s/it]

{'loss': 1.2846, 'grad_norm': 34.4951057434082, 'learning_rate': 4.875e-05, 'epoch': 1.25}


  3%|▎         | 40/1200 [00:50<22:06,  1.14s/it]

{'loss': 1.2679, 'grad_norm': 45.64336013793945, 'learning_rate': 4.8333333333333334e-05, 'epoch': 1.67}


  4%|▍         | 48/1200 [00:59<19:52,  1.04s/it]
  4%|▍         | 48/1200 [01:03<19:52,  1.04s/it]

{'eval_loss': 1.2075865268707275, 'eval_runtime': 3.5196, 'eval_samples_per_second': 17.047, 'eval_steps_per_second': 2.273, 'epoch': 2.0}


  4%|▍         | 50/1200 [01:05<36:30,  1.91s/it]

{'loss': 1.3146, 'grad_norm': 53.9569091796875, 'learning_rate': 4.791666666666667e-05, 'epoch': 2.08}


  5%|▌         | 60/1200 [01:17<21:54,  1.15s/it]

{'loss': 1.1126, 'grad_norm': 66.23081970214844, 'learning_rate': 4.75e-05, 'epoch': 2.5}


  6%|▌         | 70/1200 [01:28<21:25,  1.14s/it]

{'loss': 1.0539, 'grad_norm': 33.212745666503906, 'learning_rate': 4.708333333333334e-05, 'epoch': 2.92}


  6%|▌         | 72/1200 [01:30<19:23,  1.03s/it]
  6%|▌         | 72/1200 [01:33<19:23,  1.03s/it]

{'eval_loss': 1.1426091194152832, 'eval_runtime': 3.4915, 'eval_samples_per_second': 17.184, 'eval_steps_per_second': 2.291, 'epoch': 3.0}


  7%|▋         | 80/1200 [01:43<23:10,  1.24s/it]

{'loss': 1.0887, 'grad_norm': 35.28169250488281, 'learning_rate': 4.666666666666667e-05, 'epoch': 3.33}


  8%|▊         | 90/1200 [01:54<21:11,  1.15s/it]

{'loss': 1.0968, 'grad_norm': 52.51437759399414, 'learning_rate': 4.6250000000000006e-05, 'epoch': 3.75}


  8%|▊         | 96/1200 [02:01<18:56,  1.03s/it]
  8%|▊         | 96/1200 [02:04<18:56,  1.03s/it]

{'eval_loss': 1.17392098903656, 'eval_runtime': 3.4818, 'eval_samples_per_second': 17.232, 'eval_steps_per_second': 2.298, 'epoch': 4.0}


  8%|▊         | 100/1200 [02:09<27:42,  1.51s/it]

{'loss': 1.0184, 'grad_norm': 35.90703582763672, 'learning_rate': 4.5833333333333334e-05, 'epoch': 4.17}


  9%|▉         | 110/1200 [02:21<21:07,  1.16s/it]

{'loss': 1.0403, 'grad_norm': 35.73445510864258, 'learning_rate': 4.541666666666667e-05, 'epoch': 4.58}


 10%|█         | 120/1200 [02:32<19:46,  1.10s/it]

{'loss': 1.0626, 'grad_norm': 42.64826965332031, 'learning_rate': 4.5e-05, 'epoch': 5.0}



 10%|█         | 120/1200 [02:36<19:46,  1.10s/it]

{'eval_loss': 1.111043095588684, 'eval_runtime': 3.6591, 'eval_samples_per_second': 16.398, 'eval_steps_per_second': 2.186, 'epoch': 5.0}


 11%|█         | 130/1200 [02:49<23:41,  1.33s/it]

{'loss': 1.0777, 'grad_norm': 30.40611457824707, 'learning_rate': 4.458333333333334e-05, 'epoch': 5.42}


 12%|█▏        | 140/1200 [03:01<21:11,  1.20s/it]

{'loss': 1.0194, 'grad_norm': 34.19078063964844, 'learning_rate': 4.4166666666666665e-05, 'epoch': 5.83}


 12%|█▏        | 144/1200 [03:05<19:47,  1.12s/it]
 12%|█▏        | 144/1200 [03:09<19:47,  1.12s/it]

{'eval_loss': 1.0929512977600098, 'eval_runtime': 3.635, 'eval_samples_per_second': 16.506, 'eval_steps_per_second': 2.201, 'epoch': 6.0}


 12%|█▎        | 150/1200 [03:16<23:37,  1.35s/it]

{'loss': 0.9841, 'grad_norm': 35.91096115112305, 'learning_rate': 4.375e-05, 'epoch': 6.25}


 13%|█▎        | 160/1200 [03:28<19:46,  1.14s/it]

{'loss': 1.011, 'grad_norm': 47.04050827026367, 'learning_rate': 4.3333333333333334e-05, 'epoch': 6.67}


 14%|█▍        | 168/1200 [03:36<17:38,  1.03s/it]
 14%|█▍        | 168/1200 [03:40<17:38,  1.03s/it]

{'eval_loss': 1.0742329359054565, 'eval_runtime': 3.4804, 'eval_samples_per_second': 17.239, 'eval_steps_per_second': 2.299, 'epoch': 7.0}


 14%|█▍        | 170/1200 [03:43<32:20,  1.88s/it]

{'loss': 0.8938, 'grad_norm': 46.459903717041016, 'learning_rate': 4.291666666666667e-05, 'epoch': 7.08}


 15%|█▌        | 180/1200 [03:54<19:39,  1.16s/it]

{'loss': 0.9034, 'grad_norm': 43.551631927490234, 'learning_rate': 4.25e-05, 'epoch': 7.5}


 16%|█▌        | 190/1200 [04:05<18:57,  1.13s/it]

{'loss': 0.9356, 'grad_norm': 24.883878707885742, 'learning_rate': 4.208333333333334e-05, 'epoch': 7.92}


 16%|█▌        | 192/1200 [04:07<17:12,  1.02s/it]
 16%|█▌        | 192/1200 [04:11<17:12,  1.02s/it]

{'eval_loss': 1.0774339437484741, 'eval_runtime': 3.442, 'eval_samples_per_second': 17.432, 'eval_steps_per_second': 2.324, 'epoch': 8.0}


 17%|█▋        | 200/1200 [04:20<20:05,  1.21s/it]

{'loss': 0.8653, 'grad_norm': 36.03822326660156, 'learning_rate': 4.166666666666667e-05, 'epoch': 8.33}


 18%|█▊        | 210/1200 [04:31<18:40,  1.13s/it]

{'loss': 0.9082, 'grad_norm': 32.983341217041016, 'learning_rate': 4.125e-05, 'epoch': 8.75}


 18%|█▊        | 216/1200 [04:38<16:48,  1.02s/it]
 18%|█▊        | 216/1200 [04:41<16:48,  1.02s/it]

{'eval_loss': 1.1376816034317017, 'eval_runtime': 3.4672, 'eval_samples_per_second': 17.305, 'eval_steps_per_second': 2.307, 'epoch': 9.0}


 18%|█▊        | 220/1200 [04:46<24:25,  1.50s/it]

{'loss': 0.962, 'grad_norm': 62.792049407958984, 'learning_rate': 4.0833333333333334e-05, 'epoch': 9.17}


 19%|█▉        | 230/1200 [04:57<18:31,  1.15s/it]

{'loss': 0.8223, 'grad_norm': 25.962841033935547, 'learning_rate': 4.041666666666667e-05, 'epoch': 9.58}


 20%|██        | 240/1200 [05:08<16:27,  1.03s/it]

{'loss': 0.9483, 'grad_norm': 34.47007369995117, 'learning_rate': 4e-05, 'epoch': 10.0}



 20%|██        | 240/1200 [05:12<16:27,  1.03s/it]

{'eval_loss': 1.1750428676605225, 'eval_runtime': 3.4736, 'eval_samples_per_second': 17.273, 'eval_steps_per_second': 2.303, 'epoch': 10.0}


 21%|██        | 250/1200 [05:23<18:26,  1.16s/it]

{'loss': 0.8656, 'grad_norm': 32.35223388671875, 'learning_rate': 3.958333333333333e-05, 'epoch': 10.42}


 22%|██▏       | 260/1200 [05:35<17:46,  1.13s/it]

{'loss': 0.8264, 'grad_norm': 51.4625129699707, 'learning_rate': 3.9166666666666665e-05, 'epoch': 10.83}


 22%|██▏       | 264/1200 [05:39<15:59,  1.03s/it]
 22%|██▏       | 264/1200 [05:42<15:59,  1.03s/it]

{'eval_loss': 0.9957117438316345, 'eval_runtime': 3.5028, 'eval_samples_per_second': 17.129, 'eval_steps_per_second': 2.284, 'epoch': 11.0}


 22%|██▎       | 270/1200 [05:49<20:16,  1.31s/it]

{'loss': 0.8696, 'grad_norm': 29.44060707092285, 'learning_rate': 3.875e-05, 'epoch': 11.25}


 23%|██▎       | 280/1200 [06:01<17:11,  1.12s/it]

{'loss': 0.8372, 'grad_norm': 36.73258590698242, 'learning_rate': 3.8333333333333334e-05, 'epoch': 11.67}


 24%|██▍       | 288/1200 [06:09<15:27,  1.02s/it]
 24%|██▍       | 288/1200 [06:13<15:27,  1.02s/it]

{'eval_loss': 0.9855015277862549, 'eval_runtime': 3.4512, 'eval_samples_per_second': 17.385, 'eval_steps_per_second': 2.318, 'epoch': 12.0}


 24%|██▍       | 290/1200 [06:15<28:31,  1.88s/it]

{'loss': 0.8792, 'grad_norm': 49.400386810302734, 'learning_rate': 3.791666666666667e-05, 'epoch': 12.08}


 25%|██▌       | 300/1200 [06:27<17:11,  1.15s/it]

{'loss': 0.7898, 'grad_norm': 28.812746047973633, 'learning_rate': 3.7500000000000003e-05, 'epoch': 12.5}


 26%|██▌       | 310/1200 [06:38<16:35,  1.12s/it]

{'loss': 0.8099, 'grad_norm': 58.42182540893555, 'learning_rate': 3.708333333333334e-05, 'epoch': 12.92}


 26%|██▌       | 312/1200 [06:40<15:05,  1.02s/it]
 26%|██▌       | 312/1200 [06:43<15:05,  1.02s/it]

{'eval_loss': 1.0290693044662476, 'eval_runtime': 3.4846, 'eval_samples_per_second': 17.218, 'eval_steps_per_second': 2.296, 'epoch': 13.0}


 27%|██▋       | 320/1200 [06:53<17:51,  1.22s/it]

{'loss': 0.9038, 'grad_norm': 61.0261344909668, 'learning_rate': 3.6666666666666666e-05, 'epoch': 13.33}


 28%|██▊       | 330/1200 [07:04<16:20,  1.13s/it]

{'loss': 0.7191, 'grad_norm': 41.714595794677734, 'learning_rate': 3.625e-05, 'epoch': 13.75}


 28%|██▊       | 336/1200 [07:10<14:43,  1.02s/it]
 28%|██▊       | 336/1200 [07:14<14:43,  1.02s/it]

{'eval_loss': 1.0684961080551147, 'eval_runtime': 3.4523, 'eval_samples_per_second': 17.379, 'eval_steps_per_second': 2.317, 'epoch': 14.0}


 28%|██▊       | 340/1200 [07:19<21:25,  1.49s/it]

{'loss': 0.7727, 'grad_norm': 26.327943801879883, 'learning_rate': 3.5833333333333335e-05, 'epoch': 14.17}


 29%|██▉       | 350/1200 [07:30<16:21,  1.15s/it]

{'loss': 0.7545, 'grad_norm': 23.69206428527832, 'learning_rate': 3.541666666666667e-05, 'epoch': 14.58}


 30%|███       | 360/1200 [07:41<15:01,  1.07s/it]

{'loss': 0.771, 'grad_norm': 31.524511337280273, 'learning_rate': 3.5e-05, 'epoch': 15.0}



 30%|███       | 360/1200 [07:45<15:01,  1.07s/it]

{'eval_loss': 1.0248286724090576, 'eval_runtime': 3.4988, 'eval_samples_per_second': 17.149, 'eval_steps_per_second': 2.287, 'epoch': 15.0}


 31%|███       | 370/1200 [07:57<16:14,  1.17s/it]

{'loss': 0.6605, 'grad_norm': 62.323795318603516, 'learning_rate': 3.458333333333333e-05, 'epoch': 15.42}


 32%|███▏      | 380/1200 [08:08<15:24,  1.13s/it]

{'loss': 0.703, 'grad_norm': 25.634899139404297, 'learning_rate': 3.4166666666666666e-05, 'epoch': 15.83}


 32%|███▏      | 384/1200 [08:12<14:05,  1.04s/it]
 32%|███▏      | 384/1200 [08:15<14:05,  1.04s/it]

{'eval_loss': 1.047667384147644, 'eval_runtime': 3.3513, 'eval_samples_per_second': 17.903, 'eval_steps_per_second': 2.387, 'epoch': 16.0}


 32%|███▎      | 390/1200 [08:23<17:43,  1.31s/it]

{'loss': 0.6826, 'grad_norm': 24.394603729248047, 'learning_rate': 3.375000000000001e-05, 'epoch': 16.25}


 33%|███▎      | 400/1200 [08:34<15:11,  1.14s/it]

{'loss': 0.6598, 'grad_norm': 36.094390869140625, 'learning_rate': 3.3333333333333335e-05, 'epoch': 16.67}


 34%|███▍      | 408/1200 [08:43<13:27,  1.02s/it]
 34%|███▍      | 408/1200 [08:46<13:27,  1.02s/it]

{'eval_loss': 1.1325603723526, 'eval_runtime': 3.4804, 'eval_samples_per_second': 17.24, 'eval_steps_per_second': 2.299, 'epoch': 17.0}


 34%|███▍      | 410/1200 [08:49<24:46,  1.88s/it]

{'loss': 0.6479, 'grad_norm': 30.846467971801758, 'learning_rate': 3.291666666666667e-05, 'epoch': 17.08}


 35%|███▌      | 420/1200 [09:00<15:08,  1.16s/it]

{'loss': 0.6829, 'grad_norm': 21.177974700927734, 'learning_rate': 3.2500000000000004e-05, 'epoch': 17.5}


 36%|███▌      | 430/1200 [09:12<14:26,  1.12s/it]

{'loss': 0.7043, 'grad_norm': 26.681184768676758, 'learning_rate': 3.208333333333334e-05, 'epoch': 17.92}


 36%|███▌      | 432/1200 [09:14<13:06,  1.02s/it]
 36%|███▌      | 432/1200 [09:17<13:06,  1.02s/it]

{'eval_loss': 0.9623064398765564, 'eval_runtime': 3.4646, 'eval_samples_per_second': 17.318, 'eval_steps_per_second': 2.309, 'epoch': 18.0}


 37%|███▋      | 440/1200 [09:26<15:15,  1.20s/it]

{'loss': 0.6123, 'grad_norm': 39.31587219238281, 'learning_rate': 3.1666666666666666e-05, 'epoch': 18.33}


 38%|███▊      | 450/1200 [09:38<14:07,  1.13s/it]

{'loss': 0.6523, 'grad_norm': 40.84862518310547, 'learning_rate': 3.125e-05, 'epoch': 18.75}


 38%|███▊      | 456/1200 [09:44<12:38,  1.02s/it]
 38%|███▊      | 456/1200 [09:48<12:38,  1.02s/it]

{'eval_loss': 0.9761972427368164, 'eval_runtime': 3.4312, 'eval_samples_per_second': 17.487, 'eval_steps_per_second': 2.332, 'epoch': 19.0}


 38%|███▊      | 460/1200 [09:52<18:29,  1.50s/it]

{'loss': 0.6049, 'grad_norm': 31.37299156188965, 'learning_rate': 3.0833333333333335e-05, 'epoch': 19.17}


 39%|███▉      | 470/1200 [10:04<13:58,  1.15s/it]

{'loss': 0.6059, 'grad_norm': 33.0498161315918, 'learning_rate': 3.0416666666666666e-05, 'epoch': 19.58}


 40%|████      | 480/1200 [10:15<12:21,  1.03s/it]

{'loss': 0.5796, 'grad_norm': 30.33670425415039, 'learning_rate': 3e-05, 'epoch': 20.0}



 40%|████      | 480/1200 [10:18<12:21,  1.03s/it]

{'eval_loss': 0.996796190738678, 'eval_runtime': 3.4441, 'eval_samples_per_second': 17.421, 'eval_steps_per_second': 2.323, 'epoch': 20.0}


 41%|████      | 490/1200 [10:30<13:45,  1.16s/it]

{'loss': 0.6055, 'grad_norm': 28.743432998657227, 'learning_rate': 2.9583333333333335e-05, 'epoch': 20.42}


 42%|████▏     | 500/1200 [10:41<13:06,  1.12s/it]

{'loss': 0.565, 'grad_norm': 44.64845657348633, 'learning_rate': 2.916666666666667e-05, 'epoch': 20.83}


 42%|████▏     | 504/1200 [10:45<12:11,  1.05s/it]
 42%|████▏     | 504/1200 [10:49<12:11,  1.05s/it]

{'eval_loss': 1.0415679216384888, 'eval_runtime': 3.526, 'eval_samples_per_second': 17.016, 'eval_steps_per_second': 2.269, 'epoch': 21.0}


 42%|████▎     | 510/1200 [10:56<15:20,  1.33s/it]

{'loss': 0.54, 'grad_norm': 40.299861907958984, 'learning_rate': 2.8749999999999997e-05, 'epoch': 21.25}


 43%|████▎     | 520/1200 [11:08<13:01,  1.15s/it]

{'loss': 0.5208, 'grad_norm': 21.501684188842773, 'learning_rate': 2.8333333333333335e-05, 'epoch': 21.67}


 44%|████▍     | 528/1200 [11:17<11:53,  1.06s/it]
 44%|████▍     | 528/1200 [11:20<11:53,  1.06s/it]

{'eval_loss': 0.986933171749115, 'eval_runtime': 3.5484, 'eval_samples_per_second': 16.909, 'eval_steps_per_second': 2.255, 'epoch': 22.0}


 44%|████▍     | 530/1200 [11:23<21:31,  1.93s/it]

{'loss': 0.5371, 'grad_norm': 55.047325134277344, 'learning_rate': 2.791666666666667e-05, 'epoch': 22.08}


 45%|████▌     | 540/1200 [11:34<12:50,  1.17s/it]

{'loss': 0.5292, 'grad_norm': 39.64912033081055, 'learning_rate': 2.7500000000000004e-05, 'epoch': 22.5}


 46%|████▌     | 550/1200 [11:46<12:13,  1.13s/it]

{'loss': 0.5196, 'grad_norm': 26.22129249572754, 'learning_rate': 2.7083333333333332e-05, 'epoch': 22.92}


 46%|████▌     | 552/1200 [11:47<11:03,  1.02s/it]
 46%|████▌     | 552/1200 [11:51<11:03,  1.02s/it]

{'eval_loss': 0.9682430624961853, 'eval_runtime': 3.5231, 'eval_samples_per_second': 17.03, 'eval_steps_per_second': 2.271, 'epoch': 23.0}


 47%|████▋     | 560/1200 [12:00<13:03,  1.22s/it]

{'loss': 0.4938, 'grad_norm': 25.249244689941406, 'learning_rate': 2.6666666666666667e-05, 'epoch': 23.33}


 48%|████▊     | 570/1200 [12:12<11:54,  1.13s/it]

{'loss': 0.4447, 'grad_norm': 41.31029510498047, 'learning_rate': 2.625e-05, 'epoch': 23.75}


 48%|████▊     | 576/1200 [12:19<11:14,  1.08s/it]
 48%|████▊     | 576/1200 [12:22<11:14,  1.08s/it]

{'eval_loss': 0.9852283596992493, 'eval_runtime': 3.6689, 'eval_samples_per_second': 16.353, 'eval_steps_per_second': 2.18, 'epoch': 24.0}


 48%|████▊     | 580/1200 [12:27<16:16,  1.57s/it]

{'loss': 0.4288, 'grad_norm': 22.649381637573242, 'learning_rate': 2.5833333333333336e-05, 'epoch': 24.17}


 49%|████▉     | 590/1200 [12:39<12:16,  1.21s/it]

{'loss': 0.443, 'grad_norm': 35.47003936767578, 'learning_rate': 2.5416666666666667e-05, 'epoch': 24.58}


 50%|█████     | 600/1200 [12:51<10:38,  1.06s/it]

{'loss': 0.5296, 'grad_norm': 43.11166763305664, 'learning_rate': 2.5e-05, 'epoch': 25.0}



 50%|█████     | 600/1200 [12:54<10:38,  1.06s/it]

{'eval_loss': 0.9691581726074219, 'eval_runtime': 3.7085, 'eval_samples_per_second': 16.179, 'eval_steps_per_second': 2.157, 'epoch': 25.0}


 51%|█████     | 610/1200 [13:06<11:49,  1.20s/it]

{'loss': 0.4231, 'grad_norm': 30.968318939208984, 'learning_rate': 2.4583333333333332e-05, 'epoch': 25.42}


 52%|█████▏    | 620/1200 [13:18<10:49,  1.12s/it]

{'loss': 0.4453, 'grad_norm': 30.951915740966797, 'learning_rate': 2.4166666666666667e-05, 'epoch': 25.83}


 52%|█████▏    | 624/1200 [13:22<09:58,  1.04s/it]
 52%|█████▏    | 624/1200 [13:25<09:58,  1.04s/it]

{'eval_loss': 0.9849132895469666, 'eval_runtime': 3.2986, 'eval_samples_per_second': 18.19, 'eval_steps_per_second': 2.425, 'epoch': 26.0}


 52%|█████▎    | 630/1200 [13:32<12:13,  1.29s/it]

{'loss': 0.4482, 'grad_norm': 54.42508316040039, 'learning_rate': 2.375e-05, 'epoch': 26.25}


 53%|█████▎    | 640/1200 [13:44<11:02,  1.18s/it]

{'loss': 0.4161, 'grad_norm': 20.060016632080078, 'learning_rate': 2.3333333333333336e-05, 'epoch': 26.67}


 54%|█████▍    | 648/1200 [13:52<09:26,  1.03s/it]
 54%|█████▍    | 648/1200 [13:56<09:26,  1.03s/it]

{'eval_loss': 0.9480457901954651, 'eval_runtime': 3.3562, 'eval_samples_per_second': 17.877, 'eval_steps_per_second': 2.384, 'epoch': 27.0}


 54%|█████▍    | 650/1200 [13:58<17:03,  1.86s/it]

{'loss': 0.4399, 'grad_norm': 29.088151931762695, 'learning_rate': 2.2916666666666667e-05, 'epoch': 27.08}


 55%|█████▌    | 660/1200 [14:09<10:15,  1.14s/it]

{'loss': 0.4085, 'grad_norm': 38.655242919921875, 'learning_rate': 2.25e-05, 'epoch': 27.5}


 56%|█████▌    | 670/1200 [14:21<10:05,  1.14s/it]

{'loss': 0.4027, 'grad_norm': 28.5137939453125, 'learning_rate': 2.2083333333333333e-05, 'epoch': 27.92}


 56%|█████▌    | 672/1200 [14:23<09:09,  1.04s/it]
 56%|█████▌    | 672/1200 [14:26<09:09,  1.04s/it]

{'eval_loss': 0.9310138821601868, 'eval_runtime': 3.4557, 'eval_samples_per_second': 17.362, 'eval_steps_per_second': 2.315, 'epoch': 28.0}


 57%|█████▋    | 680/1200 [14:36<10:28,  1.21s/it]

{'loss': 0.3742, 'grad_norm': 23.81414031982422, 'learning_rate': 2.1666666666666667e-05, 'epoch': 28.33}


 57%|█████▊    | 690/1200 [14:47<09:26,  1.11s/it]

{'loss': 0.3812, 'grad_norm': 29.293973922729492, 'learning_rate': 2.125e-05, 'epoch': 28.75}


 58%|█████▊    | 696/1200 [14:53<08:31,  1.01s/it]
 58%|█████▊    | 696/1200 [14:56<08:31,  1.01s/it]

{'eval_loss': 0.991721510887146, 'eval_runtime': 3.2736, 'eval_samples_per_second': 18.328, 'eval_steps_per_second': 2.444, 'epoch': 29.0}


 58%|█████▊    | 700/1200 [15:01<12:20,  1.48s/it]

{'loss': 0.3493, 'grad_norm': 34.44700241088867, 'learning_rate': 2.0833333333333336e-05, 'epoch': 29.17}


 59%|█████▉    | 710/1200 [15:12<09:08,  1.12s/it]

{'loss': 0.3526, 'grad_norm': 24.197019577026367, 'learning_rate': 2.0416666666666667e-05, 'epoch': 29.58}


 60%|██████    | 720/1200 [15:23<08:05,  1.01s/it]

{'loss': 0.3574, 'grad_norm': 24.7655086517334, 'learning_rate': 2e-05, 'epoch': 30.0}



 60%|██████    | 720/1200 [15:26<08:05,  1.01s/it]

{'eval_loss': 0.9464970827102661, 'eval_runtime': 3.2726, 'eval_samples_per_second': 18.334, 'eval_steps_per_second': 2.445, 'epoch': 30.0}


 61%|██████    | 730/1200 [15:38<09:08,  1.17s/it]

{'loss': 0.2957, 'grad_norm': 36.75757598876953, 'learning_rate': 1.9583333333333333e-05, 'epoch': 30.42}


 62%|██████▏   | 740/1200 [15:49<08:35,  1.12s/it]

{'loss': 0.3003, 'grad_norm': 39.20328903198242, 'learning_rate': 1.9166666666666667e-05, 'epoch': 30.83}


 62%|██████▏   | 744/1200 [15:53<07:47,  1.02s/it]
 62%|██████▏   | 744/1200 [15:57<07:47,  1.02s/it]

{'eval_loss': 0.9839399456977844, 'eval_runtime': 3.383, 'eval_samples_per_second': 17.736, 'eval_steps_per_second': 2.365, 'epoch': 31.0}


 62%|██████▎   | 750/1200 [16:04<09:48,  1.31s/it]

{'loss': 0.373, 'grad_norm': 40.044776916503906, 'learning_rate': 1.8750000000000002e-05, 'epoch': 31.25}


 63%|██████▎   | 760/1200 [16:16<08:28,  1.16s/it]

{'loss': 0.3183, 'grad_norm': 35.29056167602539, 'learning_rate': 1.8333333333333333e-05, 'epoch': 31.67}


 64%|██████▍   | 768/1200 [16:24<07:20,  1.02s/it]
 64%|██████▍   | 768/1200 [16:27<07:20,  1.02s/it]

{'eval_loss': 0.965093731880188, 'eval_runtime': 3.2741, 'eval_samples_per_second': 18.326, 'eval_steps_per_second': 2.443, 'epoch': 32.0}


 64%|██████▍   | 770/1200 [16:30<13:09,  1.84s/it]

{'loss': 0.2845, 'grad_norm': 42.26608657836914, 'learning_rate': 1.7916666666666667e-05, 'epoch': 32.08}


 65%|██████▌   | 780/1200 [16:41<07:54,  1.13s/it]

{'loss': 0.2691, 'grad_norm': 27.64021110534668, 'learning_rate': 1.75e-05, 'epoch': 32.5}


 66%|██████▌   | 790/1200 [16:52<07:33,  1.11s/it]

{'loss': 0.3239, 'grad_norm': 35.31296157836914, 'learning_rate': 1.7083333333333333e-05, 'epoch': 32.92}


 66%|██████▌   | 792/1200 [16:54<06:52,  1.01s/it]
 66%|██████▌   | 792/1200 [16:57<06:52,  1.01s/it]

{'eval_loss': 0.9614853262901306, 'eval_runtime': 3.2766, 'eval_samples_per_second': 18.311, 'eval_steps_per_second': 2.442, 'epoch': 33.0}


 67%|██████▋   | 800/1200 [17:07<08:03,  1.21s/it]

{'loss': 0.285, 'grad_norm': 27.223804473876953, 'learning_rate': 1.6666666666666667e-05, 'epoch': 33.33}


 68%|██████▊   | 810/1200 [17:18<07:14,  1.12s/it]

{'loss': 0.2802, 'grad_norm': 28.733306884765625, 'learning_rate': 1.6250000000000002e-05, 'epoch': 33.75}


 68%|██████▊   | 816/1200 [17:24<06:28,  1.01s/it]
 68%|██████▊   | 816/1200 [17:28<06:28,  1.01s/it]

{'eval_loss': 0.953344464302063, 'eval_runtime': 3.3178, 'eval_samples_per_second': 18.084, 'eval_steps_per_second': 2.411, 'epoch': 34.0}


 68%|██████▊   | 820/1200 [17:32<09:18,  1.47s/it]

{'loss': 0.2494, 'grad_norm': 42.81733703613281, 'learning_rate': 1.5833333333333333e-05, 'epoch': 34.17}


 69%|██████▉   | 830/1200 [17:44<06:54,  1.12s/it]

{'loss': 0.2463, 'grad_norm': 24.563913345336914, 'learning_rate': 1.5416666666666668e-05, 'epoch': 34.58}


 70%|███████   | 840/1200 [17:54<06:03,  1.01s/it]

{'loss': 0.2599, 'grad_norm': 28.939617156982422, 'learning_rate': 1.5e-05, 'epoch': 35.0}



 70%|███████   | 840/1200 [17:58<06:03,  1.01s/it]

{'eval_loss': 0.9735797047615051, 'eval_runtime': 3.2779, 'eval_samples_per_second': 18.304, 'eval_steps_per_second': 2.441, 'epoch': 35.0}


 71%|███████   | 850/1200 [18:09<06:49,  1.17s/it]

{'loss': 0.2454, 'grad_norm': 39.70338821411133, 'learning_rate': 1.4583333333333335e-05, 'epoch': 35.42}


 72%|███████▏  | 860/1200 [18:21<06:35,  1.16s/it]

{'loss': 0.2341, 'grad_norm': 20.13418197631836, 'learning_rate': 1.4166666666666668e-05, 'epoch': 35.83}


 72%|███████▏  | 864/1200 [18:25<05:51,  1.05s/it]
 72%|███████▏  | 864/1200 [18:29<05:51,  1.05s/it]

{'eval_loss': 0.958763837814331, 'eval_runtime': 3.6872, 'eval_samples_per_second': 16.273, 'eval_steps_per_second': 2.17, 'epoch': 36.0}


 72%|███████▎  | 870/1200 [18:36<07:21,  1.34s/it]

{'loss': 0.2267, 'grad_norm': 63.387611389160156, 'learning_rate': 1.3750000000000002e-05, 'epoch': 36.25}


 73%|███████▎  | 880/1200 [18:47<06:02,  1.13s/it]

{'loss': 0.2444, 'grad_norm': 24.292497634887695, 'learning_rate': 1.3333333333333333e-05, 'epoch': 36.67}


 74%|███████▍  | 888/1200 [18:56<05:26,  1.05s/it]
 74%|███████▍  | 888/1200 [18:59<05:26,  1.05s/it]

{'eval_loss': 1.0146052837371826, 'eval_runtime': 3.3559, 'eval_samples_per_second': 17.879, 'eval_steps_per_second': 2.384, 'epoch': 37.0}


 74%|███████▍  | 890/1200 [19:02<09:37,  1.86s/it]

{'loss': 0.2311, 'grad_norm': 53.771060943603516, 'learning_rate': 1.2916666666666668e-05, 'epoch': 37.08}


 75%|███████▌  | 900/1200 [19:13<05:42,  1.14s/it]

{'loss': 0.2534, 'grad_norm': 57.21837615966797, 'learning_rate': 1.25e-05, 'epoch': 37.5}


 76%|███████▌  | 910/1200 [19:25<05:33,  1.15s/it]

{'loss': 0.2014, 'grad_norm': 19.89214515686035, 'learning_rate': 1.2083333333333333e-05, 'epoch': 37.92}


 76%|███████▌  | 912/1200 [19:27<04:59,  1.04s/it]
 76%|███████▌  | 912/1200 [19:31<04:59,  1.04s/it]

{'eval_loss': 0.9703168272972107, 'eval_runtime': 3.5383, 'eval_samples_per_second': 16.957, 'eval_steps_per_second': 2.261, 'epoch': 38.0}


 77%|███████▋  | 920/1200 [19:40<05:40,  1.22s/it]

{'loss': 0.2214, 'grad_norm': 31.015676498413086, 'learning_rate': 1.1666666666666668e-05, 'epoch': 38.33}


 78%|███████▊  | 930/1200 [19:52<05:10,  1.15s/it]

{'loss': 0.1993, 'grad_norm': 25.792768478393555, 'learning_rate': 1.125e-05, 'epoch': 38.75}


 78%|███████▊  | 936/1200 [19:58<04:34,  1.04s/it]
 78%|███████▊  | 936/1200 [20:02<04:34,  1.04s/it]

{'eval_loss': 1.0190386772155762, 'eval_runtime': 3.44, 'eval_samples_per_second': 17.442, 'eval_steps_per_second': 2.326, 'epoch': 39.0}


 78%|███████▊  | 940/1200 [20:07<06:32,  1.51s/it]

{'loss': 0.1928, 'grad_norm': 48.38698196411133, 'learning_rate': 1.0833333333333334e-05, 'epoch': 39.17}


 79%|███████▉  | 950/1200 [20:18<04:47,  1.15s/it]

{'loss': 0.2237, 'grad_norm': 27.06391716003418, 'learning_rate': 1.0416666666666668e-05, 'epoch': 39.58}


 80%|████████  | 960/1200 [20:29<04:06,  1.03s/it]

{'loss': 0.1762, 'grad_norm': 48.89309310913086, 'learning_rate': 1e-05, 'epoch': 40.0}



 80%|████████  | 960/1200 [20:33<04:06,  1.03s/it]

{'eval_loss': 1.0212231874465942, 'eval_runtime': 3.5218, 'eval_samples_per_second': 17.037, 'eval_steps_per_second': 2.272, 'epoch': 40.0}


 81%|████████  | 970/1200 [20:44<04:29,  1.17s/it]

{'loss': 0.1904, 'grad_norm': 30.236618041992188, 'learning_rate': 9.583333333333334e-06, 'epoch': 40.42}


 82%|████████▏ | 980/1200 [20:56<04:09,  1.13s/it]

{'loss': 0.169, 'grad_norm': 17.584463119506836, 'learning_rate': 9.166666666666666e-06, 'epoch': 40.83}


 82%|████████▏ | 984/1200 [21:00<03:42,  1.03s/it]
 82%|████████▏ | 984/1200 [21:04<03:42,  1.03s/it]

{'eval_loss': 1.0114353895187378, 'eval_runtime': 3.5845, 'eval_samples_per_second': 16.739, 'eval_steps_per_second': 2.232, 'epoch': 41.0}


 82%|████████▎ | 990/1200 [21:11<04:34,  1.31s/it]

{'loss': 0.1838, 'grad_norm': 24.550806045532227, 'learning_rate': 8.75e-06, 'epoch': 41.25}


 83%|████████▎ | 1000/1200 [21:22<03:53,  1.17s/it]

{'loss': 0.1713, 'grad_norm': 23.808353424072266, 'learning_rate': 8.333333333333334e-06, 'epoch': 41.67}


 84%|████████▍ | 1008/1200 [21:31<03:22,  1.05s/it]
 84%|████████▍ | 1008/1200 [21:35<03:22,  1.05s/it]

{'eval_loss': 1.0257980823516846, 'eval_runtime': 3.6676, 'eval_samples_per_second': 16.359, 'eval_steps_per_second': 2.181, 'epoch': 42.0}


 84%|████████▍ | 1010/1200 [21:37<06:10,  1.95s/it]

{'loss': 0.1768, 'grad_norm': 25.41652488708496, 'learning_rate': 7.916666666666667e-06, 'epoch': 42.08}


 85%|████████▌ | 1020/1200 [21:49<03:30,  1.17s/it]

{'loss': 0.1746, 'grad_norm': 23.91901397705078, 'learning_rate': 7.5e-06, 'epoch': 42.5}


 86%|████████▌ | 1030/1200 [22:00<03:12,  1.13s/it]

{'loss': 0.1575, 'grad_norm': 47.205894470214844, 'learning_rate': 7.083333333333334e-06, 'epoch': 42.92}


 86%|████████▌ | 1032/1200 [22:02<02:53,  1.03s/it]
 86%|████████▌ | 1032/1200 [22:06<02:53,  1.03s/it]

{'eval_loss': 1.0292760133743286, 'eval_runtime': 3.4583, 'eval_samples_per_second': 17.35, 'eval_steps_per_second': 2.313, 'epoch': 43.0}


 87%|████████▋ | 1040/1200 [22:15<03:13,  1.21s/it]

{'loss': 0.1363, 'grad_norm': 32.834266662597656, 'learning_rate': 6.666666666666667e-06, 'epoch': 43.33}


 88%|████████▊ | 1050/1200 [22:26<02:48,  1.13s/it]

{'loss': 0.1536, 'grad_norm': 27.688234329223633, 'learning_rate': 6.25e-06, 'epoch': 43.75}


 88%|████████▊ | 1056/1200 [22:33<02:30,  1.05s/it]
 88%|████████▊ | 1056/1200 [22:36<02:30,  1.05s/it]

{'eval_loss': 1.0196070671081543, 'eval_runtime': 3.4552, 'eval_samples_per_second': 17.365, 'eval_steps_per_second': 2.315, 'epoch': 44.0}


 88%|████████▊ | 1060/1200 [22:41<03:30,  1.51s/it]

{'loss': 0.156, 'grad_norm': 29.500364303588867, 'learning_rate': 5.833333333333334e-06, 'epoch': 44.17}


 89%|████████▉ | 1070/1200 [22:52<02:28,  1.14s/it]

{'loss': 0.1402, 'grad_norm': 67.59264373779297, 'learning_rate': 5.416666666666667e-06, 'epoch': 44.58}


 90%|█████████ | 1080/1200 [23:03<02:04,  1.04s/it]

{'loss': 0.1442, 'grad_norm': 36.154972076416016, 'learning_rate': 5e-06, 'epoch': 45.0}



 90%|█████████ | 1080/1200 [23:07<02:04,  1.04s/it]

{'eval_loss': 1.0358890295028687, 'eval_runtime': 3.5846, 'eval_samples_per_second': 16.738, 'eval_steps_per_second': 2.232, 'epoch': 45.0}


 91%|█████████ | 1090/1200 [23:19<02:09,  1.17s/it]

{'loss': 0.1125, 'grad_norm': 38.23357009887695, 'learning_rate': 4.583333333333333e-06, 'epoch': 45.42}


 92%|█████████▏| 1100/1200 [23:30<01:54,  1.14s/it]

{'loss': 0.1337, 'grad_norm': 20.861005783081055, 'learning_rate': 4.166666666666667e-06, 'epoch': 45.83}


 92%|█████████▏| 1104/1200 [23:34<01:39,  1.04s/it]
 92%|█████████▏| 1104/1200 [23:38<01:39,  1.04s/it]

{'eval_loss': 1.0331833362579346, 'eval_runtime': 3.4871, 'eval_samples_per_second': 17.206, 'eval_steps_per_second': 2.294, 'epoch': 46.0}


 92%|█████████▎| 1110/1200 [23:45<01:58,  1.32s/it]

{'loss': 0.1769, 'grad_norm': 31.215471267700195, 'learning_rate': 3.75e-06, 'epoch': 46.25}


 93%|█████████▎| 1120/1200 [23:56<01:30,  1.13s/it]

{'loss': 0.1196, 'grad_norm': 19.547391891479492, 'learning_rate': 3.3333333333333333e-06, 'epoch': 46.67}


 94%|█████████▍| 1128/1200 [24:05<01:13,  1.03s/it]
 94%|█████████▍| 1128/1200 [24:08<01:13,  1.03s/it]

{'eval_loss': 1.036226749420166, 'eval_runtime': 3.4766, 'eval_samples_per_second': 17.258, 'eval_steps_per_second': 2.301, 'epoch': 47.0}


 94%|█████████▍| 1130/1200 [24:11<02:12,  1.89s/it]

{'loss': 0.1082, 'grad_norm': 31.638479232788086, 'learning_rate': 2.916666666666667e-06, 'epoch': 47.08}


 95%|█████████▌| 1140/1200 [24:22<01:08,  1.15s/it]

{'loss': 0.0949, 'grad_norm': 17.711753845214844, 'learning_rate': 2.5e-06, 'epoch': 47.5}


 96%|█████████▌| 1150/1200 [24:34<00:56,  1.14s/it]

{'loss': 0.1106, 'grad_norm': 25.65234375, 'learning_rate': 2.0833333333333334e-06, 'epoch': 47.92}


 96%|█████████▌| 1152/1200 [24:36<00:49,  1.03s/it]
 96%|█████████▌| 1152/1200 [24:39<00:49,  1.03s/it]

{'eval_loss': 1.0354410409927368, 'eval_runtime': 3.4667, 'eval_samples_per_second': 17.308, 'eval_steps_per_second': 2.308, 'epoch': 48.0}


 97%|█████████▋| 1160/1200 [24:49<00:48,  1.21s/it]

{'loss': 0.1236, 'grad_norm': 35.002986907958984, 'learning_rate': 1.6666666666666667e-06, 'epoch': 48.33}


 98%|█████████▊| 1170/1200 [25:00<00:34,  1.14s/it]

{'loss': 0.1043, 'grad_norm': 52.733642578125, 'learning_rate': 1.25e-06, 'epoch': 48.75}


 98%|█████████▊| 1176/1200 [25:06<00:24,  1.03s/it]
 98%|█████████▊| 1176/1200 [25:10<00:24,  1.03s/it]

{'eval_loss': 1.0330302715301514, 'eval_runtime': 3.4568, 'eval_samples_per_second': 17.357, 'eval_steps_per_second': 2.314, 'epoch': 49.0}


 98%|█████████▊| 1180/1200 [25:15<00:29,  1.50s/it]

{'loss': 0.0841, 'grad_norm': 17.910795211791992, 'learning_rate': 8.333333333333333e-07, 'epoch': 49.17}


 99%|█████████▉| 1190/1200 [25:26<00:11,  1.14s/it]

{'loss': 0.1147, 'grad_norm': 24.065994262695312, 'learning_rate': 4.1666666666666667e-07, 'epoch': 49.58}


100%|██████████| 1200/1200 [25:37<00:00,  1.03s/it]

{'loss': 0.0945, 'grad_norm': 40.51728820800781, 'learning_rate': 0.0, 'epoch': 50.0}



100%|██████████| 1200/1200 [25:40<00:00,  1.28s/it]


{'eval_loss': 1.0353659391403198, 'eval_runtime': 3.4755, 'eval_samples_per_second': 17.264, 'eval_steps_per_second': 2.302, 'epoch': 50.0}
{'train_runtime': 1540.8784, 'train_samples_per_second': 7.788, 'train_steps_per_second': 0.779, 'train_loss': 0.5318161080777645, 'epoch': 50.0}


100%|██████████| 8/8 [00:02<00:00,  2.93it/s]
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection

Using device: cuda:0


  1%|          | 10/1200 [00:09<18:05,  1.10it/s]

{'loss': 1.8299, 'grad_norm': 50.28973388671875, 'learning_rate': 4.958333333333334e-05, 'epoch': 0.42}


  2%|▏         | 20/1200 [00:18<17:52,  1.10it/s]

{'loss': 1.6478, 'grad_norm': 252.32957458496094, 'learning_rate': 4.9166666666666665e-05, 'epoch': 0.83}


  2%|▏         | 24/1200 [00:22<15:41,  1.25it/s]
  2%|▏         | 24/1200 [00:25<15:41,  1.25it/s]

{'eval_loss': 1.6982085704803467, 'eval_runtime': 3.8554, 'eval_samples_per_second': 15.563, 'eval_steps_per_second': 2.075, 'epoch': 1.0}


  2%|▎         | 30/1200 [00:31<21:30,  1.10s/it]

{'loss': 1.8947, 'grad_norm': 149.8741912841797, 'learning_rate': 4.875e-05, 'epoch': 1.25}


  3%|▎         | 40/1200 [00:40<17:28,  1.11it/s]

{'loss': 1.7271, 'grad_norm': 205.10186767578125, 'learning_rate': 4.8333333333333334e-05, 'epoch': 1.67}


  4%|▍         | 48/1200 [00:47<15:30,  1.24it/s]
  4%|▍         | 48/1200 [00:51<15:30,  1.24it/s]

{'eval_loss': 1.562186360359192, 'eval_runtime': 3.8022, 'eval_samples_per_second': 15.78, 'eval_steps_per_second': 2.104, 'epoch': 2.0}


  4%|▍         | 50/1200 [00:53<33:18,  1.74s/it]

{'loss': 1.7113, 'grad_norm': 95.89041900634766, 'learning_rate': 4.791666666666667e-05, 'epoch': 2.08}


  5%|▌         | 60/1200 [01:02<17:23,  1.09it/s]

{'loss': 2.4956, 'grad_norm': 102.75077819824219, 'learning_rate': 4.75e-05, 'epoch': 2.5}


  6%|▌         | 70/1200 [01:11<17:10,  1.10it/s]

{'loss': 1.7535, 'grad_norm': 100.1623764038086, 'learning_rate': 4.708333333333334e-05, 'epoch': 2.92}


  6%|▌         | 72/1200 [01:13<15:06,  1.25it/s]
  6%|▌         | 72/1200 [01:17<15:06,  1.25it/s]

{'eval_loss': 1.5298614501953125, 'eval_runtime': 3.8283, 'eval_samples_per_second': 15.673, 'eval_steps_per_second': 2.09, 'epoch': 3.0}


  7%|▋         | 80/1200 [01:24<18:45,  1.00s/it]

{'loss': 1.7322, 'grad_norm': 282.79913330078125, 'learning_rate': 4.666666666666667e-05, 'epoch': 3.33}


  8%|▊         | 90/1200 [01:33<16:36,  1.11it/s]

{'loss': 1.3561, 'grad_norm': 39.54118347167969, 'learning_rate': 4.6250000000000006e-05, 'epoch': 3.75}


  8%|▊         | 96/1200 [01:38<14:41,  1.25it/s]
  8%|▊         | 96/1200 [01:42<14:41,  1.25it/s]

{'eval_loss': 1.4156373739242554, 'eval_runtime': 3.7372, 'eval_samples_per_second': 16.055, 'eval_steps_per_second': 2.141, 'epoch': 4.0}


  8%|▊         | 100/1200 [01:46<23:38,  1.29s/it]

{'loss': 1.4115, 'grad_norm': 86.21879577636719, 'learning_rate': 4.5833333333333334e-05, 'epoch': 4.17}


  9%|▉         | 110/1200 [01:55<16:31,  1.10it/s]

{'loss': 1.2251, 'grad_norm': 144.31884765625, 'learning_rate': 4.541666666666667e-05, 'epoch': 4.58}


 10%|█         | 120/1200 [02:04<14:04,  1.28it/s]

{'loss': 1.3995, 'grad_norm': 69.148193359375, 'learning_rate': 4.5e-05, 'epoch': 5.0}



 10%|█         | 120/1200 [02:07<14:04,  1.28it/s]

{'eval_loss': 1.3480989933013916, 'eval_runtime': 3.9295, 'eval_samples_per_second': 15.269, 'eval_steps_per_second': 2.036, 'epoch': 5.0}


 11%|█         | 130/1200 [02:17<16:50,  1.06it/s]

{'loss': 1.3588, 'grad_norm': 56.560523986816406, 'learning_rate': 4.458333333333334e-05, 'epoch': 5.42}


 12%|█▏        | 140/1200 [02:26<15:54,  1.11it/s]

{'loss': 1.2345, 'grad_norm': 51.28807067871094, 'learning_rate': 4.4166666666666665e-05, 'epoch': 5.83}


 12%|█▏        | 144/1200 [02:29<13:58,  1.26it/s]
 12%|█▏        | 144/1200 [02:33<13:58,  1.26it/s]

{'eval_loss': 1.2596144676208496, 'eval_runtime': 3.7324, 'eval_samples_per_second': 16.075, 'eval_steps_per_second': 2.143, 'epoch': 6.0}


 12%|█▎        | 150/1200 [02:39<19:06,  1.09s/it]

{'loss': 1.0373, 'grad_norm': 59.49148178100586, 'learning_rate': 4.375e-05, 'epoch': 6.25}


 13%|█▎        | 160/1200 [02:48<15:42,  1.10it/s]

{'loss': 1.5464, 'grad_norm': 157.49903869628906, 'learning_rate': 4.3333333333333334e-05, 'epoch': 6.67}


 14%|█▍        | 168/1200 [02:54<13:44,  1.25it/s]
 14%|█▍        | 168/1200 [02:58<13:44,  1.25it/s]

{'eval_loss': 1.6914738416671753, 'eval_runtime': 3.7831, 'eval_samples_per_second': 15.86, 'eval_steps_per_second': 2.115, 'epoch': 7.0}


 14%|█▍        | 170/1200 [03:00<29:35,  1.72s/it]

{'loss': 1.8768, 'grad_norm': 58.33460998535156, 'learning_rate': 4.291666666666667e-05, 'epoch': 7.08}


 15%|█▌        | 180/1200 [03:09<15:41,  1.08it/s]

{'loss': 1.1938, 'grad_norm': 31.653409957885742, 'learning_rate': 4.25e-05, 'epoch': 7.5}


 16%|█▌        | 190/1200 [03:18<15:04,  1.12it/s]

{'loss': 1.3344, 'grad_norm': 34.19877243041992, 'learning_rate': 4.208333333333334e-05, 'epoch': 7.92}


 16%|█▌        | 192/1200 [03:20<13:18,  1.26it/s]
 16%|█▌        | 192/1200 [03:24<13:18,  1.26it/s]

{'eval_loss': 1.1058865785598755, 'eval_runtime': 3.7842, 'eval_samples_per_second': 15.855, 'eval_steps_per_second': 2.114, 'epoch': 8.0}


 17%|█▋        | 200/1200 [03:31<16:29,  1.01it/s]

{'loss': 1.2356, 'grad_norm': 289.8751220703125, 'learning_rate': 4.166666666666667e-05, 'epoch': 8.33}


 18%|█▊        | 210/1200 [03:40<14:31,  1.14it/s]

{'loss': 1.1239, 'grad_norm': 33.75627136230469, 'learning_rate': 4.125e-05, 'epoch': 8.75}


 18%|█▊        | 216/1200 [03:45<13:27,  1.22it/s]
 18%|█▊        | 216/1200 [03:49<13:27,  1.22it/s]

{'eval_loss': 1.582410454750061, 'eval_runtime': 3.7379, 'eval_samples_per_second': 16.052, 'eval_steps_per_second': 2.14, 'epoch': 9.0}


 18%|█▊        | 220/1200 [03:53<21:07,  1.29s/it]

{'loss': 1.3976, 'grad_norm': 103.76017761230469, 'learning_rate': 4.0833333333333334e-05, 'epoch': 9.17}


 19%|█▉        | 230/1200 [04:02<14:40,  1.10it/s]

{'loss': 1.3588, 'grad_norm': 41.030303955078125, 'learning_rate': 4.041666666666667e-05, 'epoch': 9.58}


 20%|██        | 240/1200 [04:11<12:41,  1.26it/s]

{'loss': 1.3105, 'grad_norm': 73.9432144165039, 'learning_rate': 4e-05, 'epoch': 10.0}



 20%|██        | 240/1200 [04:14<12:41,  1.26it/s]

{'eval_loss': 1.1790013313293457, 'eval_runtime': 3.8262, 'eval_samples_per_second': 15.682, 'eval_steps_per_second': 2.091, 'epoch': 10.0}


 21%|██        | 250/1200 [04:24<14:53,  1.06it/s]

{'loss': 1.1868, 'grad_norm': 49.53660583496094, 'learning_rate': 3.958333333333333e-05, 'epoch': 10.42}


 22%|██▏       | 260/1200 [04:33<14:00,  1.12it/s]

{'loss': 1.2475, 'grad_norm': 148.08045959472656, 'learning_rate': 3.9166666666666665e-05, 'epoch': 10.83}


 22%|██▏       | 264/1200 [04:36<12:21,  1.26it/s]
 22%|██▏       | 264/1200 [04:40<12:21,  1.26it/s]

{'eval_loss': 1.5642074346542358, 'eval_runtime': 3.7743, 'eval_samples_per_second': 15.897, 'eval_steps_per_second': 2.12, 'epoch': 11.0}


 22%|██▎       | 270/1200 [04:46<17:34,  1.13s/it]

{'loss': 1.3747, 'grad_norm': 68.5721206665039, 'learning_rate': 3.875e-05, 'epoch': 11.25}


 23%|██▎       | 280/1200 [04:55<13:52,  1.10it/s]

{'loss': 1.1718, 'grad_norm': 49.47758483886719, 'learning_rate': 3.8333333333333334e-05, 'epoch': 11.67}


 24%|██▍       | 288/1200 [05:02<12:04,  1.26it/s]
 24%|██▍       | 288/1200 [05:05<12:04,  1.26it/s]

{'eval_loss': 1.0202546119689941, 'eval_runtime': 3.763, 'eval_samples_per_second': 15.945, 'eval_steps_per_second': 2.126, 'epoch': 12.0}


 24%|██▍       | 290/1200 [05:08<26:17,  1.73s/it]

{'loss': 1.1212, 'grad_norm': 1462.5751953125, 'learning_rate': 3.791666666666667e-05, 'epoch': 12.08}


 25%|██▌       | 300/1200 [05:17<13:51,  1.08it/s]

{'loss': 1.1928, 'grad_norm': 38.41757583618164, 'learning_rate': 3.7500000000000003e-05, 'epoch': 12.5}


 26%|██▌       | 310/1200 [05:26<13:21,  1.11it/s]

{'loss': 1.1459, 'grad_norm': 75.25556945800781, 'learning_rate': 3.708333333333334e-05, 'epoch': 12.92}


 26%|██▌       | 312/1200 [05:27<11:44,  1.26it/s]
 26%|██▌       | 312/1200 [05:31<11:44,  1.26it/s]

{'eval_loss': 1.3628878593444824, 'eval_runtime': 3.7731, 'eval_samples_per_second': 15.902, 'eval_steps_per_second': 2.12, 'epoch': 13.0}


 27%|██▋       | 320/1200 [05:38<14:30,  1.01it/s]

{'loss': 1.3022, 'grad_norm': 47.87825012207031, 'learning_rate': 3.6666666666666666e-05, 'epoch': 13.33}


 28%|██▊       | 330/1200 [05:48<13:24,  1.08it/s]

{'loss': 1.0627, 'grad_norm': 34.82111740112305, 'learning_rate': 3.625e-05, 'epoch': 13.75}


 28%|██▊       | 336/1200 [05:53<11:31,  1.25it/s]
 28%|██▊       | 336/1200 [05:56<11:31,  1.25it/s]

{'eval_loss': 1.0767930746078491, 'eval_runtime': 3.8249, 'eval_samples_per_second': 15.687, 'eval_steps_per_second': 2.092, 'epoch': 14.0}


 28%|██▊       | 340/1200 [06:00<18:45,  1.31s/it]

{'loss': 1.0456, 'grad_norm': 96.30834197998047, 'learning_rate': 3.5833333333333335e-05, 'epoch': 14.17}


 29%|██▉       | 350/1200 [06:10<13:04,  1.08it/s]

{'loss': 1.1019, 'grad_norm': 124.7628402709961, 'learning_rate': 3.541666666666667e-05, 'epoch': 14.58}


 30%|███       | 360/1200 [06:18<11:10,  1.25it/s]

{'loss': 1.1152, 'grad_norm': 35.65987014770508, 'learning_rate': 3.5e-05, 'epoch': 15.0}



 30%|███       | 360/1200 [06:22<11:10,  1.25it/s]

{'eval_loss': 1.232647180557251, 'eval_runtime': 3.7494, 'eval_samples_per_second': 16.003, 'eval_steps_per_second': 2.134, 'epoch': 15.0}


 31%|███       | 370/1200 [06:31<13:04,  1.06it/s]

{'loss': 0.9009, 'grad_norm': 77.63841247558594, 'learning_rate': 3.458333333333333e-05, 'epoch': 15.42}


 32%|███▏      | 380/1200 [06:40<12:12,  1.12it/s]

{'loss': 1.1499, 'grad_norm': 63.53948211669922, 'learning_rate': 3.4166666666666666e-05, 'epoch': 15.83}


 32%|███▏      | 384/1200 [06:44<10:52,  1.25it/s]
 32%|███▏      | 384/1200 [06:48<10:52,  1.25it/s]

{'eval_loss': 1.121288776397705, 'eval_runtime': 3.8869, 'eval_samples_per_second': 15.436, 'eval_steps_per_second': 2.058, 'epoch': 16.0}


 32%|███▎      | 390/1200 [06:53<14:59,  1.11s/it]

{'loss': 1.1899, 'grad_norm': 21.60268211364746, 'learning_rate': 3.375000000000001e-05, 'epoch': 16.25}


 33%|███▎      | 400/1200 [07:02<12:06,  1.10it/s]

{'loss': 1.1393, 'grad_norm': 23.56189727783203, 'learning_rate': 3.3333333333333335e-05, 'epoch': 16.67}


 34%|███▍      | 408/1200 [07:09<10:45,  1.23it/s]
 34%|███▍      | 408/1200 [07:13<10:45,  1.23it/s]

{'eval_loss': 1.0791313648223877, 'eval_runtime': 3.8153, 'eval_samples_per_second': 15.726, 'eval_steps_per_second': 2.097, 'epoch': 17.0}


 34%|███▍      | 410/1200 [07:15<23:01,  1.75s/it]

{'loss': 1.0128, 'grad_norm': 77.44168853759766, 'learning_rate': 3.291666666666667e-05, 'epoch': 17.08}


 35%|███▌      | 420/1200 [07:24<11:51,  1.10it/s]

{'loss': 1.2107, 'grad_norm': 42.77567672729492, 'learning_rate': 3.2500000000000004e-05, 'epoch': 17.5}


 36%|███▌      | 430/1200 [07:33<11:07,  1.15it/s]

{'loss': 1.2505, 'grad_norm': 43.199790954589844, 'learning_rate': 3.208333333333334e-05, 'epoch': 17.92}


 36%|███▌      | 432/1200 [07:35<09:57,  1.29it/s]
 36%|███▌      | 432/1200 [07:38<09:57,  1.29it/s]

{'eval_loss': 1.0446035861968994, 'eval_runtime': 3.5342, 'eval_samples_per_second': 16.977, 'eval_steps_per_second': 2.264, 'epoch': 18.0}


 37%|███▋      | 440/1200 [07:46<12:29,  1.01it/s]

{'loss': 1.1098, 'grad_norm': 50.99625778198242, 'learning_rate': 3.1666666666666666e-05, 'epoch': 18.33}


 38%|███▊      | 450/1200 [07:54<10:50,  1.15it/s]

{'loss': 0.9995, 'grad_norm': 44.850990295410156, 'learning_rate': 3.125e-05, 'epoch': 18.75}


 38%|███▊      | 456/1200 [07:59<09:35,  1.29it/s]
 38%|███▊      | 456/1200 [08:03<09:35,  1.29it/s]

{'eval_loss': 1.2175674438476562, 'eval_runtime': 3.5147, 'eval_samples_per_second': 17.071, 'eval_steps_per_second': 2.276, 'epoch': 19.0}


 38%|███▊      | 460/1200 [08:06<15:16,  1.24s/it]

{'loss': 1.0264, 'grad_norm': 72.9687271118164, 'learning_rate': 3.0833333333333335e-05, 'epoch': 19.17}


 39%|███▉      | 470/1200 [08:15<10:41,  1.14it/s]

{'loss': 1.2334, 'grad_norm': 32.71061706542969, 'learning_rate': 3.0416666666666666e-05, 'epoch': 19.58}


 40%|████      | 480/1200 [08:23<09:15,  1.30it/s]

{'loss': 0.9884, 'grad_norm': 130.5026397705078, 'learning_rate': 3e-05, 'epoch': 20.0}



 40%|████      | 480/1200 [08:27<09:15,  1.30it/s]

{'eval_loss': 1.4290308952331543, 'eval_runtime': 3.5277, 'eval_samples_per_second': 17.008, 'eval_steps_per_second': 2.268, 'epoch': 20.0}


 41%|████      | 490/1200 [08:36<10:45,  1.10it/s]

{'loss': 1.4202, 'grad_norm': 74.25851440429688, 'learning_rate': 2.9583333333333335e-05, 'epoch': 20.42}


 42%|████▏     | 500/1200 [08:45<10:32,  1.11it/s]

{'loss': 1.1984, 'grad_norm': 40.761810302734375, 'learning_rate': 2.916666666666667e-05, 'epoch': 20.83}


 42%|████▏     | 504/1200 [08:48<09:05,  1.28it/s]
 42%|████▏     | 504/1200 [08:51<09:05,  1.28it/s]

{'eval_loss': 1.159487247467041, 'eval_runtime': 3.5162, 'eval_samples_per_second': 17.064, 'eval_steps_per_second': 2.275, 'epoch': 21.0}


 42%|████▎     | 510/1200 [08:57<12:04,  1.05s/it]

{'loss': 1.0781, 'grad_norm': 59.62798309326172, 'learning_rate': 2.8749999999999997e-05, 'epoch': 21.25}


 43%|████▎     | 520/1200 [09:06<09:52,  1.15it/s]

{'loss': 1.062, 'grad_norm': 91.477783203125, 'learning_rate': 2.8333333333333335e-05, 'epoch': 21.67}


 44%|████▍     | 528/1200 [09:12<08:42,  1.29it/s]
 44%|████▍     | 528/1200 [09:16<08:42,  1.29it/s]

{'eval_loss': 1.2979165315628052, 'eval_runtime': 3.5919, 'eval_samples_per_second': 16.704, 'eval_steps_per_second': 2.227, 'epoch': 22.0}


 44%|████▍     | 530/1200 [09:18<18:27,  1.65s/it]

{'loss': 1.1972, 'grad_norm': 52.00320053100586, 'learning_rate': 2.791666666666667e-05, 'epoch': 22.08}


 45%|████▌     | 540/1200 [09:27<09:45,  1.13it/s]

{'loss': 0.9079, 'grad_norm': 40.87891387939453, 'learning_rate': 2.7500000000000004e-05, 'epoch': 22.5}


 46%|████▌     | 550/1200 [09:35<09:24,  1.15it/s]

{'loss': 1.2402, 'grad_norm': 41.10169219970703, 'learning_rate': 2.7083333333333332e-05, 'epoch': 22.92}


 46%|████▌     | 552/1200 [09:37<08:21,  1.29it/s]
 46%|████▌     | 552/1200 [09:40<08:21,  1.29it/s]

{'eval_loss': 1.126554012298584, 'eval_runtime': 3.5211, 'eval_samples_per_second': 17.04, 'eval_steps_per_second': 2.272, 'epoch': 23.0}


 47%|████▋     | 560/1200 [09:48<10:12,  1.05it/s]

{'loss': 1.062, 'grad_norm': 72.33210754394531, 'learning_rate': 2.6666666666666667e-05, 'epoch': 23.33}


 48%|████▊     | 570/1200 [09:56<09:03,  1.16it/s]

{'loss': 1.0246, 'grad_norm': 32.71592712402344, 'learning_rate': 2.625e-05, 'epoch': 23.75}


 48%|████▊     | 576/1200 [10:01<07:58,  1.30it/s]
 48%|████▊     | 576/1200 [10:05<07:58,  1.30it/s]

{'eval_loss': 1.189422607421875, 'eval_runtime': 3.5776, 'eval_samples_per_second': 16.771, 'eval_steps_per_second': 2.236, 'epoch': 24.0}


 48%|████▊     | 580/1200 [10:09<12:58,  1.26s/it]

{'loss': 1.0386, 'grad_norm': 274.7596435546875, 'learning_rate': 2.5833333333333336e-05, 'epoch': 24.17}


 49%|████▉     | 590/1200 [10:17<08:55,  1.14it/s]

{'loss': 0.953, 'grad_norm': 54.15077590942383, 'learning_rate': 2.5416666666666667e-05, 'epoch': 24.58}


 50%|█████     | 600/1200 [10:26<07:41,  1.30it/s]

{'loss': 0.9962, 'grad_norm': 41.23442459106445, 'learning_rate': 2.5e-05, 'epoch': 25.0}



 50%|█████     | 600/1200 [10:29<07:41,  1.30it/s]

{'eval_loss': 1.065224051475525, 'eval_runtime': 3.5406, 'eval_samples_per_second': 16.946, 'eval_steps_per_second': 2.26, 'epoch': 25.0}


 51%|█████     | 610/1200 [10:38<08:54,  1.10it/s]

{'loss': 0.9836, 'grad_norm': 29.271028518676758, 'learning_rate': 2.4583333333333332e-05, 'epoch': 25.42}


 52%|█████▏    | 620/1200 [10:47<08:21,  1.16it/s]

{'loss': 1.0723, 'grad_norm': 49.55335998535156, 'learning_rate': 2.4166666666666667e-05, 'epoch': 25.83}


 52%|█████▏    | 624/1200 [10:50<07:24,  1.30it/s]
 52%|█████▏    | 624/1200 [10:53<07:24,  1.30it/s]

{'eval_loss': 1.074468731880188, 'eval_runtime': 3.5372, 'eval_samples_per_second': 16.962, 'eval_steps_per_second': 2.262, 'epoch': 26.0}


 52%|█████▎    | 630/1200 [10:59<09:57,  1.05s/it]

{'loss': 0.9162, 'grad_norm': 92.66167449951172, 'learning_rate': 2.375e-05, 'epoch': 26.25}


 53%|█████▎    | 640/1200 [11:08<08:18,  1.12it/s]

{'loss': 1.2068, 'grad_norm': 31.373069763183594, 'learning_rate': 2.3333333333333336e-05, 'epoch': 26.67}


 54%|█████▍    | 648/1200 [11:14<07:11,  1.28it/s]
 54%|█████▍    | 648/1200 [11:18<07:11,  1.28it/s]

{'eval_loss': 1.1530238389968872, 'eval_runtime': 3.5997, 'eval_samples_per_second': 16.668, 'eval_steps_per_second': 2.222, 'epoch': 27.0}


 54%|█████▍    | 650/1200 [11:20<15:07,  1.65s/it]

{'loss': 1.3636, 'grad_norm': 95.0937728881836, 'learning_rate': 2.2916666666666667e-05, 'epoch': 27.08}


 55%|█████▌    | 660/1200 [11:29<07:59,  1.13it/s]

{'loss': 0.9477, 'grad_norm': 80.69564056396484, 'learning_rate': 2.25e-05, 'epoch': 27.5}


 56%|█████▌    | 670/1200 [11:37<07:43,  1.14it/s]

{'loss': 0.9947, 'grad_norm': 71.70631408691406, 'learning_rate': 2.2083333333333333e-05, 'epoch': 27.92}


 56%|█████▌    | 672/1200 [11:39<06:49,  1.29it/s]
 56%|█████▌    | 672/1200 [11:42<06:49,  1.29it/s]

{'eval_loss': 1.0721880197525024, 'eval_runtime': 3.5431, 'eval_samples_per_second': 16.934, 'eval_steps_per_second': 2.258, 'epoch': 28.0}


 57%|█████▋    | 680/1200 [11:50<08:21,  1.04it/s]

{'loss': 1.0254, 'grad_norm': 43.105587005615234, 'learning_rate': 2.1666666666666667e-05, 'epoch': 28.33}


 57%|█████▊    | 690/1200 [11:58<07:24,  1.15it/s]

{'loss': 0.937, 'grad_norm': 27.502565383911133, 'learning_rate': 2.125e-05, 'epoch': 28.75}


 58%|█████▊    | 696/1200 [12:03<06:30,  1.29it/s]
 58%|█████▊    | 696/1200 [12:07<06:30,  1.29it/s]

{'eval_loss': 1.0843676328659058, 'eval_runtime': 3.531, 'eval_samples_per_second': 16.992, 'eval_steps_per_second': 2.266, 'epoch': 29.0}


 58%|█████▊    | 700/1200 [12:11<10:21,  1.24s/it]

{'loss': 0.849, 'grad_norm': 23.72869873046875, 'learning_rate': 2.0833333333333336e-05, 'epoch': 29.17}


 59%|█████▉    | 710/1200 [12:19<07:13,  1.13it/s]

{'loss': 0.9237, 'grad_norm': 29.20526695251465, 'learning_rate': 2.0416666666666667e-05, 'epoch': 29.58}


 60%|██████    | 720/1200 [12:28<06:11,  1.29it/s]

{'loss': 0.9826, 'grad_norm': 26.874706268310547, 'learning_rate': 2e-05, 'epoch': 30.0}



 60%|██████    | 720/1200 [12:31<06:11,  1.29it/s]

{'eval_loss': 1.026708722114563, 'eval_runtime': 3.5197, 'eval_samples_per_second': 17.047, 'eval_steps_per_second': 2.273, 'epoch': 30.0}


 61%|██████    | 730/1200 [12:40<07:07,  1.10it/s]

{'loss': 0.9106, 'grad_norm': 39.04307556152344, 'learning_rate': 1.9583333333333333e-05, 'epoch': 30.42}


 62%|██████▏   | 740/1200 [12:49<06:44,  1.14it/s]

{'loss': 0.7689, 'grad_norm': 35.24350357055664, 'learning_rate': 1.9166666666666667e-05, 'epoch': 30.83}


 62%|██████▏   | 744/1200 [12:52<05:53,  1.29it/s]
 62%|██████▏   | 744/1200 [12:56<05:53,  1.29it/s]

{'eval_loss': 1.0485291481018066, 'eval_runtime': 3.5216, 'eval_samples_per_second': 17.038, 'eval_steps_per_second': 2.272, 'epoch': 31.0}


 62%|██████▎   | 750/1200 [13:01<07:51,  1.05s/it]

{'loss': 0.8931, 'grad_norm': 25.048742294311523, 'learning_rate': 1.8750000000000002e-05, 'epoch': 31.25}


 63%|██████▎   | 760/1200 [13:10<06:27,  1.13it/s]

{'loss': 0.9527, 'grad_norm': 55.01213455200195, 'learning_rate': 1.8333333333333333e-05, 'epoch': 31.67}


 64%|██████▍   | 768/1200 [13:17<05:35,  1.29it/s]
 64%|██████▍   | 768/1200 [13:20<05:35,  1.29it/s]

{'eval_loss': 1.08983314037323, 'eval_runtime': 3.532, 'eval_samples_per_second': 16.988, 'eval_steps_per_second': 2.265, 'epoch': 32.0}


 64%|██████▍   | 770/1200 [13:23<11:49,  1.65s/it]

{'loss': 0.885, 'grad_norm': 56.56557846069336, 'learning_rate': 1.7916666666666667e-05, 'epoch': 32.08}


 65%|██████▌   | 780/1200 [13:31<06:13,  1.13it/s]

{'loss': 0.8931, 'grad_norm': 40.99022674560547, 'learning_rate': 1.75e-05, 'epoch': 32.5}


 66%|██████▌   | 790/1200 [13:40<05:55,  1.15it/s]

{'loss': 0.9176, 'grad_norm': 43.00864028930664, 'learning_rate': 1.7083333333333333e-05, 'epoch': 32.92}


 66%|██████▌   | 792/1200 [13:41<05:14,  1.30it/s]
 66%|██████▌   | 792/1200 [13:45<05:14,  1.30it/s]

{'eval_loss': 1.0722582340240479, 'eval_runtime': 3.6648, 'eval_samples_per_second': 16.372, 'eval_steps_per_second': 2.183, 'epoch': 33.0}


 67%|██████▋   | 800/1200 [13:52<06:23,  1.04it/s]

{'loss': 0.8362, 'grad_norm': 30.473554611206055, 'learning_rate': 1.6666666666666667e-05, 'epoch': 33.33}


 68%|██████▊   | 810/1200 [14:01<05:37,  1.16it/s]

{'loss': 0.7811, 'grad_norm': 35.4805793762207, 'learning_rate': 1.6250000000000002e-05, 'epoch': 33.75}


 68%|██████▊   | 816/1200 [14:06<04:56,  1.29it/s]
 68%|██████▊   | 816/1200 [14:10<04:56,  1.29it/s]

{'eval_loss': 1.0472785234451294, 'eval_runtime': 3.5532, 'eval_samples_per_second': 16.886, 'eval_steps_per_second': 2.252, 'epoch': 34.0}


 68%|██████▊   | 820/1200 [14:13<07:55,  1.25s/it]

{'loss': 0.811, 'grad_norm': 25.1611270904541, 'learning_rate': 1.5833333333333333e-05, 'epoch': 34.17}


 69%|██████▉   | 830/1200 [14:22<05:24,  1.14it/s]

{'loss': 0.7937, 'grad_norm': 28.170846939086914, 'learning_rate': 1.5416666666666668e-05, 'epoch': 34.58}


 70%|███████   | 840/1200 [14:30<04:42,  1.28it/s]

{'loss': 0.7526, 'grad_norm': 294.2026672363281, 'learning_rate': 1.5e-05, 'epoch': 35.0}



 70%|███████   | 840/1200 [14:34<04:42,  1.28it/s]

{'eval_loss': 1.1136780977249146, 'eval_runtime': 3.5586, 'eval_samples_per_second': 16.86, 'eval_steps_per_second': 2.248, 'epoch': 35.0}


 71%|███████   | 850/1200 [14:43<05:19,  1.10it/s]

{'loss': 0.8113, 'grad_norm': 25.350996017456055, 'learning_rate': 1.4583333333333335e-05, 'epoch': 35.42}


 72%|███████▏  | 860/1200 [14:52<04:55,  1.15it/s]

{'loss': 0.8372, 'grad_norm': 40.88930130004883, 'learning_rate': 1.4166666666666668e-05, 'epoch': 35.83}


 72%|███████▏  | 864/1200 [14:55<04:19,  1.29it/s]
 72%|███████▏  | 864/1200 [14:59<04:19,  1.29it/s]

{'eval_loss': 1.0446101427078247, 'eval_runtime': 3.5254, 'eval_samples_per_second': 17.019, 'eval_steps_per_second': 2.269, 'epoch': 36.0}


 72%|███████▎  | 870/1200 [15:04<05:47,  1.05s/it]

{'loss': 0.7769, 'grad_norm': 37.21027374267578, 'learning_rate': 1.3750000000000002e-05, 'epoch': 36.25}


 73%|███████▎  | 880/1200 [15:13<04:39,  1.14it/s]

{'loss': 0.7702, 'grad_norm': 49.16093826293945, 'learning_rate': 1.3333333333333333e-05, 'epoch': 36.67}


 74%|███████▍  | 888/1200 [15:20<04:04,  1.28it/s]
 74%|███████▍  | 888/1200 [15:23<04:04,  1.28it/s]

{'eval_loss': 0.9777565002441406, 'eval_runtime': 3.5568, 'eval_samples_per_second': 16.869, 'eval_steps_per_second': 2.249, 'epoch': 37.0}


 74%|███████▍  | 890/1200 [15:25<08:31,  1.65s/it]

{'loss': 0.7372, 'grad_norm': 82.27398681640625, 'learning_rate': 1.2916666666666668e-05, 'epoch': 37.08}


 75%|███████▌  | 900/1200 [15:34<04:29,  1.11it/s]

{'loss': 0.8721, 'grad_norm': 29.210054397583008, 'learning_rate': 1.25e-05, 'epoch': 37.5}


 76%|███████▌  | 910/1200 [15:43<04:13,  1.14it/s]

{'loss': 0.7429, 'grad_norm': 20.201202392578125, 'learning_rate': 1.2083333333333333e-05, 'epoch': 37.92}


 76%|███████▌  | 912/1200 [15:44<03:46,  1.27it/s]
 76%|███████▌  | 912/1200 [15:48<03:46,  1.27it/s]

{'eval_loss': 0.9993676543235779, 'eval_runtime': 3.625, 'eval_samples_per_second': 16.552, 'eval_steps_per_second': 2.207, 'epoch': 38.0}


 77%|███████▋  | 920/1200 [15:55<04:29,  1.04it/s]

{'loss': 0.731, 'grad_norm': 17.710617065429688, 'learning_rate': 1.1666666666666668e-05, 'epoch': 38.33}


 78%|███████▊  | 930/1200 [16:04<03:55,  1.15it/s]

{'loss': 0.7373, 'grad_norm': 29.547321319580078, 'learning_rate': 1.125e-05, 'epoch': 38.75}


 78%|███████▊  | 936/1200 [16:09<03:25,  1.29it/s]
 78%|███████▊  | 936/1200 [16:12<03:25,  1.29it/s]

{'eval_loss': 1.0033679008483887, 'eval_runtime': 3.5766, 'eval_samples_per_second': 16.776, 'eval_steps_per_second': 2.237, 'epoch': 39.0}


 78%|███████▊  | 940/1200 [16:16<05:26,  1.26s/it]

{'loss': 0.7069, 'grad_norm': 27.94841957092285, 'learning_rate': 1.0833333333333334e-05, 'epoch': 39.17}


 79%|███████▉  | 950/1200 [16:25<03:40,  1.14it/s]

{'loss': 0.7038, 'grad_norm': 18.391376495361328, 'learning_rate': 1.0416666666666668e-05, 'epoch': 39.58}


 80%|████████  | 960/1200 [16:33<03:06,  1.29it/s]

{'loss': 0.8034, 'grad_norm': 22.283344268798828, 'learning_rate': 1e-05, 'epoch': 40.0}



 80%|████████  | 960/1200 [16:37<03:06,  1.29it/s]

{'eval_loss': 0.9987432956695557, 'eval_runtime': 3.5836, 'eval_samples_per_second': 16.743, 'eval_steps_per_second': 2.232, 'epoch': 40.0}


 81%|████████  | 970/1200 [16:46<03:37,  1.06it/s]

{'loss': 0.7721, 'grad_norm': 24.877408981323242, 'learning_rate': 9.583333333333334e-06, 'epoch': 40.42}


 82%|████████▏ | 980/1200 [16:55<03:18,  1.11it/s]

{'loss': 0.6379, 'grad_norm': 44.54510498046875, 'learning_rate': 9.166666666666666e-06, 'epoch': 40.83}


 82%|████████▏ | 984/1200 [16:59<02:51,  1.26it/s]
 82%|████████▏ | 984/1200 [17:02<02:51,  1.26it/s]

{'eval_loss': 0.9578483700752258, 'eval_runtime': 3.7901, 'eval_samples_per_second': 15.831, 'eval_steps_per_second': 2.111, 'epoch': 41.0}


 82%|████████▎ | 990/1200 [17:08<03:51,  1.10s/it]

{'loss': 0.7899, 'grad_norm': 23.80815887451172, 'learning_rate': 8.75e-06, 'epoch': 41.25}


 83%|████████▎ | 1000/1200 [17:17<03:01,  1.10it/s]

{'loss': 0.6601, 'grad_norm': 22.040721893310547, 'learning_rate': 8.333333333333334e-06, 'epoch': 41.67}


 84%|████████▍ | 1008/1200 [17:24<02:33,  1.25it/s]
 84%|████████▍ | 1008/1200 [17:28<02:33,  1.25it/s]

{'eval_loss': 0.9983320236206055, 'eval_runtime': 3.8257, 'eval_samples_per_second': 15.683, 'eval_steps_per_second': 2.091, 'epoch': 42.0}


 84%|████████▍ | 1010/1200 [17:30<05:31,  1.74s/it]

{'loss': 0.6576, 'grad_norm': 36.2408447265625, 'learning_rate': 7.916666666666667e-06, 'epoch': 42.08}


 85%|████████▌ | 1020/1200 [17:39<02:47,  1.07it/s]

{'loss': 0.7252, 'grad_norm': 28.938495635986328, 'learning_rate': 7.5e-06, 'epoch': 42.5}


 86%|████████▌ | 1030/1200 [17:48<02:36,  1.09it/s]

{'loss': 0.6803, 'grad_norm': 24.344425201416016, 'learning_rate': 7.083333333333334e-06, 'epoch': 42.92}


 86%|████████▌ | 1032/1200 [17:50<02:15,  1.24it/s]
 86%|████████▌ | 1032/1200 [17:54<02:15,  1.24it/s]

{'eval_loss': 1.002239465713501, 'eval_runtime': 3.7893, 'eval_samples_per_second': 15.834, 'eval_steps_per_second': 2.111, 'epoch': 43.0}


 87%|████████▋ | 1040/1200 [18:01<02:40,  1.01s/it]

{'loss': 0.7431, 'grad_norm': 23.992528915405273, 'learning_rate': 6.666666666666667e-06, 'epoch': 43.33}


 88%|████████▊ | 1050/1200 [18:10<02:16,  1.10it/s]

{'loss': 0.6406, 'grad_norm': 23.802204132080078, 'learning_rate': 6.25e-06, 'epoch': 43.75}


 88%|████████▊ | 1056/1200 [18:16<01:55,  1.25it/s]
 88%|████████▊ | 1056/1200 [18:19<01:55,  1.25it/s]

{'eval_loss': 1.0011521577835083, 'eval_runtime': 3.815, 'eval_samples_per_second': 15.728, 'eval_steps_per_second': 2.097, 'epoch': 44.0}


 88%|████████▊ | 1060/1200 [18:23<03:03,  1.31s/it]

{'loss': 0.5978, 'grad_norm': 23.514602661132812, 'learning_rate': 5.833333333333334e-06, 'epoch': 44.17}


 89%|████████▉ | 1070/1200 [18:32<01:59,  1.09it/s]

{'loss': 0.734, 'grad_norm': 25.96381187438965, 'learning_rate': 5.416666666666667e-06, 'epoch': 44.58}


 90%|█████████ | 1080/1200 [18:41<01:35,  1.25it/s]

{'loss': 0.6861, 'grad_norm': 21.885498046875, 'learning_rate': 5e-06, 'epoch': 45.0}



 90%|█████████ | 1080/1200 [18:45<01:35,  1.25it/s]

{'eval_loss': 0.9953131079673767, 'eval_runtime': 3.9628, 'eval_samples_per_second': 15.141, 'eval_steps_per_second': 2.019, 'epoch': 45.0}


 91%|█████████ | 1090/1200 [18:54<01:41,  1.09it/s]

{'loss': 0.627, 'grad_norm': 31.566144943237305, 'learning_rate': 4.583333333333333e-06, 'epoch': 45.42}


 92%|█████████▏| 1100/1200 [19:03<01:26,  1.15it/s]

{'loss': 0.6488, 'grad_norm': 61.0053596496582, 'learning_rate': 4.166666666666667e-06, 'epoch': 45.83}


 92%|█████████▏| 1104/1200 [19:06<01:14,  1.29it/s]
 92%|█████████▏| 1104/1200 [19:10<01:14,  1.29it/s]

{'eval_loss': 1.0441704988479614, 'eval_runtime': 3.6257, 'eval_samples_per_second': 16.549, 'eval_steps_per_second': 2.206, 'epoch': 46.0}


 92%|█████████▎| 1110/1200 [19:15<01:36,  1.08s/it]

{'loss': 0.6844, 'grad_norm': 39.24168395996094, 'learning_rate': 3.75e-06, 'epoch': 46.25}


 93%|█████████▎| 1120/1200 [19:24<01:10,  1.13it/s]

{'loss': 0.6941, 'grad_norm': 26.574764251708984, 'learning_rate': 3.3333333333333333e-06, 'epoch': 46.67}


 94%|█████████▍| 1128/1200 [19:31<00:55,  1.29it/s]
 94%|█████████▍| 1128/1200 [19:34<00:55,  1.29it/s]

{'eval_loss': 1.0467650890350342, 'eval_runtime': 3.6206, 'eval_samples_per_second': 16.572, 'eval_steps_per_second': 2.21, 'epoch': 47.0}


 94%|█████████▍| 1130/1200 [19:37<01:56,  1.66s/it]

{'loss': 0.6164, 'grad_norm': 109.46661376953125, 'learning_rate': 2.916666666666667e-06, 'epoch': 47.08}


 95%|█████████▌| 1140/1200 [19:45<00:54,  1.11it/s]

{'loss': 0.5928, 'grad_norm': 22.058425903320312, 'learning_rate': 2.5e-06, 'epoch': 47.5}


 96%|█████████▌| 1150/1200 [19:54<00:43,  1.15it/s]

{'loss': 0.7019, 'grad_norm': 54.915313720703125, 'learning_rate': 2.0833333333333334e-06, 'epoch': 47.92}


 96%|█████████▌| 1152/1200 [19:55<00:37,  1.29it/s]
 96%|█████████▌| 1152/1200 [19:59<00:37,  1.29it/s]

{'eval_loss': 1.0479233264923096, 'eval_runtime': 3.5428, 'eval_samples_per_second': 16.936, 'eval_steps_per_second': 2.258, 'epoch': 48.0}


 97%|█████████▋| 1160/1200 [20:06<00:38,  1.04it/s]

{'loss': 0.6681, 'grad_norm': 85.62725830078125, 'learning_rate': 1.6666666666666667e-06, 'epoch': 48.33}


 98%|█████████▊| 1170/1200 [20:15<00:26,  1.15it/s]

{'loss': 0.6025, 'grad_norm': 47.91534423828125, 'learning_rate': 1.25e-06, 'epoch': 48.75}


 98%|█████████▊| 1176/1200 [20:20<00:18,  1.29it/s]
 98%|█████████▊| 1176/1200 [20:24<00:18,  1.29it/s]

{'eval_loss': 1.0196349620819092, 'eval_runtime': 3.5456, 'eval_samples_per_second': 16.922, 'eval_steps_per_second': 2.256, 'epoch': 49.0}


 98%|█████████▊| 1180/1200 [20:27<00:24,  1.25s/it]

{'loss': 0.6495, 'grad_norm': 32.56723403930664, 'learning_rate': 8.333333333333333e-07, 'epoch': 49.17}


 99%|█████████▉| 1190/1200 [20:36<00:08,  1.13it/s]

{'loss': 0.69, 'grad_norm': 21.667678833007812, 'learning_rate': 4.1666666666666667e-07, 'epoch': 49.58}


100%|██████████| 1200/1200 [20:45<00:00,  1.28it/s]

{'loss': 0.5918, 'grad_norm': 127.974609375, 'learning_rate': 0.0, 'epoch': 50.0}



100%|██████████| 1200/1200 [20:48<00:00,  1.04s/it]


{'eval_loss': 1.0189175605773926, 'eval_runtime': 3.5332, 'eval_samples_per_second': 16.982, 'eval_steps_per_second': 2.264, 'epoch': 50.0}
{'train_runtime': 1248.714, 'train_samples_per_second': 9.61, 'train_steps_per_second': 0.961, 'train_loss': 1.0364275976022084, 'epoch': 50.0}


100%|██████████| 8/8 [00:02<00:00,  2.89it/s]
Some weights of YolosForObjectDetection were not initialized from the model checkpoint at hustvl/yolos-tiny and are newly initialized because the shapes did not match:
- class_labels_classifier.layers.2.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([7]) in the model instantiated
- class_labels_classifier.layers.2.weight: found shape torch.Size([92, 192]) in the checkpoint and torch.Size([7, 192]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda:0


  3%|▎         | 10/300 [00:11<05:27,  1.13s/it]

{'loss': 3.9338, 'grad_norm': 38.586578369140625, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.67}


  5%|▌         | 15/300 [00:16<04:52,  1.03s/it]
  5%|▌         | 15/300 [00:20<04:52,  1.03s/it]

{'eval_loss': 3.0849671363830566, 'eval_runtime': 3.3998, 'eval_samples_per_second': 17.648, 'eval_steps_per_second': 2.353, 'epoch': 1.0}


  7%|▋         | 20/300 [00:26<06:27,  1.38s/it]

{'loss': 2.8928, 'grad_norm': 33.2906379699707, 'learning_rate': 4.666666666666667e-05, 'epoch': 1.33}


 10%|█         | 30/300 [00:37<04:39,  1.04s/it]

{'loss': 2.4341, 'grad_norm': 34.45074462890625, 'learning_rate': 4.5e-05, 'epoch': 2.0}



 10%|█         | 30/300 [00:40<04:39,  1.04s/it]

{'eval_loss': 2.7149100303649902, 'eval_runtime': 3.4684, 'eval_samples_per_second': 17.299, 'eval_steps_per_second': 2.307, 'epoch': 2.0}


 13%|█▎        | 40/300 [00:52<05:10,  1.20s/it]

{'loss': 2.1282, 'grad_norm': 55.5739860534668, 'learning_rate': 4.3333333333333334e-05, 'epoch': 2.67}


 15%|█▌        | 45/300 [00:58<04:29,  1.06s/it]
 15%|█▌        | 45/300 [01:01<04:29,  1.06s/it]

{'eval_loss': 2.685706377029419, 'eval_runtime': 3.5732, 'eval_samples_per_second': 16.792, 'eval_steps_per_second': 2.239, 'epoch': 3.0}


 17%|█▋        | 50/300 [01:07<05:51,  1.41s/it]

{'loss': 2.0415, 'grad_norm': 28.720773696899414, 'learning_rate': 4.166666666666667e-05, 'epoch': 3.33}


 20%|██        | 60/300 [01:18<04:06,  1.03s/it]

{'loss': 1.923, 'grad_norm': 29.62256622314453, 'learning_rate': 4e-05, 'epoch': 4.0}



 20%|██        | 60/300 [01:22<04:06,  1.03s/it]

{'eval_loss': 2.556957244873047, 'eval_runtime': 3.4716, 'eval_samples_per_second': 17.283, 'eval_steps_per_second': 2.304, 'epoch': 4.0}


 23%|██▎       | 70/300 [01:33<04:31,  1.18s/it]

{'loss': 1.8915, 'grad_norm': 19.237747192382812, 'learning_rate': 3.8333333333333334e-05, 'epoch': 4.67}


 25%|██▌       | 75/300 [01:39<03:50,  1.02s/it]
 25%|██▌       | 75/300 [01:42<03:50,  1.02s/it]

{'eval_loss': 2.748621940612793, 'eval_runtime': 3.2569, 'eval_samples_per_second': 18.422, 'eval_steps_per_second': 2.456, 'epoch': 5.0}


 27%|██▋       | 80/300 [01:48<04:58,  1.35s/it]

{'loss': 1.7652, 'grad_norm': 29.787981033325195, 'learning_rate': 3.6666666666666666e-05, 'epoch': 5.33}


 30%|███       | 90/300 [01:59<03:39,  1.04s/it]

{'loss': 1.7946, 'grad_norm': 24.598100662231445, 'learning_rate': 3.5e-05, 'epoch': 6.0}



 30%|███       | 90/300 [02:02<03:39,  1.04s/it]

{'eval_loss': 2.392141819000244, 'eval_runtime': 3.6665, 'eval_samples_per_second': 16.364, 'eval_steps_per_second': 2.182, 'epoch': 6.0}


 33%|███▎      | 100/300 [02:14<04:02,  1.21s/it]

{'loss': 1.6127, 'grad_norm': 23.837331771850586, 'learning_rate': 3.3333333333333335e-05, 'epoch': 6.67}


 35%|███▌      | 105/300 [02:20<03:28,  1.07s/it]
 35%|███▌      | 105/300 [02:23<03:28,  1.07s/it]

{'eval_loss': 2.4355309009552, 'eval_runtime': 3.5078, 'eval_samples_per_second': 17.105, 'eval_steps_per_second': 2.281, 'epoch': 7.0}


 37%|███▋      | 110/300 [02:30<04:32,  1.43s/it]

{'loss': 1.6554, 'grad_norm': 23.344520568847656, 'learning_rate': 3.1666666666666666e-05, 'epoch': 7.33}


 40%|████      | 120/300 [02:41<03:10,  1.06s/it]

{'loss': 1.6264, 'grad_norm': 28.85584259033203, 'learning_rate': 3e-05, 'epoch': 8.0}



 40%|████      | 120/300 [02:45<03:10,  1.06s/it]

{'eval_loss': 2.4212634563446045, 'eval_runtime': 3.6678, 'eval_samples_per_second': 16.358, 'eval_steps_per_second': 2.181, 'epoch': 8.0}


 43%|████▎     | 130/300 [02:57<03:29,  1.23s/it]

{'loss': 1.5359, 'grad_norm': 22.50617027282715, 'learning_rate': 2.8333333333333335e-05, 'epoch': 8.67}


 45%|████▌     | 135/300 [03:02<02:54,  1.06s/it]
 45%|████▌     | 135/300 [03:06<02:54,  1.06s/it]

{'eval_loss': 2.3524701595306396, 'eval_runtime': 3.6546, 'eval_samples_per_second': 16.418, 'eval_steps_per_second': 2.189, 'epoch': 9.0}


 47%|████▋     | 140/300 [03:12<03:49,  1.44s/it]

{'loss': 1.4463, 'grad_norm': 20.297603607177734, 'learning_rate': 2.6666666666666667e-05, 'epoch': 9.33}


 50%|█████     | 150/300 [03:23<02:37,  1.05s/it]

{'loss': 1.3847, 'grad_norm': 17.6071834564209, 'learning_rate': 2.5e-05, 'epoch': 10.0}



 50%|█████     | 150/300 [03:27<02:37,  1.05s/it]

{'eval_loss': 2.3315389156341553, 'eval_runtime': 3.5878, 'eval_samples_per_second': 16.723, 'eval_steps_per_second': 2.23, 'epoch': 10.0}


 53%|█████▎    | 160/300 [03:39<02:48,  1.21s/it]

{'loss': 1.3178, 'grad_norm': 26.763303756713867, 'learning_rate': 2.3333333333333336e-05, 'epoch': 10.67}


 55%|█████▌    | 165/300 [03:44<02:22,  1.05s/it]
 55%|█████▌    | 165/300 [03:48<02:22,  1.05s/it]

{'eval_loss': 2.308208465576172, 'eval_runtime': 3.5868, 'eval_samples_per_second': 16.728, 'eval_steps_per_second': 2.23, 'epoch': 11.0}


 57%|█████▋    | 170/300 [03:54<03:06,  1.44s/it]

{'loss': 1.3109, 'grad_norm': 22.018539428710938, 'learning_rate': 2.1666666666666667e-05, 'epoch': 11.33}


 60%|██████    | 180/300 [04:05<02:07,  1.06s/it]

{'loss': 1.3088, 'grad_norm': 23.243303298950195, 'learning_rate': 2e-05, 'epoch': 12.0}



 60%|██████    | 180/300 [04:09<02:07,  1.06s/it]

{'eval_loss': 2.3522069454193115, 'eval_runtime': 3.6748, 'eval_samples_per_second': 16.327, 'eval_steps_per_second': 2.177, 'epoch': 12.0}


 63%|██████▎   | 190/300 [04:21<02:13,  1.21s/it]

{'loss': 1.1803, 'grad_norm': 24.41699981689453, 'learning_rate': 1.8333333333333333e-05, 'epoch': 12.67}


 65%|██████▌   | 195/300 [04:27<01:51,  1.06s/it]
 65%|██████▌   | 195/300 [04:30<01:51,  1.06s/it]

{'eval_loss': 2.329010009765625, 'eval_runtime': 3.6908, 'eval_samples_per_second': 16.256, 'eval_steps_per_second': 2.168, 'epoch': 13.0}


 67%|██████▋   | 200/300 [04:37<02:24,  1.45s/it]

{'loss': 1.1804, 'grad_norm': 27.442096710205078, 'learning_rate': 1.6666666666666667e-05, 'epoch': 13.33}


 70%|███████   | 210/300 [04:48<01:34,  1.05s/it]

{'loss': 1.2051, 'grad_norm': 27.331449508666992, 'learning_rate': 1.5e-05, 'epoch': 14.0}



 70%|███████   | 210/300 [04:52<01:34,  1.05s/it]

{'eval_loss': 2.330258369445801, 'eval_runtime': 3.8029, 'eval_samples_per_second': 15.778, 'eval_steps_per_second': 2.104, 'epoch': 14.0}


 73%|███████▎  | 220/300 [05:04<01:37,  1.21s/it]

{'loss': 1.0846, 'grad_norm': 30.261486053466797, 'learning_rate': 1.3333333333333333e-05, 'epoch': 14.67}


 75%|███████▌  | 225/300 [05:09<01:19,  1.06s/it]
 75%|███████▌  | 225/300 [05:13<01:19,  1.06s/it]

{'eval_loss': 2.3028039932250977, 'eval_runtime': 3.7108, 'eval_samples_per_second': 16.169, 'eval_steps_per_second': 2.156, 'epoch': 15.0}


 77%|███████▋  | 230/300 [05:19<01:41,  1.45s/it]

{'loss': 1.127, 'grad_norm': 27.001163482666016, 'learning_rate': 1.1666666666666668e-05, 'epoch': 15.33}


 80%|████████  | 240/300 [05:30<01:03,  1.06s/it]

{'loss': 1.0412, 'grad_norm': 16.748788833618164, 'learning_rate': 1e-05, 'epoch': 16.0}



 80%|████████  | 240/300 [05:34<01:03,  1.06s/it]

{'eval_loss': 2.2606089115142822, 'eval_runtime': 3.599, 'eval_samples_per_second': 16.671, 'eval_steps_per_second': 2.223, 'epoch': 16.0}


 83%|████████▎ | 250/300 [05:46<01:00,  1.20s/it]

{'loss': 1.0357, 'grad_norm': 17.2320556640625, 'learning_rate': 8.333333333333334e-06, 'epoch': 16.67}


 85%|████████▌ | 255/300 [05:51<00:47,  1.06s/it]
 85%|████████▌ | 255/300 [05:55<00:47,  1.06s/it]

{'eval_loss': 2.329871654510498, 'eval_runtime': 3.4638, 'eval_samples_per_second': 17.322, 'eval_steps_per_second': 2.31, 'epoch': 17.0}


 87%|████████▋ | 260/300 [06:01<00:55,  1.40s/it]

{'loss': 0.979, 'grad_norm': 15.01718521118164, 'learning_rate': 6.666666666666667e-06, 'epoch': 17.33}


 90%|█████████ | 270/300 [06:12<00:30,  1.01s/it]

{'loss': 1.0242, 'grad_norm': 18.075618743896484, 'learning_rate': 5e-06, 'epoch': 18.0}



 90%|█████████ | 270/300 [06:15<00:30,  1.01s/it]

{'eval_loss': 2.2740824222564697, 'eval_runtime': 3.2959, 'eval_samples_per_second': 18.205, 'eval_steps_per_second': 2.427, 'epoch': 18.0}


 93%|█████████▎| 280/300 [06:26<00:23,  1.16s/it]

{'loss': 0.9741, 'grad_norm': 20.32126235961914, 'learning_rate': 3.3333333333333333e-06, 'epoch': 18.67}


 95%|█████████▌| 285/300 [06:32<00:15,  1.02s/it]
 95%|█████████▌| 285/300 [06:35<00:15,  1.02s/it]

{'eval_loss': 2.2602248191833496, 'eval_runtime': 3.1773, 'eval_samples_per_second': 18.884, 'eval_steps_per_second': 2.518, 'epoch': 19.0}


 97%|█████████▋| 290/300 [06:41<00:13,  1.33s/it]

{'loss': 0.9733, 'grad_norm': 14.603944778442383, 'learning_rate': 1.6666666666666667e-06, 'epoch': 19.33}


100%|██████████| 300/300 [06:51<00:00,  1.01s/it]

{'loss': 0.895, 'grad_norm': 19.150976181030273, 'learning_rate': 0.0, 'epoch': 20.0}



100%|██████████| 300/300 [06:55<00:00,  1.38s/it]


{'eval_loss': 2.261028289794922, 'eval_runtime': 3.2103, 'eval_samples_per_second': 18.69, 'eval_steps_per_second': 2.492, 'epoch': 20.0}
{'train_runtime': 415.0485, 'train_samples_per_second': 7.228, 'train_steps_per_second': 0.723, 'train_loss': 1.556786028544108, 'epoch': 20.0}


100%|██████████| 8/8 [00:02<00:00,  3.24it/s]
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection

Using device: cuda:0


  3%|▎         | 10/300 [00:09<04:08,  1.17it/s]

{'loss': 6.5161, 'grad_norm': 300.8685607910156, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.67}


  5%|▌         | 15/300 [00:13<03:39,  1.30it/s]
  5%|▌         | 15/300 [00:16<03:39,  1.30it/s]

{'eval_loss': 5.336240291595459, 'eval_runtime': 3.4783, 'eval_samples_per_second': 17.25, 'eval_steps_per_second': 2.3, 'epoch': 1.0}


  7%|▋         | 20/300 [00:21<05:10,  1.11s/it]

{'loss': 5.423, 'grad_norm': 180.41317749023438, 'learning_rate': 4.666666666666667e-05, 'epoch': 1.33}


 10%|█         | 30/300 [00:29<03:27,  1.30it/s]

{'loss': 4.8034, 'grad_norm': 46.03608703613281, 'learning_rate': 4.5e-05, 'epoch': 2.0}



 10%|█         | 30/300 [00:32<03:27,  1.30it/s]

{'eval_loss': 5.396916389465332, 'eval_runtime': 3.4342, 'eval_samples_per_second': 17.472, 'eval_steps_per_second': 2.33, 'epoch': 2.0}


 13%|█▎        | 40/300 [00:41<03:50,  1.13it/s]

{'loss': 4.5207, 'grad_norm': 100.11878204345703, 'learning_rate': 4.3333333333333334e-05, 'epoch': 2.67}


 15%|█▌        | 45/300 [00:45<03:15,  1.31it/s]
 15%|█▌        | 45/300 [00:48<03:15,  1.31it/s]

{'eval_loss': 4.313567161560059, 'eval_runtime': 3.3865, 'eval_samples_per_second': 17.717, 'eval_steps_per_second': 2.362, 'epoch': 3.0}


 17%|█▋        | 50/300 [00:53<04:38,  1.12s/it]

{'loss': 3.5977, 'grad_norm': 67.84677124023438, 'learning_rate': 4.166666666666667e-05, 'epoch': 3.33}


 20%|██        | 60/300 [01:01<03:02,  1.31it/s]

{'loss': 3.4467, 'grad_norm': 49.33641052246094, 'learning_rate': 4e-05, 'epoch': 4.0}



 20%|██        | 60/300 [01:05<03:02,  1.31it/s]

{'eval_loss': 4.366943836212158, 'eval_runtime': 3.3785, 'eval_samples_per_second': 17.759, 'eval_steps_per_second': 2.368, 'epoch': 4.0}


 23%|██▎       | 70/300 [01:13<03:23,  1.13it/s]

{'loss': 3.4072, 'grad_norm': 33.13031768798828, 'learning_rate': 3.8333333333333334e-05, 'epoch': 4.67}


 25%|██▌       | 75/300 [01:17<02:52,  1.31it/s]
 25%|██▌       | 75/300 [01:21<02:52,  1.31it/s]

{'eval_loss': 3.818007230758667, 'eval_runtime': 3.4861, 'eval_samples_per_second': 17.211, 'eval_steps_per_second': 2.295, 'epoch': 5.0}


 27%|██▋       | 80/300 [01:25<04:03,  1.10s/it]

{'loss': 3.0517, 'grad_norm': 47.956233978271484, 'learning_rate': 3.6666666666666666e-05, 'epoch': 5.33}


 30%|███       | 90/300 [01:33<02:40,  1.31it/s]

{'loss': 3.1342, 'grad_norm': 69.0641860961914, 'learning_rate': 3.5e-05, 'epoch': 6.0}



 30%|███       | 90/300 [01:37<02:40,  1.31it/s]

{'eval_loss': 4.3976826667785645, 'eval_runtime': 3.4233, 'eval_samples_per_second': 17.527, 'eval_steps_per_second': 2.337, 'epoch': 6.0}


 33%|███▎      | 100/300 [01:46<02:56,  1.13it/s]

{'loss': 3.2105, 'grad_norm': 40.628997802734375, 'learning_rate': 3.3333333333333335e-05, 'epoch': 6.67}


 35%|███▌      | 105/300 [01:50<02:28,  1.31it/s]
 35%|███▌      | 105/300 [01:53<02:28,  1.31it/s]

{'eval_loss': 3.682332754135132, 'eval_runtime': 3.4108, 'eval_samples_per_second': 17.591, 'eval_steps_per_second': 2.345, 'epoch': 7.0}


 37%|███▋      | 110/300 [01:58<03:28,  1.10s/it]

{'loss': 2.9974, 'grad_norm': 31.048585891723633, 'learning_rate': 3.1666666666666666e-05, 'epoch': 7.33}


 40%|████      | 120/300 [02:06<02:17,  1.31it/s]

{'loss': 3.3314, 'grad_norm': 35.647830963134766, 'learning_rate': 3e-05, 'epoch': 8.0}



 40%|████      | 120/300 [02:09<02:17,  1.31it/s]

{'eval_loss': 3.6149556636810303, 'eval_runtime': 3.3946, 'eval_samples_per_second': 17.675, 'eval_steps_per_second': 2.357, 'epoch': 8.0}


 43%|████▎     | 130/300 [02:18<02:32,  1.12it/s]

{'loss': 2.9588, 'grad_norm': 33.97581481933594, 'learning_rate': 2.8333333333333335e-05, 'epoch': 8.67}


 45%|████▌     | 135/300 [02:22<02:06,  1.30it/s]
 45%|████▌     | 135/300 [02:25<02:06,  1.30it/s]

{'eval_loss': 3.5564253330230713, 'eval_runtime': 3.404, 'eval_samples_per_second': 17.626, 'eval_steps_per_second': 2.35, 'epoch': 9.0}


 47%|████▋     | 140/300 [02:30<02:55,  1.10s/it]

{'loss': 2.9515, 'grad_norm': 41.49867630004883, 'learning_rate': 2.6666666666666667e-05, 'epoch': 9.33}


 50%|█████     | 150/300 [02:38<01:54,  1.31it/s]

{'loss': 2.8739, 'grad_norm': 26.20855712890625, 'learning_rate': 2.5e-05, 'epoch': 10.0}



 50%|█████     | 150/300 [02:41<01:54,  1.31it/s]

{'eval_loss': 3.6404073238372803, 'eval_runtime': 3.3962, 'eval_samples_per_second': 17.667, 'eval_steps_per_second': 2.356, 'epoch': 10.0}


 53%|█████▎    | 160/300 [02:50<02:05,  1.11it/s]

{'loss': 2.7828, 'grad_norm': 60.18735122680664, 'learning_rate': 2.3333333333333336e-05, 'epoch': 10.67}


 55%|█████▌    | 165/300 [02:54<01:44,  1.29it/s]
 55%|█████▌    | 165/300 [02:58<01:44,  1.29it/s]

{'eval_loss': 3.5938971042633057, 'eval_runtime': 3.4442, 'eval_samples_per_second': 17.42, 'eval_steps_per_second': 2.323, 'epoch': 11.0}


 57%|█████▋    | 170/300 [03:02<02:24,  1.11s/it]

{'loss': 2.8758, 'grad_norm': 40.58694839477539, 'learning_rate': 2.1666666666666667e-05, 'epoch': 11.33}


 60%|██████    | 180/300 [03:10<01:32,  1.30it/s]

{'loss': 2.8514, 'grad_norm': 28.92098045349121, 'learning_rate': 2e-05, 'epoch': 12.0}



 60%|██████    | 180/300 [03:14<01:32,  1.30it/s]

{'eval_loss': 3.4920554161071777, 'eval_runtime': 3.5683, 'eval_samples_per_second': 16.815, 'eval_steps_per_second': 2.242, 'epoch': 12.0}


 63%|██████▎   | 190/300 [03:23<01:39,  1.10it/s]

{'loss': 2.7641, 'grad_norm': 47.999324798583984, 'learning_rate': 1.8333333333333333e-05, 'epoch': 12.67}


 65%|██████▌   | 195/300 [03:27<01:21,  1.29it/s]
 65%|██████▌   | 195/300 [03:30<01:21,  1.29it/s]

{'eval_loss': 3.588409662246704, 'eval_runtime': 3.5522, 'eval_samples_per_second': 16.891, 'eval_steps_per_second': 2.252, 'epoch': 13.0}


 67%|██████▋   | 200/300 [03:35<01:52,  1.12s/it]

{'loss': 2.674, 'grad_norm': 47.82052993774414, 'learning_rate': 1.6666666666666667e-05, 'epoch': 13.33}


 70%|███████   | 210/300 [03:43<01:09,  1.30it/s]

{'loss': 2.71, 'grad_norm': 46.134033203125, 'learning_rate': 1.5e-05, 'epoch': 14.0}



 70%|███████   | 210/300 [03:47<01:09,  1.30it/s]

{'eval_loss': 3.232269763946533, 'eval_runtime': 3.4722, 'eval_samples_per_second': 17.28, 'eval_steps_per_second': 2.304, 'epoch': 14.0}


 73%|███████▎  | 220/300 [03:56<01:12,  1.10it/s]

{'loss': 2.4857, 'grad_norm': 48.15557861328125, 'learning_rate': 1.3333333333333333e-05, 'epoch': 14.67}


 75%|███████▌  | 225/300 [04:00<00:58,  1.28it/s]
 75%|███████▌  | 225/300 [04:03<00:58,  1.28it/s]

{'eval_loss': 3.1173226833343506, 'eval_runtime': 3.4722, 'eval_samples_per_second': 17.28, 'eval_steps_per_second': 2.304, 'epoch': 15.0}


 77%|███████▋  | 230/300 [04:08<01:17,  1.11s/it]

{'loss': 2.4466, 'grad_norm': 60.06028747558594, 'learning_rate': 1.1666666666666668e-05, 'epoch': 15.33}


 80%|████████  | 240/300 [04:16<00:46,  1.30it/s]

{'loss': 2.3727, 'grad_norm': 58.68257522583008, 'learning_rate': 1e-05, 'epoch': 16.0}



 80%|████████  | 240/300 [04:20<00:46,  1.30it/s]

{'eval_loss': 3.068908214569092, 'eval_runtime': 3.4414, 'eval_samples_per_second': 17.435, 'eval_steps_per_second': 2.325, 'epoch': 16.0}


 83%|████████▎ | 250/300 [04:29<00:45,  1.10it/s]

{'loss': 2.2653, 'grad_norm': 33.1005859375, 'learning_rate': 8.333333333333334e-06, 'epoch': 16.67}


 85%|████████▌ | 255/300 [04:33<00:34,  1.29it/s]
 85%|████████▌ | 255/300 [04:36<00:34,  1.29it/s]

{'eval_loss': 3.157379627227783, 'eval_runtime': 3.4591, 'eval_samples_per_second': 17.345, 'eval_steps_per_second': 2.313, 'epoch': 17.0}


 87%|████████▋ | 260/300 [04:41<00:44,  1.11s/it]

{'loss': 2.325, 'grad_norm': 51.008541107177734, 'learning_rate': 6.666666666666667e-06, 'epoch': 17.33}


 90%|█████████ | 270/300 [04:49<00:23,  1.30it/s]

{'loss': 2.3506, 'grad_norm': 55.32683563232422, 'learning_rate': 5e-06, 'epoch': 18.0}



 90%|█████████ | 270/300 [04:52<00:23,  1.30it/s]

{'eval_loss': 3.0573689937591553, 'eval_runtime': 3.5939, 'eval_samples_per_second': 16.695, 'eval_steps_per_second': 2.226, 'epoch': 18.0}


 93%|█████████▎| 280/300 [05:01<00:17,  1.11it/s]

{'loss': 2.2508, 'grad_norm': 72.43883514404297, 'learning_rate': 3.3333333333333333e-06, 'epoch': 18.67}


 95%|█████████▌| 285/300 [05:05<00:11,  1.26it/s]
 95%|█████████▌| 285/300 [05:09<00:11,  1.26it/s]

{'eval_loss': 3.0934038162231445, 'eval_runtime': 3.4971, 'eval_samples_per_second': 17.157, 'eval_steps_per_second': 2.288, 'epoch': 19.0}


 97%|█████████▋| 290/300 [05:14<00:12,  1.21s/it]

{'loss': 2.3135, 'grad_norm': 36.431522369384766, 'learning_rate': 1.6666666666666667e-06, 'epoch': 19.33}


100%|██████████| 300/300 [05:23<00:00,  1.19it/s]

{'loss': 2.09, 'grad_norm': 102.3625259399414, 'learning_rate': 0.0, 'epoch': 20.0}



100%|██████████| 300/300 [05:27<00:00,  1.09s/it]


{'eval_loss': 3.1181180477142334, 'eval_runtime': 3.6755, 'eval_samples_per_second': 16.324, 'eval_steps_per_second': 2.177, 'epoch': 20.0}
{'train_runtime': 327.1202, 'train_samples_per_second': 9.171, 'train_steps_per_second': 0.917, 'train_loss': 3.1260866355895995, 'epoch': 20.0}


 38%|███▊      | 3/8 [00:00<00:01,  2.96it/s]

KeyboardInterrupt: 

In [7]:
for model_name, score in best_models_kugellager_dict.items():
    print(f"model: {model_name}, performte mit folgenden Scores:\n{score}")

for model_name, score in best_models_oberfläche_dict.items():
    print(f"model: {model_name}, performte mit folgenden Scores:\n{score}")

for model_name, score in best_models_kugellager_dict_halb.items():
    print(f"model: {model_name}, performte mit folgenden Scores:\n{score}")

for model_name, score in best_models_oberfläche_dict_halb.items():
    print(f"model: {model_name}, performte mit folgenden Scores:\n{score}")

model: hustvl/yolos-tiny, performte mit folgenden Scores:
{'evaluation_results': {'eval_loss': 0.9846168160438538, 'eval_runtime': 6.1671, 'eval_samples_per_second': 19.458, 'eval_steps_per_second': 2.432, 'epoch': 50.0}, 'train_losses': [1.5652, 1.5312, 1.2966, 1.3194, 1.2127, 1.2189, 1.3403, 1.3228, 1.3172, 1.6135, 1.1993, 1.1112, 1.2941, 1.2427, 1.2233, 1.1958, 1.0332, 1.1606, 1.1071, 1.1547, 1.0124, 1.0647, 1.0306, 1.0422, 1.1228, 0.9944, 1.1028, 1.0793, 1.0689, 1.0131, 1.0513, 0.9871, 1.0195, 0.9839, 1.0762, 1.1371, 1.0283, 1.1042, 1.0052, 0.8853, 0.9762, 0.9569, 0.9352, 0.9637, 0.8683, 0.8804, 0.8388, 0.9413, 1.0316, 0.8786, 0.8046, 0.8878, 0.8308, 0.7955, 0.8837, 0.9218, 0.8276, 0.7941, 0.7414, 0.7645, 0.9002, 0.8929, 0.8705, 0.8282, 0.8039, 0.8647, 0.7541, 0.8, 0.7175, 0.7535, 0.713, 0.8802, 0.6926, 0.7339, 0.7084, 0.6583, 0.7666, 0.711, 0.6861, 0.6481, 0.697, 0.7283, 0.6562, 0.7198, 0.6114, 0.6634, 0.6836, 0.6274, 0.7584, 0.8059, 0.7435, 0.6525, 0.7667, 0.6837, 0.6471, 0.6742,

In [None]:

def load_model(model_path):
    """
    Lädt ein Object Detection Modell aus dem angegebenen Ordner.
    """
    model = AutoModelForObjectDetection.from_pretrained(model_path)
    processor = YolosImageProcessor.from_pretrained(model_path)
    model.eval()
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)
    return model, processor, device


def load_random_images(image_folder, num_images=10):
    """
    Lädt zufällig ausgewählte Bilder aus einem Ordner.
    """
    image_files = [f for f in os.listdir(image_folder) if f.lower().endswith(('jpg', 'jpeg', 'png'))]
    selected_files = random.sample(image_files, min(len(image_files), num_images))
    return [os.path.join(image_folder, f) for f in selected_files]


def draw_bounding_boxes(image, detections, labels):
    """
    Zeichnet Bounding Boxes und Labels auf das Bild.
    """
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    for detection in detections:
        box = detection['box']
        label = labels[detection['label']]
        score = detection['score']
        draw.rectangle(box, outline='red', width=3)
        draw.text((box[0], box[1] - 10), f"{label} {score:.2f}", fill="red", font=font)
    return image


def predict_and_label_images(model_path, image_folder, output_folder, num_images=10):
    """
    Lädt ein Modell, führt Object Detection auf zufälligen Bildern durch und speichert die Ergebnisse.
    """
    model, processor, device = load_model(model_path)
    images = load_random_images(image_folder, num_images)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for image_path in images:
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        detections = []
        for logits, box in zip(outputs.logits[0], outputs.pred_boxes[0]):
            prob = logits.softmax(-1).max().item()
            label = logits.argmax().item()
            if prob > 0.5:  # Threshold
                detections.append({
                    'label': label,
                    'score': prob,
                    'box': [
                        box[0].item() * image.width, box[1].item() * image.height,
                        box[2].item() * image.width, box[3].item() * image.height
                    ]
                })

        labeled_image = draw_bounding_boxes(image, detections, model.config.id2label)
        output_path = os.path.join(output_folder, os.path.basename(image_path))
        labeled_image.save(output_path)


# Beispiel zur Nutzung:
model_path_oberfläche_yolo = os.path.join(BASE_DIR, "trained_model", "Oberfläche", "hustlv", "yolos-tiny", "final_model")
model_path_oberfläche_resnet = os.path.join(BASE_DIR, "trained_model", "Oberfläche", "facebook", "detr-resnet-50", "final_model")
model_path_kugellager_yolo = os.path.join(BASE_DIR, "trained_model", "Kugellager", "hustlv", "yolos-tiny", "final_model")
model_path_kugellager_resenet = os.path.join(BASE_DIR, "trained_model", "Kugellager", "facebook", "detr-resnet-50", "final_model")
predict_and_label_images(model_path=model_path_oberfläche_yolo, image_folder=images_dir_oberfläche_test, output_folder="test_labeling_pics_oberfläche_yolo", num_images=10)
predict_and_label_images(model_path=model_path_oberfläche_resnet, image_folder=images_dir_oberfläche_test, output_folder="test_labeling_pics_oberfläche_resnet", num_images=10)
predict_and_label_images(model_path=model_path_kugellager_yolo, image_folder=images_dir_oberfläche_test, output_folder="test_labeling_pics_kugellager_yolo", num_images=10)
predict_and_label_images(model_path=model_path_kugellager_resenet, image_folder=images_dir_oberfläche_test, output_folder="test_labeling_pics_kugellager_resnet", num_images=10)

In [8]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()  # Stellt sicher, dass alle CUDA-Operationen abgeschlossen sind

# Optional: Garbage Collection erzwingen
import gc
gc.collect()

14063

In [9]:
# from transformers import AutoImageProcessor, pipeline
# from PIL import Image, ImageDraw, ImageFont

# random_images = random.sample(dataset_oberfläche_test["image"], 10)
# for test_image in random_images:
#     test = r"C:\Users\anohl\OneDrive\Dokumente\A_Uni_stuff\Albstadt\Semester 2\Computer_vision\Aufgaben\trained_model\hustvl\yolos-tiny\final_model"
#     out = pipeline("object-detection", model=test, image_processor=YolosImageProcessor())(test_image, threshold=0)

#     # Normalize the prediction thresholds
#     outs = [o["score"] for o in out]
#     filtered_out = []
#     threshold = 0.9
#     for o in out:
#         o["score"] = 1 / max(outs) * o["score"]
#         if o["score"] >= threshold:
#             filtered_out.append(o)
#             print(filtered_out)

#     def plot_box(img, results):
#         try:
#             image = Image.open(img)
#         except:
#             image = img
#         draw = ImageDraw.Draw(image)
#         for i, obj in enumerate(results):
#             box = [round(obj["box"][value], 2) for value in obj["box"].keys()]
#             x, y, x2, y2 = tuple(box)
#             draw.rectangle((x, y, x2, y2), outline="orange", width=2)
#             draw.text((x, y), f"{obj['label']} - {np.round(obj['score'], 3)}", fill="orange", stroke_width=2, stroke_fill="white")
#         image.save(f"./{np.random.randint(10e6)}.jpg")

#     # Visualize Results
#     plot_box(test_image, filtered_out)

In [10]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")

# # Verwendung von yolv8m um eine Blance zwischen performance und Genauigkeit zu haben 
# model = YOLO("yolov8m.pt")

# # Train the model
# path_to_yolo_yaml = os.path.join(BASE_DIR, "yaml_files", "yolo_dataset.yaml")
# train_results = model.train(
#     data=path_to_yolo_yaml,
#     epochs=400,
#     imgsz=150, # Weil das die tatsächliche Größe unserer Bilder darstellt
#     device=device,
#     batch=16,
#     mosaic=1.0,
# )

# # Evaluate model performance on the validation set
# metrics = model.val()
# print(metrics)

# # Export the model to the same directory as the script
# export_path = os.path.join(BASE_DIR, "Models", "yolo_kugellager_modell.pt")
# model_path = model.export()
# shutil.move(src=model_path, dst=export_path)
# print(f"Model exported to: {export_path}")


In [11]:
# metrics = model.val()
# print(f"Precision: {metrics['precision']}")
# print(f"Recall: {metrics['recall']}")
# print(f"mAP@0.5: {metrics['map50']}")
# print(f"mAP@0.5:0.95: {metrics['map']}")