### Imports

In [2]:
# Model card: https://huggingface.co/Organika/sdxl-detector#validation-metrics
# This notebook fine-tunes that model on the faces and art datasets (one dataset per run) and evaluates it on the chosen dataset's validation set.
# Reference tutorial: https://huggingface.co/blog/fine-tune-vit

import os
import torch
from transformers import AutoImageProcessor, SwinForImageClassification, TrainingArguments, Trainer
import evaluate
from datasets import load_dataset
import numpy as np

# Work from the repository root so the relative dataset and output paths used in
# later cells resolve. The location can be overridden with the THESIS_ROOT
# environment variable; the default is the original author's checkout
# (os.chdir("..") also works when the kernel starts inside the notebook's folder).
PROJECT_ROOT = os.environ.get("THESIS_ROOT", "C:/Users/metet/OneDrive/Documents/GitHub/thesis")
os.chdir(PROJECT_ROOT)
print(os.getcwd())

# Use the GPU when one is available, otherwise fall back to the CPU.
# To force CPU execution, replace the expression with torch.device("cpu").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

C:\Users\metet\OneDrive\Documents\GitHub\thesis
cuda


### Load Datasets and Processor

In [3]:
# Image processor published with the pre-trained detector checkpoint
# (the classification model itself is loaded in a later cell)
processor = AutoImageProcessor.from_pretrained("Organika/sdxl-detector")

# Dataset folders, relative to the current working directory
art_dataset_path = 'archive/datasets/art_512x512'
faces_dataset_path = 'archive/datasets/faces_512x512'

# Load both image datasets with the generic "imagefolder" builder (art first, then faces)
art_ds, faces_ds = (
    load_dataset("imagefolder", data_dir=folder)
    for folder in (art_dataset_path, faces_dataset_path)
)

print(faces_ds)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 12800
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
})


### Setting Up

In [4]:
# Transform images to model input
def transform(image_batch):
    """Convert a batch of raw dataset rows into model inputs.

    Args:
        image_batch: dict holding an 'image' list and a 'label' list for the
            rows being accessed (the format `with_transform` passes in).

    Returns:
        The processor output (including 'pixel_values') with an added
        'labels' tensor built from the batch's labels.

    Tensors are deliberately left on the CPU. Creating CUDA tensors during data
    loading prevents memory pinning and is not recommended with worker
    processes; the Trainer moves each batch to its own device before the
    forward pass, so no explicit transfer is needed here.
    """
    inputs = processor(images=image_batch['image'], return_tensors="pt")
    inputs['labels'] = torch.tensor(image_batch['label'])  # ensure labels are tensors
    return inputs

# Apply the transform lazily, when rows are accessed, instead of preprocessing up front
art_ds_transformed = art_ds.with_transform(transform)
faces_ds_transformed = faces_ds.with_transform(transform)

In [5]:
# Turn a list of per-example dicts into one batch of tensors
def collate_fn(batch):
    """Stack individual examples into a single batch.

    Args:
        batch: list of dicts, each with a 'pixel_values' tensor and a scalar
            'labels' entry (a plain int or a 0-dim tensor).

    Returns:
        dict with 'pixel_values' stacked along a new leading dimension and
        'labels' as a 1-D int64 tensor, one entry per example.

    No device transfer happens here: the Trainer moves each collated batch to
    its own device, and keeping the collator device-agnostic avoids creating
    CUDA tensors inside the data loader.
    """
    return {
        'pixel_values': torch.stack([example['pixel_values'] for example in batch]),
        # int() accepts both Python ints and 0-dim tensors (on any device)
        'labels': torch.tensor([int(example['labels']) for example in batch]),
    }

In [6]:
# Metrics reported at every evaluation
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: prediction object exposing `predictions` (per-class scores, one row
            per example) and `label_ids` (the reference labels).

    Returns:
        dict with the keys "Accuracy" and "F1".
    """
    # The predicted class is the index of the highest score in each row
    predicted_classes = np.argmax(p.predictions, axis=1)
    accuracy = acc_metric.compute(predictions=predicted_classes, references=p.label_ids)["accuracy"]
    f1_score = f1_metric.compute(predictions=predicted_classes, references=p.label_ids)["f1"]
    return {"Accuracy": accuracy, "F1": f1_score}

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<?, ?B/s]
Downloading builder script: 100%|██████████| 6.79k/6.79k [00:00<?, ?B/s]


### Parameters

In [7]:
# Dataset to fine-tune on: keep exactly one of the two assignments active.
# dataset_to_use = faces_ds_transformed
dataset_to_use = art_ds_transformed

# Candidate learning rates. chosen_lr is an INDEX into lr_values, not a rate
# (index 1 selects 5e-5); it is looked up when the training arguments are built.
lr_values = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3] 
chosen_lr = 1

# Output directory handed to the training arguments.
# NOTE(review): presumably "./sdxl-fine-tune" pairs with the faces dataset - confirm,
# and switch it together with dataset_to_use so runs do not overwrite each other.
# output_dir_name = "./sdxl-fine-tune"
output_dir_name = "./sdxl-fine-tune-art"

### Load Model

In [8]:
# Class names come from the training split's 'label' feature
labels = dataset_to_use['train'].features['label'].names
print(labels[0:2])

# Load the pre-trained detector with a classification head sized to the dataset's
# classes; both mappings use the class index rendered as a string.
model = SwinForImageClassification.from_pretrained(
    "Organika/sdxl-detector",
    num_labels=len(labels),
    id2label=dict(zip(map(str, range(len(labels))), labels)),
    label2id=dict(zip(labels, map(str, range(len(labels))))),
)
model = model.to(device)

['0', '1']


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [10]:
# Training configuration: only the learning rate changes between runs,
# every other setting stays fixed.
training_args = TrainingArguments(
    # --- where results go ---
    output_dir=output_dir_name,
    report_to="tensorboard",
    push_to_hub=False,
    # --- optimisation ---
    learning_rate=lr_values[chosen_lr],
    lr_scheduler_type="constant",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    fp16=False,  # author's note: True leads to runtime errors on CUDA
    # --- evaluation, logging and checkpointing (all step-based) ---
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- data loading ---
    remove_unused_columns=False,
    dataloader_pin_memory=False,  # author's note: otherwise it doesn't work with CUDA
)


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:
# Wire the model, data, batching and metrics into a single Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # data
    train_dataset=dataset_to_use["train"],
    eval_dataset=dataset_to_use["validation"],
    data_collator=collate_fn,
    # the image processor is passed through the `tokenizer` argument
    tokenizer=processor,
    # evaluation
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


### Training and Evaluation

In [11]:
# Fine-tune, then persist the model, the training metrics and the trainer state.
# The order matters: the metrics only exist once trainer.train() has returned.
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

# Final evaluation on the validation split of the selected dataset;
# the results are both logged and saved under the "eval" name.
metrics = trainer.evaluate(dataset_to_use['validation'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


  1%|          | 10/1600 [00:09<23:06,  1.15it/s]

{'loss': 2.4958, 'grad_norm': 25.296981811523438, 'learning_rate': 5e-05, 'epoch': 0.01}


  1%|▏         | 20/1600 [00:18<22:31,  1.17it/s]

{'loss': 0.7042, 'grad_norm': 23.66070556640625, 'learning_rate': 5e-05, 'epoch': 0.01}


  2%|▏         | 30/1600 [00:27<22:35,  1.16it/s]

{'loss': 0.3335, 'grad_norm': 28.998109817504883, 'learning_rate': 5e-05, 'epoch': 0.02}


  2%|▎         | 40/1600 [00:35<22:21,  1.16it/s]

{'loss': 0.2727, 'grad_norm': 20.41728401184082, 'learning_rate': 5e-05, 'epoch': 0.03}


  3%|▎         | 50/1600 [00:44<21:59,  1.17it/s]

{'loss': 0.2965, 'grad_norm': 11.940719604492188, 'learning_rate': 5e-05, 'epoch': 0.03}


  4%|▍         | 60/1600 [00:52<21:30,  1.19it/s]

{'loss': 0.3245, 'grad_norm': 51.65280532836914, 'learning_rate': 5e-05, 'epoch': 0.04}


  4%|▍         | 70/1600 [01:01<21:52,  1.17it/s]

{'loss': 0.5706, 'grad_norm': 75.0677719116211, 'learning_rate': 5e-05, 'epoch': 0.04}


  5%|▌         | 80/1600 [01:10<21:42,  1.17it/s]

{'loss': 0.1334, 'grad_norm': 27.067684173583984, 'learning_rate': 5e-05, 'epoch': 0.05}


  6%|▌         | 90/1600 [01:18<21:26,  1.17it/s]

{'loss': 0.3126, 'grad_norm': 6.170212268829346, 'learning_rate': 5e-05, 'epoch': 0.06}


  6%|▋         | 100/1600 [01:27<21:24,  1.17it/s]

{'loss': 0.3878, 'grad_norm': 26.43212890625, 'learning_rate': 5e-05, 'epoch': 0.06}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.5166861414909363, 'eval_Accuracy': 0.894375, 'eval_F1': 0.9038133181559478, 'eval_runtime': 71.4637, 'eval_samples_per_second': 22.389, 'eval_steps_per_second': 2.799, 'epoch': 0.06}


  7%|▋         | 110/1600 [02:50<44:55,  1.81s/it]  

{'loss': 0.1946, 'grad_norm': 4.881194591522217, 'learning_rate': 5e-05, 'epoch': 0.07}


  8%|▊         | 120/1600 [02:59<22:18,  1.11it/s]

{'loss': 0.4384, 'grad_norm': 23.842098236083984, 'learning_rate': 5e-05, 'epoch': 0.07}


  8%|▊         | 130/1600 [03:08<21:11,  1.16it/s]

{'loss': 0.2406, 'grad_norm': 16.382144927978516, 'learning_rate': 5e-05, 'epoch': 0.08}


  9%|▉         | 140/1600 [03:16<20:55,  1.16it/s]

{'loss': 0.3008, 'grad_norm': 1.3535693883895874, 'learning_rate': 5e-05, 'epoch': 0.09}


  9%|▉         | 150/1600 [03:25<20:43,  1.17it/s]

{'loss': 0.3453, 'grad_norm': 26.694047927856445, 'learning_rate': 5e-05, 'epoch': 0.09}


 10%|█         | 160/1600 [03:34<20:52,  1.15it/s]

{'loss': 0.3139, 'grad_norm': 0.6078062653541565, 'learning_rate': 5e-05, 'epoch': 0.1}


 11%|█         | 170/1600 [03:42<20:10,  1.18it/s]

{'loss': 0.3086, 'grad_norm': 0.03087734431028366, 'learning_rate': 5e-05, 'epoch': 0.11}


 11%|█▏        | 180/1600 [03:51<20:11,  1.17it/s]

{'loss': 0.3474, 'grad_norm': 5.62446928024292, 'learning_rate': 5e-05, 'epoch': 0.11}


 12%|█▏        | 190/1600 [04:00<20:13,  1.16it/s]

{'loss': 0.2326, 'grad_norm': 23.75773811340332, 'learning_rate': 5e-05, 'epoch': 0.12}


 12%|█▎        | 200/1600 [04:08<19:57,  1.17it/s]

{'loss': 0.2083, 'grad_norm': 12.376899719238281, 'learning_rate': 5e-05, 'epoch': 0.12}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.19363334774971008, 'eval_Accuracy': 0.93125, 'eval_F1': 0.9302915082382762, 'eval_runtime': 56.8746, 'eval_samples_per_second': 28.132, 'eval_steps_per_second': 3.517, 'epoch': 0.12}


 13%|█▎        | 210/1600 [05:18<37:24,  1.61s/it]  

{'loss': 0.3037, 'grad_norm': 18.937335968017578, 'learning_rate': 5e-05, 'epoch': 0.13}


 14%|█▍        | 220/1600 [05:26<20:19,  1.13it/s]

{'loss': 0.203, 'grad_norm': 41.078819274902344, 'learning_rate': 5e-05, 'epoch': 0.14}


 14%|█▍        | 230/1600 [05:35<20:00,  1.14it/s]

{'loss': 0.2893, 'grad_norm': 10.32539176940918, 'learning_rate': 5e-05, 'epoch': 0.14}


 15%|█▌        | 240/1600 [05:44<19:32,  1.16it/s]

{'loss': 0.1015, 'grad_norm': 2.0060904026031494, 'learning_rate': 5e-05, 'epoch': 0.15}


 16%|█▌        | 250/1600 [05:52<19:13,  1.17it/s]

{'loss': 0.4872, 'grad_norm': 6.093726634979248, 'learning_rate': 5e-05, 'epoch': 0.16}


 16%|█▋        | 260/1600 [06:01<19:08,  1.17it/s]

{'loss': 0.1111, 'grad_norm': 36.29031753540039, 'learning_rate': 5e-05, 'epoch': 0.16}


 17%|█▋        | 270/1600 [06:10<18:56,  1.17it/s]

{'loss': 0.2382, 'grad_norm': 14.445043563842773, 'learning_rate': 5e-05, 'epoch': 0.17}


 18%|█▊        | 280/1600 [06:18<19:06,  1.15it/s]

{'loss': 0.4475, 'grad_norm': 92.79743194580078, 'learning_rate': 5e-05, 'epoch': 0.17}


 18%|█▊        | 290/1600 [06:27<18:44,  1.17it/s]

{'loss': 0.2687, 'grad_norm': 9.300115585327148, 'learning_rate': 5e-05, 'epoch': 0.18}


 19%|█▉        | 300/1600 [06:36<18:31,  1.17it/s]

{'loss': 0.251, 'grad_norm': 1.8239731788635254, 'learning_rate': 5e-05, 'epoch': 0.19}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.22721485793590546, 'eval_Accuracy': 0.925625, 'eval_F1': 0.927127985303123, 'eval_runtime': 57.0192, 'eval_samples_per_second': 28.061, 'eval_steps_per_second': 3.508, 'epoch': 0.19}


 19%|█▉        | 310/1600 [07:46<34:30,  1.60s/it]  

{'loss': 0.2604, 'grad_norm': 3.1132919788360596, 'learning_rate': 5e-05, 'epoch': 0.19}


 20%|██        | 320/1600 [07:54<18:38,  1.14it/s]

{'loss': 0.2812, 'grad_norm': 1.6643500328063965, 'learning_rate': 5e-05, 'epoch': 0.2}


 21%|██        | 330/1600 [08:03<18:06,  1.17it/s]

{'loss': 0.5684, 'grad_norm': 17.18016242980957, 'learning_rate': 5e-05, 'epoch': 0.21}


 21%|██▏       | 340/1600 [08:12<18:06,  1.16it/s]

{'loss': 0.424, 'grad_norm': 2.9249327182769775, 'learning_rate': 5e-05, 'epoch': 0.21}


 22%|██▏       | 350/1600 [08:20<17:55,  1.16it/s]

{'loss': 0.3537, 'grad_norm': 20.89545440673828, 'learning_rate': 5e-05, 'epoch': 0.22}


 22%|██▎       | 360/1600 [08:29<17:49,  1.16it/s]

{'loss': 0.254, 'grad_norm': 39.78134536743164, 'learning_rate': 5e-05, 'epoch': 0.23}


 23%|██▎       | 370/1600 [08:38<17:38,  1.16it/s]

{'loss': 0.147, 'grad_norm': 26.556259155273438, 'learning_rate': 5e-05, 'epoch': 0.23}


 24%|██▍       | 380/1600 [08:46<17:28,  1.16it/s]

{'loss': 0.1098, 'grad_norm': 8.117420196533203, 'learning_rate': 5e-05, 'epoch': 0.24}


 24%|██▍       | 390/1600 [08:55<17:18,  1.16it/s]

{'loss': 0.371, 'grad_norm': 211.08543395996094, 'learning_rate': 5e-05, 'epoch': 0.24}


 25%|██▌       | 400/1600 [09:04<16:59,  1.18it/s]

{'loss': 0.2905, 'grad_norm': 54.78911209106445, 'learning_rate': 5e-05, 'epoch': 0.25}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.265254944562912, 'eval_Accuracy': 0.929375, 'eval_F1': 0.9313069908814591, 'eval_runtime': 56.9229, 'eval_samples_per_second': 28.108, 'eval_steps_per_second': 3.514, 'epoch': 0.25}


 26%|██▌       | 410/1600 [10:12<31:10,  1.57s/it]  

{'loss': 0.223, 'grad_norm': 28.523075103759766, 'learning_rate': 5e-05, 'epoch': 0.26}


 26%|██▋       | 420/1600 [10:20<17:21,  1.13it/s]

{'loss': 0.3024, 'grad_norm': 37.597286224365234, 'learning_rate': 5e-05, 'epoch': 0.26}


 27%|██▋       | 430/1600 [10:29<16:49,  1.16it/s]

{'loss': 0.3061, 'grad_norm': 15.285408020019531, 'learning_rate': 5e-05, 'epoch': 0.27}


 28%|██▊       | 440/1600 [10:38<16:41,  1.16it/s]

{'loss': 0.2138, 'grad_norm': 11.39369010925293, 'learning_rate': 5e-05, 'epoch': 0.28}


 28%|██▊       | 450/1600 [10:46<16:27,  1.16it/s]

{'loss': 0.1514, 'grad_norm': 0.01798662170767784, 'learning_rate': 5e-05, 'epoch': 0.28}


 29%|██▉       | 460/1600 [10:55<16:13,  1.17it/s]

{'loss': 0.1952, 'grad_norm': 15.90272045135498, 'learning_rate': 5e-05, 'epoch': 0.29}


 29%|██▉       | 470/1600 [11:04<16:09,  1.17it/s]

{'loss': 0.3139, 'grad_norm': 104.1943130493164, 'learning_rate': 5e-05, 'epoch': 0.29}


 30%|███       | 480/1600 [11:12<16:21,  1.14it/s]

{'loss': 0.2212, 'grad_norm': 30.378433227539062, 'learning_rate': 5e-05, 'epoch': 0.3}


 31%|███       | 490/1600 [11:21<16:07,  1.15it/s]

{'loss': 0.214, 'grad_norm': 54.12253189086914, 'learning_rate': 5e-05, 'epoch': 0.31}


 31%|███▏      | 500/1600 [11:30<15:50,  1.16it/s]

{'loss': 0.1748, 'grad_norm': 26.206504821777344, 'learning_rate': 5e-05, 'epoch': 0.31}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.5235199928283691, 'eval_Accuracy': 0.884375, 'eval_F1': 0.8956570783981952, 'eval_runtime': 57.3096, 'eval_samples_per_second': 27.919, 'eval_steps_per_second': 3.49, 'epoch': 0.31}


 32%|███▏      | 510/1600 [12:38<28:42,  1.58s/it]  

{'loss': 0.345, 'grad_norm': 0.4016208350658417, 'learning_rate': 5e-05, 'epoch': 0.32}


 32%|███▎      | 520/1600 [12:47<16:00,  1.12it/s]

{'loss': 0.2969, 'grad_norm': 15.089537620544434, 'learning_rate': 5e-05, 'epoch': 0.33}


 33%|███▎      | 530/1600 [12:56<15:37,  1.14it/s]

{'loss': 0.287, 'grad_norm': 1.586912989616394, 'learning_rate': 5e-05, 'epoch': 0.33}


 34%|███▍      | 540/1600 [13:04<15:17,  1.16it/s]

{'loss': 0.0587, 'grad_norm': 30.19501304626465, 'learning_rate': 5e-05, 'epoch': 0.34}


 34%|███▍      | 550/1600 [13:13<15:00,  1.17it/s]

{'loss': 0.3416, 'grad_norm': 0.29155629873275757, 'learning_rate': 5e-05, 'epoch': 0.34}


 35%|███▌      | 560/1600 [13:22<15:03,  1.15it/s]

{'loss': 0.0036, 'grad_norm': 0.004087112378329039, 'learning_rate': 5e-05, 'epoch': 0.35}


 36%|███▌      | 570/1600 [13:31<14:49,  1.16it/s]

{'loss': 0.2329, 'grad_norm': 0.01137535460293293, 'learning_rate': 5e-05, 'epoch': 0.36}


 36%|███▋      | 580/1600 [13:39<14:33,  1.17it/s]

{'loss': 0.2756, 'grad_norm': 0.06275182217359543, 'learning_rate': 5e-05, 'epoch': 0.36}


 37%|███▋      | 590/1600 [13:48<14:21,  1.17it/s]

{'loss': 0.1935, 'grad_norm': 27.531707763671875, 'learning_rate': 5e-05, 'epoch': 0.37}


 38%|███▊      | 600/1600 [13:57<14:25,  1.16it/s]

{'loss': 0.1259, 'grad_norm': 0.1884840875864029, 'learning_rate': 5e-05, 'epoch': 0.38}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.22836610674858093, 'eval_Accuracy': 0.928125, 'eval_F1': 0.9245901639344262, 'eval_runtime': 57.9195, 'eval_samples_per_second': 27.625, 'eval_steps_per_second': 3.453, 'epoch': 0.38}


 38%|███▊      | 610/1600 [15:06<26:15,  1.59s/it]  

{'loss': 0.3662, 'grad_norm': 9.322120666503906, 'learning_rate': 5e-05, 'epoch': 0.38}


 39%|███▉      | 620/1600 [15:15<14:28,  1.13it/s]

{'loss': 0.3949, 'grad_norm': 10.063549995422363, 'learning_rate': 5e-05, 'epoch': 0.39}


 39%|███▉      | 630/1600 [15:23<14:08,  1.14it/s]

{'loss': 0.1492, 'grad_norm': 0.7042864561080933, 'learning_rate': 5e-05, 'epoch': 0.39}


 40%|████      | 640/1600 [15:32<13:57,  1.15it/s]

{'loss': 0.1785, 'grad_norm': 66.28573608398438, 'learning_rate': 5e-05, 'epoch': 0.4}


 41%|████      | 650/1600 [15:41<13:34,  1.17it/s]

{'loss': 0.3133, 'grad_norm': 0.44955000281333923, 'learning_rate': 5e-05, 'epoch': 0.41}


 41%|████▏     | 660/1600 [15:49<13:29,  1.16it/s]

{'loss': 0.1901, 'grad_norm': 1.5936861038208008, 'learning_rate': 5e-05, 'epoch': 0.41}


 42%|████▏     | 670/1600 [15:58<13:11,  1.17it/s]

{'loss': 0.2317, 'grad_norm': 0.1489221155643463, 'learning_rate': 5e-05, 'epoch': 0.42}


 42%|████▎     | 680/1600 [16:07<13:14,  1.16it/s]

{'loss': 0.8096, 'grad_norm': 0.244822159409523, 'learning_rate': 5e-05, 'epoch': 0.42}


 43%|████▎     | 690/1600 [16:16<13:14,  1.15it/s]

{'loss': 0.3885, 'grad_norm': 2.225020408630371, 'learning_rate': 5e-05, 'epoch': 0.43}


 44%|████▍     | 700/1600 [16:24<13:10,  1.14it/s]

{'loss': 0.0447, 'grad_norm': 1.9713393449783325, 'learning_rate': 5e-05, 'epoch': 0.44}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.19736172258853912, 'eval_Accuracy': 0.940625, 'eval_F1': 0.9405134627426424, 'eval_runtime': 56.8924, 'eval_samples_per_second': 28.123, 'eval_steps_per_second': 3.515, 'epoch': 0.44}


 44%|████▍     | 710/1600 [17:33<23:29,  1.58s/it]  

{'loss': 0.3642, 'grad_norm': 10.134503364562988, 'learning_rate': 5e-05, 'epoch': 0.44}


 45%|████▌     | 720/1600 [17:42<13:02,  1.13it/s]

{'loss': 0.34, 'grad_norm': 1.2244627475738525, 'learning_rate': 5e-05, 'epoch': 0.45}


 46%|████▌     | 730/1600 [17:50<12:27,  1.16it/s]

{'loss': 0.2461, 'grad_norm': 11.096857070922852, 'learning_rate': 5e-05, 'epoch': 0.46}


 46%|████▋     | 740/1600 [17:59<12:12,  1.17it/s]

{'loss': 0.305, 'grad_norm': 2.4325690269470215, 'learning_rate': 5e-05, 'epoch': 0.46}


 47%|████▋     | 750/1600 [18:08<12:07,  1.17it/s]

{'loss': 0.1976, 'grad_norm': 7.76582670211792, 'learning_rate': 5e-05, 'epoch': 0.47}


 48%|████▊     | 760/1600 [18:16<12:03,  1.16it/s]

{'loss': 0.2794, 'grad_norm': 20.331424713134766, 'learning_rate': 5e-05, 'epoch': 0.47}


 48%|████▊     | 770/1600 [18:25<11:55,  1.16it/s]

{'loss': 0.117, 'grad_norm': 8.068358421325684, 'learning_rate': 5e-05, 'epoch': 0.48}


 49%|████▉     | 780/1600 [18:34<11:43,  1.17it/s]

{'loss': 0.3093, 'grad_norm': 19.399232864379883, 'learning_rate': 5e-05, 'epoch': 0.49}


 49%|████▉     | 790/1600 [18:42<11:34,  1.17it/s]

{'loss': 0.118, 'grad_norm': 29.88656997680664, 'learning_rate': 5e-05, 'epoch': 0.49}


 50%|█████     | 800/1600 [18:51<11:21,  1.17it/s]

{'loss': 0.1136, 'grad_norm': 43.9355583190918, 'learning_rate': 5e-05, 'epoch': 0.5}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2410442978143692, 'eval_Accuracy': 0.926875, 'eval_F1': 0.931297709923664, 'eval_runtime': 56.6901, 'eval_samples_per_second': 28.224, 'eval_steps_per_second': 3.528, 'epoch': 0.5}


 51%|█████     | 810/1600 [19:58<20:51,  1.58s/it]  

{'loss': 0.3011, 'grad_norm': 0.23324762284755707, 'learning_rate': 5e-05, 'epoch': 0.51}


 51%|█████▏    | 820/1600 [20:07<11:30,  1.13it/s]

{'loss': 0.1979, 'grad_norm': 13.67980670928955, 'learning_rate': 5e-05, 'epoch': 0.51}


 52%|█████▏    | 830/1600 [20:16<11:00,  1.17it/s]

{'loss': 0.2582, 'grad_norm': 0.5750077962875366, 'learning_rate': 5e-05, 'epoch': 0.52}


 52%|█████▎    | 840/1600 [20:24<10:48,  1.17it/s]

{'loss': 0.1517, 'grad_norm': 2.4139516353607178, 'learning_rate': 5e-05, 'epoch': 0.53}


 53%|█████▎    | 850/1600 [20:33<10:49,  1.15it/s]

{'loss': 0.2655, 'grad_norm': 0.19784998893737793, 'learning_rate': 5e-05, 'epoch': 0.53}


 54%|█████▍    | 860/1600 [20:42<10:46,  1.14it/s]

{'loss': 0.2887, 'grad_norm': 0.013382002711296082, 'learning_rate': 5e-05, 'epoch': 0.54}


 54%|█████▍    | 870/1600 [20:50<10:31,  1.16it/s]

{'loss': 0.396, 'grad_norm': 34.112152099609375, 'learning_rate': 5e-05, 'epoch': 0.54}


 55%|█████▌    | 880/1600 [20:59<10:18,  1.17it/s]

{'loss': 0.1843, 'grad_norm': 18.22429847717285, 'learning_rate': 5e-05, 'epoch': 0.55}


 56%|█████▌    | 890/1600 [21:08<10:13,  1.16it/s]

{'loss': 0.0577, 'grad_norm': 0.10384578257799149, 'learning_rate': 5e-05, 'epoch': 0.56}


 56%|█████▋    | 900/1600 [21:16<10:07,  1.15it/s]

{'loss': 0.0651, 'grad_norm': 11.981380462646484, 'learning_rate': 5e-05, 'epoch': 0.56}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.3639777898788452, 'eval_Accuracy': 0.920625, 'eval_F1': 0.914822266934943, 'eval_runtime': 56.8429, 'eval_samples_per_second': 28.148, 'eval_steps_per_second': 3.518, 'epoch': 0.56}


 57%|█████▋    | 910/1600 [22:24<18:15,  1.59s/it]  

{'loss': 0.2608, 'grad_norm': 0.03727322816848755, 'learning_rate': 5e-05, 'epoch': 0.57}


 57%|█████▊    | 920/1600 [22:33<10:04,  1.12it/s]

{'loss': 0.4846, 'grad_norm': 5.36939001083374, 'learning_rate': 5e-05, 'epoch': 0.57}


 58%|█████▊    | 930/1600 [22:41<09:35,  1.16it/s]

{'loss': 0.3632, 'grad_norm': 7.869729042053223, 'learning_rate': 5e-05, 'epoch': 0.58}


 59%|█████▉    | 940/1600 [22:50<09:26,  1.17it/s]

{'loss': 0.2564, 'grad_norm': 22.860258102416992, 'learning_rate': 5e-05, 'epoch': 0.59}


 59%|█████▉    | 950/1600 [22:59<09:24,  1.15it/s]

{'loss': 0.161, 'grad_norm': 0.2966982126235962, 'learning_rate': 5e-05, 'epoch': 0.59}


 60%|██████    | 960/1600 [23:08<09:10,  1.16it/s]

{'loss': 0.2181, 'grad_norm': 70.02642059326172, 'learning_rate': 5e-05, 'epoch': 0.6}


 61%|██████    | 970/1600 [23:16<09:07,  1.15it/s]

{'loss': 0.2065, 'grad_norm': 0.09606526046991348, 'learning_rate': 5e-05, 'epoch': 0.61}


 61%|██████▏   | 980/1600 [23:25<08:50,  1.17it/s]

{'loss': 0.1931, 'grad_norm': 15.417179107666016, 'learning_rate': 5e-05, 'epoch': 0.61}


 62%|██████▏   | 990/1600 [23:34<08:50,  1.15it/s]

{'loss': 0.2259, 'grad_norm': 1.2951431274414062, 'learning_rate': 5e-05, 'epoch': 0.62}


 62%|██████▎   | 1000/1600 [23:43<08:41,  1.15it/s]

{'loss': 0.1617, 'grad_norm': 0.6947004795074463, 'learning_rate': 5e-05, 'epoch': 0.62}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.24056793749332428, 'eval_Accuracy': 0.93125, 'eval_F1': 0.9288486416558861, 'eval_runtime': 56.7646, 'eval_samples_per_second': 28.187, 'eval_steps_per_second': 3.523, 'epoch': 0.62}


 63%|██████▎   | 1010/1600 [24:50<15:30,  1.58s/it]  

{'loss': 0.1457, 'grad_norm': 58.66796112060547, 'learning_rate': 5e-05, 'epoch': 0.63}


 64%|██████▍   | 1020/1600 [24:59<08:33,  1.13it/s]

{'loss': 0.1219, 'grad_norm': 20.633541107177734, 'learning_rate': 5e-05, 'epoch': 0.64}


 64%|██████▍   | 1030/1600 [25:08<08:14,  1.15it/s]

{'loss': 0.4574, 'grad_norm': 16.271434783935547, 'learning_rate': 5e-05, 'epoch': 0.64}


 65%|██████▌   | 1040/1600 [25:16<08:00,  1.16it/s]

{'loss': 0.2708, 'grad_norm': 29.678911209106445, 'learning_rate': 5e-05, 'epoch': 0.65}


 66%|██████▌   | 1050/1600 [25:25<07:55,  1.16it/s]

{'loss': 0.2346, 'grad_norm': 11.274038314819336, 'learning_rate': 5e-05, 'epoch': 0.66}


 66%|██████▋   | 1060/1600 [25:34<07:47,  1.15it/s]

{'loss': 0.1979, 'grad_norm': 109.81813049316406, 'learning_rate': 5e-05, 'epoch': 0.66}


 67%|██████▋   | 1070/1600 [25:42<07:35,  1.16it/s]

{'loss': 0.0988, 'grad_norm': 30.583810806274414, 'learning_rate': 5e-05, 'epoch': 0.67}


 68%|██████▊   | 1080/1600 [25:51<07:33,  1.15it/s]

{'loss': 0.2557, 'grad_norm': 29.60158920288086, 'learning_rate': 5e-05, 'epoch': 0.68}


 68%|██████▊   | 1090/1600 [26:00<07:20,  1.16it/s]

{'loss': 0.3964, 'grad_norm': 12.781795501708984, 'learning_rate': 5e-05, 'epoch': 0.68}


 69%|██████▉   | 1100/1600 [26:09<07:14,  1.15it/s]

{'loss': 0.3736, 'grad_norm': 25.141984939575195, 'learning_rate': 5e-05, 'epoch': 0.69}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.25139376521110535, 'eval_Accuracy': 0.919375, 'eval_F1': 0.9248689574839837, 'eval_runtime': 57.0649, 'eval_samples_per_second': 28.038, 'eval_steps_per_second': 3.505, 'epoch': 0.69}


 69%|██████▉   | 1110/1600 [27:17<12:57,  1.59s/it]  

{'loss': 0.242, 'grad_norm': 10.25417709350586, 'learning_rate': 5e-05, 'epoch': 0.69}


 70%|███████   | 1120/1600 [27:26<07:08,  1.12it/s]

{'loss': 0.1707, 'grad_norm': 14.584976196289062, 'learning_rate': 5e-05, 'epoch': 0.7}


 71%|███████   | 1130/1600 [27:34<06:45,  1.16it/s]

{'loss': 0.0556, 'grad_norm': 1.891177773475647, 'learning_rate': 5e-05, 'epoch': 0.71}


 71%|███████▏  | 1140/1600 [27:43<06:38,  1.15it/s]

{'loss': 0.1503, 'grad_norm': 23.369726181030273, 'learning_rate': 5e-05, 'epoch': 0.71}


 72%|███████▏  | 1150/1600 [27:52<06:35,  1.14it/s]

{'loss': 0.0762, 'grad_norm': 47.01958084106445, 'learning_rate': 5e-05, 'epoch': 0.72}


 72%|███████▎  | 1160/1600 [28:01<06:28,  1.13it/s]

{'loss': 0.4956, 'grad_norm': 0.2383388876914978, 'learning_rate': 5e-05, 'epoch': 0.72}


 73%|███████▎  | 1170/1600 [28:09<06:13,  1.15it/s]

{'loss': 0.2577, 'grad_norm': 3.615140914916992, 'learning_rate': 5e-05, 'epoch': 0.73}


 74%|███████▍  | 1180/1600 [28:18<06:02,  1.16it/s]

{'loss': 0.1568, 'grad_norm': 0.8251069784164429, 'learning_rate': 5e-05, 'epoch': 0.74}


 74%|███████▍  | 1190/1600 [28:27<05:58,  1.14it/s]

{'loss': 0.2622, 'grad_norm': 35.440673828125, 'learning_rate': 5e-05, 'epoch': 0.74}


 75%|███████▌  | 1200/1600 [28:36<05:47,  1.15it/s]

{'loss': 0.2506, 'grad_norm': 0.13330583274364471, 'learning_rate': 5e-05, 'epoch': 0.75}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.19124488532543182, 'eval_Accuracy': 0.94625, 'eval_F1': 0.9445161290322581, 'eval_runtime': 57.0269, 'eval_samples_per_second': 28.057, 'eval_steps_per_second': 3.507, 'epoch': 0.75}


 76%|███████▌  | 1210/1600 [29:45<10:43,  1.65s/it]  

{'loss': 0.1415, 'grad_norm': 0.8880603909492493, 'learning_rate': 5e-05, 'epoch': 0.76}


 76%|███████▋  | 1220/1600 [29:53<05:42,  1.11it/s]

{'loss': 0.2108, 'grad_norm': 31.289100646972656, 'learning_rate': 5e-05, 'epoch': 0.76}


 77%|███████▋  | 1230/1600 [30:02<05:41,  1.08it/s]

{'loss': 0.2025, 'grad_norm': 0.07716178894042969, 'learning_rate': 5e-05, 'epoch': 0.77}


 78%|███████▊  | 1240/1600 [30:11<05:11,  1.15it/s]

{'loss': 0.0634, 'grad_norm': 0.08000524342060089, 'learning_rate': 5e-05, 'epoch': 0.78}


 78%|███████▊  | 1250/1600 [30:20<05:02,  1.16it/s]

{'loss': 0.0861, 'grad_norm': 0.12230020761489868, 'learning_rate': 5e-05, 'epoch': 0.78}


 79%|███████▉  | 1260/1600 [30:29<04:54,  1.15it/s]

{'loss': 0.0577, 'grad_norm': 0.0042463126592338085, 'learning_rate': 5e-05, 'epoch': 0.79}


 79%|███████▉  | 1270/1600 [30:37<04:45,  1.15it/s]

{'loss': 0.0854, 'grad_norm': 0.08634265512228012, 'learning_rate': 5e-05, 'epoch': 0.79}


 80%|████████  | 1280/1600 [30:46<04:37,  1.15it/s]

{'loss': 0.2324, 'grad_norm': 0.03253873437643051, 'learning_rate': 5e-05, 'epoch': 0.8}


 81%|████████  | 1290/1600 [30:55<04:28,  1.15it/s]

{'loss': 0.3982, 'grad_norm': 0.1627541035413742, 'learning_rate': 5e-05, 'epoch': 0.81}


 81%|████████▏ | 1300/1600 [31:03<04:20,  1.15it/s]

{'loss': 0.2782, 'grad_norm': 38.556396484375, 'learning_rate': 5e-05, 'epoch': 0.81}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.21529893577098846, 'eval_Accuracy': 0.944375, 'eval_F1': 0.9423948220064725, 'eval_runtime': 56.6756, 'eval_samples_per_second': 28.231, 'eval_steps_per_second': 3.529, 'epoch': 0.81}


 82%|████████▏ | 1310/1600 [32:12<07:38,  1.58s/it]  

{'loss': 0.1456, 'grad_norm': 12.042495727539062, 'learning_rate': 5e-05, 'epoch': 0.82}


 82%|████████▎ | 1320/1600 [32:20<04:07,  1.13it/s]

{'loss': 0.154, 'grad_norm': 0.2713335156440735, 'learning_rate': 5e-05, 'epoch': 0.82}


 83%|████████▎ | 1330/1600 [32:29<03:51,  1.16it/s]

{'loss': 0.0476, 'grad_norm': 0.3507419526576996, 'learning_rate': 5e-05, 'epoch': 0.83}


 84%|████████▍ | 1340/1600 [32:38<03:43,  1.16it/s]

{'loss': 0.0905, 'grad_norm': 0.009692618623375893, 'learning_rate': 5e-05, 'epoch': 0.84}


 84%|████████▍ | 1350/1600 [32:46<03:34,  1.16it/s]

{'loss': 0.1284, 'grad_norm': 0.011318167671561241, 'learning_rate': 5e-05, 'epoch': 0.84}


 85%|████████▌ | 1360/1600 [32:55<03:28,  1.15it/s]

{'loss': 0.1923, 'grad_norm': 19.012292861938477, 'learning_rate': 5e-05, 'epoch': 0.85}


 86%|████████▌ | 1370/1600 [33:04<03:16,  1.17it/s]

{'loss': 0.2528, 'grad_norm': 0.6896706223487854, 'learning_rate': 5e-05, 'epoch': 0.86}


 86%|████████▋ | 1380/1600 [33:12<03:09,  1.16it/s]

{'loss': 0.2511, 'grad_norm': 0.18757060170173645, 'learning_rate': 5e-05, 'epoch': 0.86}


 87%|████████▋ | 1390/1600 [33:21<03:02,  1.15it/s]

{'loss': 0.142, 'grad_norm': 33.47026062011719, 'learning_rate': 5e-05, 'epoch': 0.87}


 88%|████████▊ | 1400/1600 [33:30<02:53,  1.15it/s]

{'loss': 0.0725, 'grad_norm': 0.5674712061882019, 'learning_rate': 5e-05, 'epoch': 0.88}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.19486339390277863, 'eval_Accuracy': 0.944375, 'eval_F1': 0.9464178205900061, 'eval_runtime': 56.7088, 'eval_samples_per_second': 28.214, 'eval_steps_per_second': 3.527, 'epoch': 0.88}


 88%|████████▊ | 1410/1600 [34:38<04:58,  1.57s/it]  

{'loss': 0.138, 'grad_norm': 20.193925857543945, 'learning_rate': 5e-05, 'epoch': 0.88}


 89%|████████▉ | 1420/1600 [34:47<02:37,  1.15it/s]

{'loss': 0.717, 'grad_norm': 0.27296218276023865, 'learning_rate': 5e-05, 'epoch': 0.89}


 89%|████████▉ | 1430/1600 [34:55<02:26,  1.16it/s]

{'loss': 0.2531, 'grad_norm': 13.086801528930664, 'learning_rate': 5e-05, 'epoch': 0.89}


 90%|█████████ | 1440/1600 [35:04<02:17,  1.16it/s]

{'loss': 0.1955, 'grad_norm': 20.96681785583496, 'learning_rate': 5e-05, 'epoch': 0.9}


 91%|█████████ | 1450/1600 [35:13<02:09,  1.16it/s]

{'loss': 0.2141, 'grad_norm': 0.5169796347618103, 'learning_rate': 5e-05, 'epoch': 0.91}


 91%|█████████▏| 1460/1600 [35:21<01:58,  1.18it/s]

{'loss': 0.1321, 'grad_norm': 13.893712997436523, 'learning_rate': 5e-05, 'epoch': 0.91}


 92%|█████████▏| 1470/1600 [35:30<01:51,  1.16it/s]

{'loss': 0.2607, 'grad_norm': 0.012937555089592934, 'learning_rate': 5e-05, 'epoch': 0.92}


 92%|█████████▎| 1480/1600 [35:39<01:43,  1.16it/s]

{'loss': 0.0607, 'grad_norm': 0.27531370520591736, 'learning_rate': 5e-05, 'epoch': 0.93}


 93%|█████████▎| 1490/1600 [35:47<01:33,  1.17it/s]

{'loss': 0.1853, 'grad_norm': 24.332372665405273, 'learning_rate': 5e-05, 'epoch': 0.93}


 94%|█████████▍| 1500/1600 [35:56<01:25,  1.17it/s]

{'loss': 0.113, 'grad_norm': 1.7014496326446533, 'learning_rate': 5e-05, 'epoch': 0.94}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2619403898715973, 'eval_Accuracy': 0.935625, 'eval_F1': 0.9321922317314021, 'eval_runtime': 57.2784, 'eval_samples_per_second': 27.934, 'eval_steps_per_second': 3.492, 'epoch': 0.94}


 94%|█████████▍| 1510/1600 [37:03<02:19,  1.55s/it]

{'loss': 0.118, 'grad_norm': 7.036049842834473, 'learning_rate': 5e-05, 'epoch': 0.94}


 95%|█████████▌| 1520/1600 [37:12<01:08,  1.17it/s]

{'loss': 0.1353, 'grad_norm': 52.61398696899414, 'learning_rate': 5e-05, 'epoch': 0.95}


 96%|█████████▌| 1530/1600 [37:20<00:58,  1.19it/s]

{'loss': 0.0888, 'grad_norm': 8.121234893798828, 'learning_rate': 5e-05, 'epoch': 0.96}


 96%|█████████▋| 1540/1600 [37:29<00:50,  1.20it/s]

{'loss': 0.2337, 'grad_norm': 0.16364197432994843, 'learning_rate': 5e-05, 'epoch': 0.96}


 97%|█████████▋| 1550/1600 [37:37<00:42,  1.19it/s]

{'loss': 0.2758, 'grad_norm': 0.42667439579963684, 'learning_rate': 5e-05, 'epoch': 0.97}


 98%|█████████▊| 1560/1600 [37:46<00:32,  1.21it/s]

{'loss': 0.4276, 'grad_norm': 10.075235366821289, 'learning_rate': 5e-05, 'epoch': 0.97}


 98%|█████████▊| 1570/1600 [37:54<00:25,  1.20it/s]

{'loss': 0.3602, 'grad_norm': 57.7889518737793, 'learning_rate': 5e-05, 'epoch': 0.98}


 99%|█████████▉| 1580/1600 [38:02<00:16,  1.20it/s]

{'loss': 0.1065, 'grad_norm': 21.01119041442871, 'learning_rate': 5e-05, 'epoch': 0.99}


 99%|█████████▉| 1590/1600 [38:11<00:08,  1.21it/s]

{'loss': 0.1363, 'grad_norm': 26.984020233154297, 'learning_rate': 5e-05, 'epoch': 0.99}


100%|██████████| 1600/1600 [38:19<00:00,  1.25it/s]

{'loss': 0.1133, 'grad_norm': 24.224008560180664, 'learning_rate': 5e-05, 'epoch': 1.0}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.22246776521205902, 'eval_Accuracy': 0.92875, 'eval_F1': 0.9327830188679246, 'eval_runtime': 55.7393, 'eval_samples_per_second': 28.705, 'eval_steps_per_second': 3.588, 'epoch': 1.0}


100%|██████████| 1600/1600 [39:17<00:00,  1.47s/it]
Non-default generation parameters: {'max_length': 128}


{'train_runtime': 2357.4368, 'train_samples_per_second': 5.43, 'train_steps_per_second': 0.679, 'train_loss': 0.2582701690075919, 'epoch': 1.0}
***** train metrics *****
  epoch                    =         1.0
  total_flos               = 933951802GF
  train_loss               =      0.2583
  train_runtime            =  0:39:17.43
  train_samples_per_second =        5.43
  train_steps_per_second   =       0.679


100%|██████████| 200/200 [00:55<00:00,  3.61it/s]

***** eval metrics *****
  epoch                   =        1.0
  eval_Accuracy           =     0.9463
  eval_F1                 =     0.9445
  eval_loss               =     0.1912
  eval_runtime            = 0:00:55.71
  eval_samples_per_second =     28.716
  eval_steps_per_second   =      3.589



