### Imports

In [2]:
# https://huggingface.co/Organika/sdxl-detector#validation-metrics
# The code below fine-tunes the model on one dataset per run (faces, art or mixed) and evaluates it on that dataset's validation set
# https://huggingface.co/blog/fine-tune-vit

import inspect
import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoImageProcessor, SwinForImageClassification, TrainingArguments, Trainer

# Remember the directory the notebook started in, so that re-running this cell
# does not climb one directory higher on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
print(NOTEBOOK_DIR)

# Work from the parent of the notebook folder; paths used later are relative.
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
print(os.getcwd())

# CUDA check: use the GPU when one is available, otherwise fall back to the CPU.
# (To force CPU for debugging, use torch.device("cpu") instead.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

  from .autonotebook import tqdm as notebook_tqdm


e:\Projects & Temp\GitHub\thesis\RQ2
e:\Projects & Temp\GitHub\thesis
cuda


### Load Datasets and Processor

In [3]:
# Load the image processor that belongs to the pre-trained detector
processor = AutoImageProcessor.from_pretrained("Organika/sdxl-detector")

# Dataset folders (relative to the current working directory)
art_dataset_path = 'archive/datasets/art_512x512'
faces_dataset_path = 'archive/datasets/faces_512x512'
mixed_dataset_path = 'archive/datasets/mixed_512x512'

# Load every folder with the "imagefolder" builder, in the order art, faces, mixed
art_ds, faces_ds, mixed_ds = (
    load_dataset("imagefolder", data_dir=folder)
    for folder in (art_dataset_path, faces_dataset_path, mixed_dataset_path)
)

print(faces_ds)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 12800
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
})


### Setting Up

In [None]:
# Transform images to model input
def transform(image_batch):
    """Convert a batch of dataset rows into model inputs.

    Args:
        image_batch: dict holding an 'image' list and a 'label' list for the
            rows being accessed (the shape `with_transform` hands over).

    Returns:
        The processor output (including 'pixel_values') with an added
        'labels' tensor.

    Tensors are deliberately left on the CPU: the Trainer moves each batch to
    its own device, and keeping the data pipeline on the CPU avoids problems
    with pinned memory and DataLoader worker processes.
    """
    inputs = processor(images=image_batch['image'], return_tensors="pt")
    inputs['labels'] = torch.tensor(image_batch['label'])  # Ensure labels are tensors
    return inputs

# Attach the lazy transform to each dataset
art_ds_transformed, faces_ds_transformed, mixed_ds_transformed = (
    dataset.with_transform(transform) for dataset in (art_ds, faces_ds, mixed_ds)
)

# Show the container type before and after attaching the transform
print(type(art_ds), type(art_ds_transformed), sep="\n")

<class 'datasets.dataset_dict.DatasetDict'>
<class 'datasets.dataset_dict.DatasetDict'>


In [4]:
# Turn dicts into tensors
def collate_fn(batch):
    """Merge per-example dicts into a single batch dict.

    Args:
        batch: list of examples, each with a 'pixel_values' tensor and a
            'labels' value.

    Returns:
        dict with 'pixel_values' stacked along a new first dimension and
        'labels' gathered into a 1-D tensor.

    No device transfer happens here; the Trainer moves the collated batch to
    its device, so the collator does not depend on a global `device`.
    """
    return {
        'pixel_values': torch.stack([example['pixel_values'] for example in batch]),
        'labels': torch.tensor([example['labels'] for example in batch]),
    }

In [5]:
# Define metrics
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

def compute_metrics(p):
    """Return accuracy and F1 for one evaluation pass.

    The predicted class is the argmax of `p.predictions` along axis 1; it is
    compared against `p.label_ids` by both metrics.
    """
    predicted = np.argmax(p.predictions, axis=1)
    accuracy = acc_metric.compute(predictions=predicted, references=p.label_ids)
    f1_score = f1_metric.compute(predictions=predicted, references=p.label_ids)
    scores = {}
    scores["Accuracy"] = accuracy["accuracy"]
    scores["F1"] = f1_score["f1"]
    return scores

### Parameters

In [6]:
# Each dataset is paired with its own output directory so the two cannot get
# out of sync (e.g. training on art while writing into the faces directory).
DATASET_CHOICES = {
    "faces": (faces_ds_transformed, "./sdxl-fine-tune"),
    "art": (art_ds_transformed, "./sdxl-fine-tune-art"),
    "mixed": (mixed_ds_transformed, "./sdxl-fine-tune-mixed"),
}
chosen_dataset = "faces"  # one of: "faces", "art", "mixed"
dataset_to_use, output_dir_name = DATASET_CHOICES[chosen_dataset]

# Candidate learning rates; chosen_lr is the index of the one used for this run
lr_values = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3]
chosen_lr = 1

### Load Model

In [7]:
# Extract the labels
labels = dataset_to_use['train'].features['label'].names
print(labels[0:2])

# Load the pre-trained model.
# Class ids are plain integers in both mappings: id2label maps id -> name and
# label2id maps name -> id, so a lookup in label2id yields a usable class index.
model = SwinForImageClassification.from_pretrained(
    "Organika/sdxl-detector",
    num_labels=len(labels),
    id2label={i: c for i, c in enumerate(labels)},
    label2id={c: i for i, c in enumerate(labels)},
).to(device)

['0', '1']


In [8]:
# TrainingArguments: only the learning rate is varied between runs, everything else stays fixed
training_args = TrainingArguments(
    # Output and reporting
    output_dir=output_dir_name,
    report_to="tensorboard",
    push_to_hub=False,
    # Optimisation
    learning_rate=lr_values[chosen_lr],
    lr_scheduler_type="constant",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    fp16=False,  # True leads to runtime errors on CUDA
    # Logging, evaluation and checkpoint cadence
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Data loading
    remove_unused_columns=False,
    dataloader_pin_memory=False,  # otherwise it doesn't work with CUDA
)




In [9]:
# Newer transformers releases accept the image processor as `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_processor_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=dataset_to_use["train"],
    eval_dataset=dataset_to_use["validation"],
    **{_processor_kwarg: processor},
)

  trainer = Trainer(


### Training and Evaluation

In [10]:
def _log_and_save(split, split_metrics):
    """Log the metrics for `split` through the trainer, then save them."""
    trainer.log_metrics(split, split_metrics)
    trainer.save_metrics(split, split_metrics)


# Fine-tune, then persist the model, the training metrics and the trainer state
train_results = trainer.train()
trainer.save_model()
_log_and_save("train", train_results.metrics)
trainer.save_state()

# Score the resulting model on the validation split
metrics = trainer.evaluate(dataset_to_use['validation'])
_log_and_save("eval", metrics)


  1%|          | 10/1600 [00:04<08:43,  3.03it/s]

{'loss': 1.3357, 'grad_norm': 8.868562698364258, 'learning_rate': 5e-05, 'epoch': 0.01}


  1%|▏         | 20/1600 [00:07<08:09,  3.23it/s]

{'loss': 0.6397, 'grad_norm': 14.044528007507324, 'learning_rate': 5e-05, 'epoch': 0.01}


  2%|▏         | 30/1600 [00:10<08:02,  3.26it/s]

{'loss': 0.5949, 'grad_norm': 14.267044067382812, 'learning_rate': 5e-05, 'epoch': 0.02}


  2%|▎         | 40/1600 [00:13<07:54,  3.29it/s]

{'loss': 0.6712, 'grad_norm': 35.152400970458984, 'learning_rate': 5e-05, 'epoch': 0.03}


  3%|▎         | 50/1600 [00:16<07:51,  3.29it/s]

{'loss': 0.6625, 'grad_norm': 42.32004928588867, 'learning_rate': 5e-05, 'epoch': 0.03}


  4%|▍         | 60/1600 [00:20<07:54,  3.24it/s]

{'loss': 0.5721, 'grad_norm': 20.754199981689453, 'learning_rate': 5e-05, 'epoch': 0.04}


  4%|▍         | 70/1600 [00:23<07:47,  3.27it/s]

{'loss': 0.6746, 'grad_norm': 18.810182571411133, 'learning_rate': 5e-05, 'epoch': 0.04}


  5%|▌         | 80/1600 [00:26<07:43,  3.28it/s]

{'loss': 0.4948, 'grad_norm': 36.37137222290039, 'learning_rate': 5e-05, 'epoch': 0.05}


  6%|▌         | 90/1600 [00:29<07:42,  3.26it/s]

{'loss': 0.632, 'grad_norm': 8.206205368041992, 'learning_rate': 5e-05, 'epoch': 0.06}


  6%|▋         | 100/1600 [00:32<07:43,  3.24it/s]

{'loss': 0.4583, 'grad_norm': 13.83261489868164, 'learning_rate': 5e-05, 'epoch': 0.06}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.44313326478004456, 'eval_Accuracy': 0.79, 'eval_F1': 0.7818181818181819, 'eval_runtime': 26.1008, 'eval_samples_per_second': 61.301, 'eval_steps_per_second': 7.663, 'epoch': 0.06}


  7%|▋         | 110/1600 [01:02<16:05,  1.54it/s]  

{'loss': 0.4336, 'grad_norm': 32.423095703125, 'learning_rate': 5e-05, 'epoch': 0.07}


  8%|▊         | 120/1600 [01:05<07:50,  3.15it/s]

{'loss': 0.7419, 'grad_norm': 17.383861541748047, 'learning_rate': 5e-05, 'epoch': 0.07}


  8%|▊         | 130/1600 [01:09<07:35,  3.22it/s]

{'loss': 0.4986, 'grad_norm': 6.0257744789123535, 'learning_rate': 5e-05, 'epoch': 0.08}


  9%|▉         | 140/1600 [01:12<07:34,  3.21it/s]

{'loss': 0.4894, 'grad_norm': 8.932435989379883, 'learning_rate': 5e-05, 'epoch': 0.09}


  9%|▉         | 150/1600 [01:15<07:29,  3.22it/s]

{'loss': 0.5563, 'grad_norm': 12.131482124328613, 'learning_rate': 5e-05, 'epoch': 0.09}


 10%|█         | 160/1600 [01:18<07:24,  3.24it/s]

{'loss': 0.3478, 'grad_norm': 23.73040771484375, 'learning_rate': 5e-05, 'epoch': 0.1}


 11%|█         | 170/1600 [01:21<07:25,  3.21it/s]

{'loss': 0.3844, 'grad_norm': 17.140888214111328, 'learning_rate': 5e-05, 'epoch': 0.11}


 11%|█▏        | 180/1600 [01:24<07:24,  3.20it/s]

{'loss': 0.2342, 'grad_norm': 7.581023693084717, 'learning_rate': 5e-05, 'epoch': 0.11}


 12%|█▏        | 190/1600 [01:27<07:16,  3.23it/s]

{'loss': 0.4322, 'grad_norm': 58.44593811035156, 'learning_rate': 5e-05, 'epoch': 0.12}


 12%|█▎        | 200/1600 [01:30<07:12,  3.23it/s]

{'loss': 0.4491, 'grad_norm': 26.44786834716797, 'learning_rate': 5e-05, 'epoch': 0.12}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.7963348627090454, 'eval_Accuracy': 0.699375, 'eval_F1': 0.766164316966456, 'eval_runtime': 24.6416, 'eval_samples_per_second': 64.931, 'eval_steps_per_second': 8.116, 'epoch': 0.12}


 13%|█▎        | 210/1600 [02:00<14:44,  1.57it/s]  

{'loss': 0.53, 'grad_norm': 24.838228225708008, 'learning_rate': 5e-05, 'epoch': 0.13}


 14%|█▍        | 220/1600 [02:03<07:19,  3.14it/s]

{'loss': 0.6188, 'grad_norm': 10.573554992675781, 'learning_rate': 5e-05, 'epoch': 0.14}


 14%|█▍        | 230/1600 [02:06<07:06,  3.21it/s]

{'loss': 0.59, 'grad_norm': 38.162132263183594, 'learning_rate': 5e-05, 'epoch': 0.14}


 15%|█▌        | 240/1600 [02:09<07:03,  3.21it/s]

{'loss': 0.3172, 'grad_norm': 5.408655643463135, 'learning_rate': 5e-05, 'epoch': 0.15}


 16%|█▌        | 250/1600 [02:12<07:00,  3.21it/s]

{'loss': 0.5469, 'grad_norm': 42.102500915527344, 'learning_rate': 5e-05, 'epoch': 0.16}


 16%|█▋        | 260/1600 [02:16<06:56,  3.22it/s]

{'loss': 0.3651, 'grad_norm': 21.87632179260254, 'learning_rate': 5e-05, 'epoch': 0.16}


 17%|█▋        | 270/1600 [02:19<06:51,  3.23it/s]

{'loss': 0.2792, 'grad_norm': 21.421039581298828, 'learning_rate': 5e-05, 'epoch': 0.17}


 18%|█▊        | 280/1600 [02:22<06:51,  3.21it/s]

{'loss': 0.5261, 'grad_norm': 89.96378326416016, 'learning_rate': 5e-05, 'epoch': 0.17}


 18%|█▊        | 290/1600 [02:25<06:47,  3.21it/s]

{'loss': 0.3267, 'grad_norm': 22.950706481933594, 'learning_rate': 5e-05, 'epoch': 0.18}


 19%|█▉        | 300/1600 [02:28<06:49,  3.18it/s]

{'loss': 0.3294, 'grad_norm': 13.327759742736816, 'learning_rate': 5e-05, 'epoch': 0.19}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.28120100498199463, 'eval_Accuracy': 0.884375, 'eval_F1': 0.8775645268034414, 'eval_runtime': 24.2129, 'eval_samples_per_second': 66.08, 'eval_steps_per_second': 8.26, 'epoch': 0.19}


 19%|█▉        | 310/1600 [02:57<13:25,  1.60it/s]  

{'loss': 0.1854, 'grad_norm': 8.54852294921875, 'learning_rate': 5e-05, 'epoch': 0.19}


 20%|██        | 320/1600 [03:00<06:52,  3.10it/s]

{'loss': 0.4046, 'grad_norm': 40.43581008911133, 'learning_rate': 5e-05, 'epoch': 0.2}


 21%|██        | 330/1600 [03:03<06:40,  3.17it/s]

{'loss': 0.358, 'grad_norm': 34.539310455322266, 'learning_rate': 5e-05, 'epoch': 0.21}


 21%|██▏       | 340/1600 [03:07<06:52,  3.06it/s]

{'loss': 0.3951, 'grad_norm': 18.849023818969727, 'learning_rate': 5e-05, 'epoch': 0.21}


 22%|██▏       | 350/1600 [03:10<06:31,  3.19it/s]

{'loss': 0.5211, 'grad_norm': 7.7060933113098145, 'learning_rate': 5e-05, 'epoch': 0.22}


 22%|██▎       | 360/1600 [03:13<06:26,  3.21it/s]

{'loss': 0.3049, 'grad_norm': 13.202537536621094, 'learning_rate': 5e-05, 'epoch': 0.23}


 23%|██▎       | 370/1600 [03:16<06:23,  3.20it/s]

{'loss': 0.2967, 'grad_norm': 5.804488182067871, 'learning_rate': 5e-05, 'epoch': 0.23}


 24%|██▍       | 380/1600 [03:19<06:21,  3.20it/s]

{'loss': 0.2422, 'grad_norm': 8.500662803649902, 'learning_rate': 5e-05, 'epoch': 0.24}


 24%|██▍       | 390/1600 [03:22<06:30,  3.10it/s]

{'loss': 0.5486, 'grad_norm': 11.140424728393555, 'learning_rate': 5e-05, 'epoch': 0.24}


 25%|██▌       | 400/1600 [03:26<06:21,  3.14it/s]

{'loss': 0.6048, 'grad_norm': 20.86114501953125, 'learning_rate': 5e-05, 'epoch': 0.25}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.15365348756313324, 'eval_Accuracy': 0.93875, 'eval_F1': 0.9394313967861557, 'eval_runtime': 24.5053, 'eval_samples_per_second': 65.292, 'eval_steps_per_second': 8.161, 'epoch': 0.25}


 26%|██▌       | 410/1600 [03:55<12:43,  1.56it/s]  

{'loss': 0.339, 'grad_norm': 18.82723617553711, 'learning_rate': 5e-05, 'epoch': 0.26}


 26%|██▋       | 420/1600 [03:58<06:22,  3.09it/s]

{'loss': 0.332, 'grad_norm': 12.293581008911133, 'learning_rate': 5e-05, 'epoch': 0.26}


 27%|██▋       | 430/1600 [04:01<06:05,  3.20it/s]

{'loss': 0.2304, 'grad_norm': 4.770654678344727, 'learning_rate': 5e-05, 'epoch': 0.27}


 28%|██▊       | 440/1600 [04:04<06:16,  3.08it/s]

{'loss': 0.1472, 'grad_norm': 19.99747085571289, 'learning_rate': 5e-05, 'epoch': 0.28}


 28%|██▊       | 450/1600 [04:08<06:04,  3.16it/s]

{'loss': 0.1531, 'grad_norm': 3.0601603984832764, 'learning_rate': 5e-05, 'epoch': 0.28}


 29%|██▉       | 460/1600 [04:11<06:04,  3.12it/s]

{'loss': 0.1941, 'grad_norm': 17.108957290649414, 'learning_rate': 5e-05, 'epoch': 0.29}


 29%|██▉       | 470/1600 [04:14<05:57,  3.16it/s]

{'loss': 0.5803, 'grad_norm': 97.5943832397461, 'learning_rate': 5e-05, 'epoch': 0.29}


 30%|███       | 480/1600 [04:17<05:56,  3.15it/s]

{'loss': 0.2652, 'grad_norm': 42.29377365112305, 'learning_rate': 5e-05, 'epoch': 0.3}


 31%|███       | 490/1600 [04:21<06:00,  3.08it/s]

{'loss': 0.2031, 'grad_norm': 2.96684193611145, 'learning_rate': 5e-05, 'epoch': 0.31}


 31%|███▏      | 500/1600 [04:24<05:59,  3.06it/s]

{'loss': 0.1888, 'grad_norm': 3.0432233810424805, 'learning_rate': 5e-05, 'epoch': 0.31}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.3906843960285187, 'eval_Accuracy': 0.901875, 'eval_F1': 0.8917987594762233, 'eval_runtime': 25.177, 'eval_samples_per_second': 63.55, 'eval_steps_per_second': 7.944, 'epoch': 0.31}


 32%|███▏      | 510/1600 [04:53<11:39,  1.56it/s]  

{'loss': 0.7633, 'grad_norm': 38.148765563964844, 'learning_rate': 5e-05, 'epoch': 0.32}


 32%|███▎      | 520/1600 [04:57<06:02,  2.98it/s]

{'loss': 0.2794, 'grad_norm': 113.061279296875, 'learning_rate': 5e-05, 'epoch': 0.33}


 33%|███▎      | 530/1600 [05:00<05:47,  3.08it/s]

{'loss': 0.3181, 'grad_norm': 0.15382543206214905, 'learning_rate': 5e-05, 'epoch': 0.33}


 34%|███▍      | 540/1600 [05:03<05:44,  3.08it/s]

{'loss': 0.2658, 'grad_norm': 31.80568504333496, 'learning_rate': 5e-05, 'epoch': 0.34}


 34%|███▍      | 550/1600 [05:06<05:47,  3.03it/s]

{'loss': 0.5735, 'grad_norm': 135.075439453125, 'learning_rate': 5e-05, 'epoch': 0.34}


 35%|███▌      | 560/1600 [05:10<05:42,  3.03it/s]

{'loss': 0.0965, 'grad_norm': 0.26201772689819336, 'learning_rate': 5e-05, 'epoch': 0.35}


 36%|███▌      | 570/1600 [05:13<05:33,  3.09it/s]

{'loss': 0.2474, 'grad_norm': 57.6207389831543, 'learning_rate': 5e-05, 'epoch': 0.36}


 36%|███▋      | 580/1600 [05:16<05:26,  3.13it/s]

{'loss': 0.4027, 'grad_norm': 1.2770665884017944, 'learning_rate': 5e-05, 'epoch': 0.36}


 37%|███▋      | 590/1600 [05:19<05:26,  3.10it/s]

{'loss': 0.0897, 'grad_norm': 1.8269219398498535, 'learning_rate': 5e-05, 'epoch': 0.37}


 38%|███▊      | 600/1600 [05:23<05:23,  3.09it/s]

{'loss': 0.2253, 'grad_norm': 0.43051308393478394, 'learning_rate': 5e-05, 'epoch': 0.38}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.23094625771045685, 'eval_Accuracy': 0.94, 'eval_F1': 0.9365918097754293, 'eval_runtime': 25.0341, 'eval_samples_per_second': 63.913, 'eval_steps_per_second': 7.989, 'epoch': 0.38}


 38%|███▊      | 610/1600 [05:52<10:40,  1.55it/s]  

{'loss': 0.2211, 'grad_norm': 24.419363021850586, 'learning_rate': 5e-05, 'epoch': 0.38}


 39%|███▉      | 620/1600 [05:56<05:20,  3.06it/s]

{'loss': 0.2478, 'grad_norm': 93.34734344482422, 'learning_rate': 5e-05, 'epoch': 0.39}


 39%|███▉      | 630/1600 [05:59<05:19,  3.04it/s]

{'loss': 0.4516, 'grad_norm': 93.25667572021484, 'learning_rate': 5e-05, 'epoch': 0.39}


 40%|████      | 640/1600 [06:02<05:19,  3.00it/s]

{'loss': 0.2666, 'grad_norm': 34.10965347290039, 'learning_rate': 5e-05, 'epoch': 0.4}


 41%|████      | 650/1600 [06:05<05:06,  3.10it/s]

{'loss': 0.2464, 'grad_norm': 2.5443363189697266, 'learning_rate': 5e-05, 'epoch': 0.41}


 41%|████▏     | 660/1600 [06:09<05:03,  3.09it/s]

{'loss': 0.3445, 'grad_norm': 52.917091369628906, 'learning_rate': 5e-05, 'epoch': 0.41}


 42%|████▏     | 670/1600 [06:12<04:59,  3.11it/s]

{'loss': 0.3237, 'grad_norm': 6.227956295013428, 'learning_rate': 5e-05, 'epoch': 0.42}


 42%|████▎     | 680/1600 [06:15<05:02,  3.04it/s]

{'loss': 0.2601, 'grad_norm': 0.03697267547249794, 'learning_rate': 5e-05, 'epoch': 0.42}


 43%|████▎     | 690/1600 [06:18<04:53,  3.10it/s]

{'loss': 0.235, 'grad_norm': 29.725749969482422, 'learning_rate': 5e-05, 'epoch': 0.43}


 44%|████▍     | 700/1600 [06:22<05:21,  2.80it/s]

{'loss': 0.0156, 'grad_norm': 1.2789503335952759, 'learning_rate': 5e-05, 'epoch': 0.44}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.31161758303642273, 'eval_Accuracy': 0.924375, 'eval_F1': 0.9289489136817382, 'eval_runtime': 25.0877, 'eval_samples_per_second': 63.776, 'eval_steps_per_second': 7.972, 'epoch': 0.44}


 44%|████▍     | 710/1600 [06:51<09:26,  1.57it/s]  

{'loss': 0.4384, 'grad_norm': 23.45962142944336, 'learning_rate': 5e-05, 'epoch': 0.44}


 45%|████▌     | 720/1600 [06:55<04:52,  3.01it/s]

{'loss': 0.7279, 'grad_norm': 44.53520965576172, 'learning_rate': 5e-05, 'epoch': 0.45}


 46%|████▌     | 730/1600 [06:58<04:44,  3.06it/s]

{'loss': 0.1485, 'grad_norm': 9.199864387512207, 'learning_rate': 5e-05, 'epoch': 0.46}


 46%|████▋     | 740/1600 [07:01<04:40,  3.07it/s]

{'loss': 0.2812, 'grad_norm': 0.49843084812164307, 'learning_rate': 5e-05, 'epoch': 0.46}


 47%|████▋     | 750/1600 [07:05<04:37,  3.06it/s]

{'loss': 0.1188, 'grad_norm': 6.30863094329834, 'learning_rate': 5e-05, 'epoch': 0.47}


 48%|████▊     | 760/1600 [07:08<04:30,  3.10it/s]

{'loss': 0.0424, 'grad_norm': 61.7647819519043, 'learning_rate': 5e-05, 'epoch': 0.47}


 48%|████▊     | 770/1600 [07:11<04:42,  2.93it/s]

{'loss': 0.0524, 'grad_norm': 1.5270429849624634, 'learning_rate': 5e-05, 'epoch': 0.48}


 49%|████▉     | 780/1600 [07:15<04:30,  3.03it/s]

{'loss': 0.1214, 'grad_norm': 0.6374890208244324, 'learning_rate': 5e-05, 'epoch': 0.49}


 49%|████▉     | 790/1600 [07:18<04:28,  3.02it/s]

{'loss': 0.1398, 'grad_norm': 0.08599533140659332, 'learning_rate': 5e-05, 'epoch': 0.49}


 50%|█████     | 800/1600 [07:21<04:33,  2.93it/s]

{'loss': 0.4239, 'grad_norm': 0.0020513683557510376, 'learning_rate': 5e-05, 'epoch': 0.5}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.24863745272159576, 'eval_Accuracy': 0.95, 'eval_F1': 0.9477806788511749, 'eval_runtime': 25.8189, 'eval_samples_per_second': 61.97, 'eval_steps_per_second': 7.746, 'epoch': 0.5}


 51%|█████     | 810/1600 [07:52<08:36,  1.53it/s]  

{'loss': 0.2451, 'grad_norm': 134.16543579101562, 'learning_rate': 5e-05, 'epoch': 0.51}


 51%|█████▏    | 820/1600 [07:55<04:19,  3.00it/s]

{'loss': 0.2321, 'grad_norm': 23.879621505737305, 'learning_rate': 5e-05, 'epoch': 0.51}


 52%|█████▏    | 830/1600 [07:58<04:16,  3.00it/s]

{'loss': 0.2988, 'grad_norm': 53.4588623046875, 'learning_rate': 5e-05, 'epoch': 0.52}


 52%|█████▎    | 840/1600 [08:02<04:11,  3.03it/s]

{'loss': 0.5872, 'grad_norm': 36.85921096801758, 'learning_rate': 5e-05, 'epoch': 0.53}


 53%|█████▎    | 850/1600 [08:05<04:13,  2.96it/s]

{'loss': 0.1986, 'grad_norm': 68.88720703125, 'learning_rate': 5e-05, 'epoch': 0.53}


 54%|█████▍    | 860/1600 [08:08<04:08,  2.98it/s]

{'loss': 0.2472, 'grad_norm': 57.42824935913086, 'learning_rate': 5e-05, 'epoch': 0.54}


 54%|█████▍    | 870/1600 [08:12<04:06,  2.96it/s]

{'loss': 0.1929, 'grad_norm': 50.27336883544922, 'learning_rate': 5e-05, 'epoch': 0.54}


 55%|█████▌    | 880/1600 [08:15<03:56,  3.05it/s]

{'loss': 0.3055, 'grad_norm': 24.466257095336914, 'learning_rate': 5e-05, 'epoch': 0.55}


 56%|█████▌    | 890/1600 [08:18<03:56,  3.00it/s]

{'loss': 0.0856, 'grad_norm': 0.6835881471633911, 'learning_rate': 5e-05, 'epoch': 0.56}


 56%|█████▋    | 900/1600 [08:22<03:51,  3.02it/s]

{'loss': 0.1147, 'grad_norm': 44.443119049072266, 'learning_rate': 5e-05, 'epoch': 0.56}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.1341375708580017, 'eval_Accuracy': 0.960625, 'eval_F1': 0.9609423434593924, 'eval_runtime': 25.3792, 'eval_samples_per_second': 63.044, 'eval_steps_per_second': 7.88, 'epoch': 0.56}


 57%|█████▋    | 910/1600 [08:51<07:32,  1.53it/s]  

{'loss': 0.098, 'grad_norm': 64.48885345458984, 'learning_rate': 5e-05, 'epoch': 0.57}


 57%|█████▊    | 920/1600 [08:55<03:56,  2.88it/s]

{'loss': 0.2894, 'grad_norm': 36.126407623291016, 'learning_rate': 5e-05, 'epoch': 0.57}


 58%|█████▊    | 930/1600 [08:58<03:44,  2.98it/s]

{'loss': 0.0396, 'grad_norm': 0.13958726823329926, 'learning_rate': 5e-05, 'epoch': 0.58}


 59%|█████▉    | 940/1600 [09:02<03:59,  2.75it/s]

{'loss': 0.2496, 'grad_norm': 0.12505128979682922, 'learning_rate': 5e-05, 'epoch': 0.59}


 59%|█████▉    | 950/1600 [09:05<03:48,  2.84it/s]

{'loss': 0.0847, 'grad_norm': 0.0621800571680069, 'learning_rate': 5e-05, 'epoch': 0.59}


 60%|██████    | 960/1600 [09:09<03:33,  2.99it/s]

{'loss': 0.113, 'grad_norm': 0.0025621557142585516, 'learning_rate': 5e-05, 'epoch': 0.6}


 61%|██████    | 970/1600 [09:12<03:31,  2.98it/s]

{'loss': 0.2119, 'grad_norm': 0.6103683114051819, 'learning_rate': 5e-05, 'epoch': 0.61}


 61%|██████▏   | 980/1600 [09:16<03:27,  2.98it/s]

{'loss': 0.2594, 'grad_norm': 0.09207798540592194, 'learning_rate': 5e-05, 'epoch': 0.61}


 62%|██████▏   | 990/1600 [09:19<03:20,  3.05it/s]

{'loss': 0.5016, 'grad_norm': 77.56138610839844, 'learning_rate': 5e-05, 'epoch': 0.62}


 62%|██████▎   | 1000/1600 [09:22<03:20,  3.00it/s]

{'loss': 0.2105, 'grad_norm': 2.9580063819885254, 'learning_rate': 5e-05, 'epoch': 0.62}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.16277550160884857, 'eval_Accuracy': 0.95375, 'eval_F1': 0.9543209876543209, 'eval_runtime': 25.1974, 'eval_samples_per_second': 63.499, 'eval_steps_per_second': 7.937, 'epoch': 0.62}


 63%|██████▎   | 1010/1600 [09:52<06:21,  1.55it/s]  

{'loss': 0.071, 'grad_norm': 0.02848014421761036, 'learning_rate': 5e-05, 'epoch': 0.63}


 64%|██████▍   | 1020/1600 [09:55<03:13,  3.00it/s]

{'loss': 0.1097, 'grad_norm': 0.7422537207603455, 'learning_rate': 5e-05, 'epoch': 0.64}


 64%|██████▍   | 1030/1600 [09:58<03:06,  3.06it/s]

{'loss': 0.0472, 'grad_norm': 1.1982530355453491, 'learning_rate': 5e-05, 'epoch': 0.64}


 65%|██████▌   | 1040/1600 [10:02<03:05,  3.02it/s]

{'loss': 0.2281, 'grad_norm': 0.21135073900222778, 'learning_rate': 5e-05, 'epoch': 0.65}


 66%|██████▌   | 1050/1600 [10:05<03:00,  3.04it/s]

{'loss': 0.1919, 'grad_norm': 14.672274589538574, 'learning_rate': 5e-05, 'epoch': 0.66}


 66%|██████▋   | 1060/1600 [10:08<02:58,  3.03it/s]

{'loss': 0.1469, 'grad_norm': 0.05044042691588402, 'learning_rate': 5e-05, 'epoch': 0.66}


 67%|██████▋   | 1070/1600 [10:12<02:57,  2.99it/s]

{'loss': 0.3439, 'grad_norm': 13.545866966247559, 'learning_rate': 5e-05, 'epoch': 0.67}


 68%|██████▊   | 1080/1600 [10:15<02:53,  2.99it/s]

{'loss': 0.0065, 'grad_norm': 0.033192895352840424, 'learning_rate': 5e-05, 'epoch': 0.68}


 68%|██████▊   | 1090/1600 [10:19<02:57,  2.88it/s]

{'loss': 0.1895, 'grad_norm': 53.264495849609375, 'learning_rate': 5e-05, 'epoch': 0.68}


 69%|██████▉   | 1100/1600 [10:22<02:46,  3.00it/s]

{'loss': 0.6992, 'grad_norm': 51.2429313659668, 'learning_rate': 5e-05, 'epoch': 0.69}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.3516307473182678, 'eval_Accuracy': 0.915625, 'eval_F1': 0.9221004039238315, 'eval_runtime': 26.801, 'eval_samples_per_second': 59.699, 'eval_steps_per_second': 7.462, 'epoch': 0.69}


 69%|██████▉   | 1110/1600 [10:53<05:28,  1.49it/s]  

{'loss': 0.1201, 'grad_norm': 42.383968353271484, 'learning_rate': 5e-05, 'epoch': 0.69}


 70%|███████   | 1120/1600 [10:57<02:49,  2.84it/s]

{'loss': 0.2512, 'grad_norm': 0.04160100221633911, 'learning_rate': 5e-05, 'epoch': 0.7}


 71%|███████   | 1130/1600 [11:00<02:46,  2.82it/s]

{'loss': 0.2551, 'grad_norm': 0.03559454530477524, 'learning_rate': 5e-05, 'epoch': 0.71}


 71%|███████▏  | 1140/1600 [11:04<02:47,  2.75it/s]

{'loss': 0.0218, 'grad_norm': 0.06556572020053864, 'learning_rate': 5e-05, 'epoch': 0.71}


 72%|███████▏  | 1150/1600 [11:07<02:36,  2.88it/s]

{'loss': 0.2376, 'grad_norm': 0.2456500232219696, 'learning_rate': 5e-05, 'epoch': 0.72}


 72%|███████▎  | 1160/1600 [11:11<02:32,  2.88it/s]

{'loss': 0.005, 'grad_norm': 0.22571240365505219, 'learning_rate': 5e-05, 'epoch': 0.72}


 73%|███████▎  | 1170/1600 [11:14<02:23,  3.00it/s]

{'loss': 0.1861, 'grad_norm': 0.04925158992409706, 'learning_rate': 5e-05, 'epoch': 0.73}


 74%|███████▍  | 1180/1600 [11:17<02:18,  3.03it/s]

{'loss': 0.0056, 'grad_norm': 0.32851698994636536, 'learning_rate': 5e-05, 'epoch': 0.74}


 74%|███████▍  | 1190/1600 [11:21<02:19,  2.94it/s]

{'loss': 0.2275, 'grad_norm': 0.08694787323474884, 'learning_rate': 5e-05, 'epoch': 0.74}


 75%|███████▌  | 1200/1600 [11:24<02:15,  2.96it/s]

{'loss': 0.003, 'grad_norm': 0.018346795812249184, 'learning_rate': 5e-05, 'epoch': 0.75}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.10265610367059708, 'eval_Accuracy': 0.97375, 'eval_F1': 0.9730769230769231, 'eval_runtime': 25.4663, 'eval_samples_per_second': 62.828, 'eval_steps_per_second': 7.854, 'epoch': 0.75}


 76%|███████▌  | 1210/1600 [11:54<04:12,  1.54it/s]

{'loss': 0.1328, 'grad_norm': 90.0884780883789, 'learning_rate': 5e-05, 'epoch': 0.76}


 76%|███████▋  | 1220/1600 [11:57<02:10,  2.90it/s]

{'loss': 0.1354, 'grad_norm': 39.292301177978516, 'learning_rate': 5e-05, 'epoch': 0.76}


 77%|███████▋  | 1230/1600 [12:01<02:00,  3.07it/s]

{'loss': 0.0002, 'grad_norm': 0.005211107432842255, 'learning_rate': 5e-05, 'epoch': 0.77}


 78%|███████▊  | 1240/1600 [12:04<02:06,  2.85it/s]

{'loss': 0.1243, 'grad_norm': 0.00048654957208782434, 'learning_rate': 5e-05, 'epoch': 0.78}


 78%|███████▊  | 1250/1600 [12:08<01:57,  2.98it/s]

{'loss': 0.093, 'grad_norm': 45.66397476196289, 'learning_rate': 5e-05, 'epoch': 0.78}


 79%|███████▉  | 1260/1600 [12:11<01:53,  2.99it/s]

{'loss': 0.0125, 'grad_norm': 0.027018796652555466, 'learning_rate': 5e-05, 'epoch': 0.79}


 79%|███████▉  | 1270/1600 [12:14<01:49,  3.02it/s]

{'loss': 0.2588, 'grad_norm': 65.58483123779297, 'learning_rate': 5e-05, 'epoch': 0.79}


 80%|████████  | 1280/1600 [12:18<01:44,  3.07it/s]

{'loss': 0.0696, 'grad_norm': 0.017036356031894684, 'learning_rate': 5e-05, 'epoch': 0.8}


 81%|████████  | 1290/1600 [12:21<01:42,  3.02it/s]

{'loss': 0.1466, 'grad_norm': 0.005321410950273275, 'learning_rate': 5e-05, 'epoch': 0.81}


 81%|████████▏ | 1300/1600 [12:24<01:38,  3.05it/s]

{'loss': 0.1686, 'grad_norm': 102.33625793457031, 'learning_rate': 5e-05, 'epoch': 0.81}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.09465068578720093, 'eval_Accuracy': 0.97375, 'eval_F1': 0.9742647058823529, 'eval_runtime': 25.6584, 'eval_samples_per_second': 62.358, 'eval_steps_per_second': 7.795, 'epoch': 0.81}


 82%|████████▏ | 1310/1600 [12:54<03:07,  1.54it/s]

{'loss': 0.1735, 'grad_norm': 1.5673770904541016, 'learning_rate': 5e-05, 'epoch': 0.82}


 82%|████████▎ | 1320/1600 [12:58<01:35,  2.93it/s]

{'loss': 0.0501, 'grad_norm': 23.285795211791992, 'learning_rate': 5e-05, 'epoch': 0.82}


 83%|████████▎ | 1330/1600 [13:01<01:28,  3.05it/s]

{'loss': 0.2605, 'grad_norm': 0.006262120790779591, 'learning_rate': 5e-05, 'epoch': 0.83}


 84%|████████▍ | 1340/1600 [13:04<01:26,  2.99it/s]

{'loss': 0.1822, 'grad_norm': 0.025823581963777542, 'learning_rate': 5e-05, 'epoch': 0.84}


 84%|████████▍ | 1350/1600 [13:08<01:23,  2.99it/s]

{'loss': 0.0316, 'grad_norm': 0.03786362335085869, 'learning_rate': 5e-05, 'epoch': 0.84}


 85%|████████▌ | 1360/1600 [13:11<01:22,  2.92it/s]

{'loss': 0.145, 'grad_norm': 0.2842312157154083, 'learning_rate': 5e-05, 'epoch': 0.85}


 86%|████████▌ | 1370/1600 [13:14<01:17,  2.96it/s]

{'loss': 0.0371, 'grad_norm': 0.008944390341639519, 'learning_rate': 5e-05, 'epoch': 0.86}


 86%|████████▋ | 1380/1600 [13:18<01:14,  2.96it/s]

{'loss': 0.1743, 'grad_norm': 17.56141471862793, 'learning_rate': 5e-05, 'epoch': 0.86}


 87%|████████▋ | 1390/1600 [13:21<01:09,  3.01it/s]

{'loss': 0.0101, 'grad_norm': 1.1823476552963257, 'learning_rate': 5e-05, 'epoch': 0.87}


 88%|████████▊ | 1400/1600 [13:24<01:06,  2.99it/s]

{'loss': 0.0831, 'grad_norm': 0.010183370672166348, 'learning_rate': 5e-05, 'epoch': 0.88}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.02311863750219345, 'eval_Accuracy': 0.99125, 'eval_F1': 0.9911949685534591, 'eval_runtime': 25.3135, 'eval_samples_per_second': 63.207, 'eval_steps_per_second': 7.901, 'epoch': 0.88}


 88%|████████▊ | 1410/1600 [13:54<02:02,  1.55it/s]

{'loss': 0.0005, 'grad_norm': 0.014876271598041058, 'learning_rate': 5e-05, 'epoch': 0.88}


 89%|████████▉ | 1420/1600 [13:57<01:01,  2.94it/s]

{'loss': 0.125, 'grad_norm': 0.07216288149356842, 'learning_rate': 5e-05, 'epoch': 0.89}


 89%|████████▉ | 1430/1600 [14:01<00:56,  3.00it/s]

{'loss': 0.1016, 'grad_norm': 0.0044329361990094185, 'learning_rate': 5e-05, 'epoch': 0.89}


 90%|█████████ | 1440/1600 [14:04<00:53,  3.01it/s]

{'loss': 0.0483, 'grad_norm': 0.007348118349909782, 'learning_rate': 5e-05, 'epoch': 0.9}


 91%|█████████ | 1450/1600 [14:08<00:50,  2.98it/s]

{'loss': 0.0035, 'grad_norm': 20.992218017578125, 'learning_rate': 5e-05, 'epoch': 0.91}


 91%|█████████▏| 1460/1600 [14:11<00:47,  2.97it/s]

{'loss': 0.0815, 'grad_norm': 39.09765625, 'learning_rate': 5e-05, 'epoch': 0.91}


 92%|█████████▏| 1470/1600 [14:14<00:43,  2.97it/s]

{'loss': 0.0009, 'grad_norm': 0.0065634301863610744, 'learning_rate': 5e-05, 'epoch': 0.92}


 92%|█████████▎| 1480/1600 [14:18<00:39,  3.02it/s]

{'loss': 0.0033, 'grad_norm': 0.000673649599775672, 'learning_rate': 5e-05, 'epoch': 0.93}


 93%|█████████▎| 1490/1600 [14:21<00:37,  2.97it/s]

{'loss': 0.0126, 'grad_norm': 0.0073895929381251335, 'learning_rate': 5e-05, 'epoch': 0.93}


 94%|█████████▍| 1500/1600 [14:24<00:33,  2.96it/s]

{'loss': 0.2313, 'grad_norm': 0.0022630980238318443, 'learning_rate': 5e-05, 'epoch': 0.94}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.03495907783508301, 'eval_Accuracy': 0.9925, 'eval_F1': 0.9924812030075187, 'eval_runtime': 26.363, 'eval_samples_per_second': 60.691, 'eval_steps_per_second': 7.586, 'epoch': 0.94}


 94%|█████████▍| 1510/1600 [14:56<01:00,  1.49it/s]

{'loss': 0.3235, 'grad_norm': 0.17494751513004303, 'learning_rate': 5e-05, 'epoch': 0.94}


 95%|█████████▌| 1520/1600 [14:59<00:28,  2.80it/s]

{'loss': 0.0754, 'grad_norm': 50.98398971557617, 'learning_rate': 5e-05, 'epoch': 0.95}


 96%|█████████▌| 1530/1600 [15:03<00:23,  2.94it/s]

{'loss': 0.0334, 'grad_norm': 0.010274292901158333, 'learning_rate': 5e-05, 'epoch': 0.96}


 96%|█████████▋| 1540/1600 [15:06<00:20,  2.88it/s]

{'loss': 0.0006, 'grad_norm': 0.00922377873212099, 'learning_rate': 5e-05, 'epoch': 0.96}


 97%|█████████▋| 1550/1600 [15:10<00:18,  2.72it/s]

{'loss': 0.0579, 'grad_norm': 0.00984087772667408, 'learning_rate': 5e-05, 'epoch': 0.97}


 98%|█████████▊| 1560/1600 [15:14<00:14,  2.68it/s]

{'loss': 0.0497, 'grad_norm': 0.0018500324804335833, 'learning_rate': 5e-05, 'epoch': 0.97}


 98%|█████████▊| 1570/1600 [15:18<00:11,  2.53it/s]

{'loss': 0.1063, 'grad_norm': 97.18549346923828, 'learning_rate': 5e-05, 'epoch': 0.98}


 99%|█████████▉| 1580/1600 [15:21<00:07,  2.80it/s]

{'loss': 0.0912, 'grad_norm': 80.66292572021484, 'learning_rate': 5e-05, 'epoch': 0.99}


 99%|█████████▉| 1590/1600 [15:25<00:03,  2.67it/s]

{'loss': 0.004, 'grad_norm': 0.0006536103901453316, 'learning_rate': 5e-05, 'epoch': 0.99}


100%|██████████| 1600/1600 [15:29<00:00,  2.77it/s]

{'loss': 0.2452, 'grad_norm': 179.72056579589844, 'learning_rate': 5e-05, 'epoch': 1.0}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.02407854050397873, 'eval_Accuracy': 0.99375, 'eval_F1': 0.9937578027465668, 'eval_runtime': 28.2119, 'eval_samples_per_second': 56.714, 'eval_steps_per_second': 7.089, 'epoch': 1.0}


100%|██████████| 1600/1600 [15:59<00:00,  1.67it/s]
Non-default generation parameters: {'max_length': 128}


{'train_runtime': 959.1021, 'train_samples_per_second': 13.346, 'train_steps_per_second': 1.668, 'train_loss': 0.2649547167739365, 'epoch': 1.0}
***** train metrics *****
  epoch                    =         1.0
  total_flos               = 933951802GF
  train_loss               =       0.265
  train_runtime            =  0:15:59.10
  train_samples_per_second =      13.346
  train_steps_per_second   =       1.668


100%|██████████| 200/200 [00:27<00:00,  7.28it/s]

***** eval metrics *****
  epoch                   =        1.0
  eval_Accuracy           =     0.9912
  eval_F1                 =     0.9912
  eval_loss               =     0.0231
  eval_runtime            = 0:00:27.70
  eval_samples_per_second =     57.741
  eval_steps_per_second   =      7.218



