### Imports

In [3]:
# https://huggingface.co/Organika/sdxl-detector#validation-metrics
# The code below fine-tunes the model on one dataset at a time (art, faces, or mixed; chosen in the Parameters section) and evaluates it on that dataset's validation set
# https://huggingface.co/blog/fine-tune-vit

import os
import torch
from transformers import AutoImageProcessor, SwinForImageClassification, TrainingArguments, Trainer
import evaluate
from datasets import load_dataset
import numpy as np

# The dataset paths used in this notebook ('archive/datasets/...') are relative
# to the directory one level above the notebook's starting folder. Only step up
# when that data folder is not already visible from the current directory, so
# re-running this cell does not keep climbing the directory tree.
print(os.getcwd())
if not os.path.isdir("archive"):
    os.chdir("..")
print(os.getcwd())

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

  from .autonotebook import tqdm as notebook_tqdm


c:\Users\metet\OneDrive\Documents\GitHub\thesis\RQ2
c:\Users\metet\OneDrive\Documents\GitHub\thesis
cuda


### Load Datasets and Processor

In [4]:
# Image processor that ships with the pre-trained checkpoint
# (the model itself is loaded later, once the label set is known).
processor = AutoImageProcessor.from_pretrained("Organika/sdxl-detector")

# Dataset locations, relative to the current working directory
art_dataset_path = 'archive/datasets/art_512x512'
faces_dataset_path = 'archive/datasets/faces_512x512'
mixed_dataset_path = 'archive/datasets/mixed_512x512'

# Read each folder with the generic "imagefolder" builder, in a fixed order
art_ds, faces_ds, mixed_ds = (
    load_dataset("imagefolder", data_dir=dataset_root)
    for dataset_root in (art_dataset_path, faces_dataset_path, mixed_dataset_path)
)

# Show the available splits and their sizes for one of the datasets
print(faces_ds)

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 12800
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
})


### Setting Up

In [5]:
# Transform images to model input
def transform(image_batch):
    """Convert a batch of raw examples into model inputs.

    Args:
        image_batch: Mapping with an 'image' entry (list of images) and a
            'label' entry (list of integer class ids).

    Returns:
        The processor output (containing 'pixel_values') with an added
        'labels' tensor holding the class ids.

    The tensors are deliberately left on the CPU: the Trainer moves every
    batch to its own device, and tensors that are already on the GPU cannot be
    pinned or passed through DataLoader worker processes.
    """
    inputs = processor(images=image_batch['image'], return_tensors="pt")
    inputs['labels'] = torch.tensor(image_batch['label'])  # Ensure labels are tensors
    return inputs

# Attach the preprocessing function to each of the three datasets
art_ds_transformed, faces_ds_transformed, mixed_ds_transformed = (
    raw_ds.with_transform(transform) for raw_ds in (art_ds, faces_ds, mixed_ds)
)

In [6]:
# Turn dicts into tensors
def collate_fn(batch):
    """Merge a list of per-example dicts into one batch dict of tensors.

    Args:
        batch: List of dicts, each with a 'pixel_values' tensor and a 'labels'
            entry holding a single class id (Python int or 0-d tensor).

    Returns:
        dict with 'pixel_values' (examples stacked along a new first
        dimension) and 'labels' (1-D integer tensor, one id per example).

    The batch is returned on the CPU; the Trainer is responsible for moving it
    to the training device, which also keeps DataLoader memory pinning usable.
    """
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        # int() accepts both plain ints and 0-d tensors, on any device
        'labels': torch.tensor([int(x['labels']) for x in batch]),
    }

In [7]:
# Define metrics
# Metric objects used by compute_metrics: accuracy first, then F1
acc_metric, f1_metric = (evaluate.load(metric_name) for metric_name in ('accuracy', 'f1'))

def compute_metrics(p):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        p: Prediction container exposing `predictions` (per-class scores, one
            row per sample; if a tuple is given its first element is used)
            and `label_ids` (reference class ids).

    Returns:
        dict with the keys "Accuracy" and "F1".
    """
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Predicted class = index of the highest score; computed once, shared by both metrics
    predicted = np.argmax(scores, axis=1)
    acc = acc_metric.compute(predictions=predicted, references=p.label_ids)
    f1 = f1_metric.compute(predictions=predicted, references=p.label_ids)
    return {"Accuracy": acc["accuracy"], "F1": f1["f1"]}

### Parameters

In [8]:
# Single switch for the run: which dataset to fine-tune on.
dataset_choice = "art"  # one of: "faces", "art", "mixed"

# Each dataset is paired with its own checkpoint directory so that switching
# datasets cannot accidentally overwrite another run's output.
# NOTE(review): pairing "faces" with "./sdxl-fine-tune" follows the order of the
# previously commented-out options — confirm.
run_configs = {
    "faces": (faces_ds_transformed, "./sdxl-fine-tune"),
    "art": (art_ds_transformed, "./sdxl-fine-tune-art"),
    "mixed": (mixed_ds_transformed, "./sdxl-fine-tune-mixed"),
}
dataset_to_use, output_dir_name = run_configs[dataset_choice]

# Candidate learning rates; chosen_lr is an INDEX into this list (1 -> 5e-5).
lr_values = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3]
chosen_lr = 1

### Load Model

In [9]:
# Extract the labels
# Class names come from the label feature of the selected dataset's train split
labels = dataset_to_use['train'].features['label'].names
print(labels[0:2])

# Load the pre-trained model.
# id2label is keyed by integer class id and label2id maps back to integer ids,
# so lookups by predicted class index work directly on the in-memory config.
model = SwinForImageClassification.from_pretrained(
    "Organika/sdxl-detector",
    num_labels=len(labels),
    id2label={i: c for i, c in enumerate(labels)},
    label2id={c: i for i, c in enumerate(labels)},
).to(device)

['0', '1']


In [10]:
# TrainingArguments: between experiments only the learning rate is varied;
# every other setting below is kept fixed.
training_args = TrainingArguments(
    # Where checkpoints are written, and how many are kept
    output_dir=output_dir_name,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
    # Optimisation: a single epoch at a constant learning rate
    num_train_epochs=1,
    per_device_train_batch_size=8,
    learning_rate=lr_values[chosen_lr],
    lr_scheduler_type="constant",
    fp16=False,  # True leads to runtime errors on CUDA
    # Evaluation and logging cadence (in optimisation steps)
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    report_to="tensorboard",
    # Data loading
    remove_unused_columns=False,  # keep the raw 'image' column, which transform() reads
    dataloader_pin_memory=False,  # author's note: otherwise it doesn't work with CUDA
)




In [11]:
# Wire the model, data, batching and metrics into a single Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: train / validation splits of the selected dataset
    train_dataset=dataset_to_use["train"],
    eval_dataset=dataset_to_use["validation"],
    data_collator=collate_fn,
    # Reporting and preprocessing
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

  trainer = Trainer(


### Training and Evaluation

In [12]:
def _record_metrics(split, values):
    """Log the metrics for `split` through the trainer, then save them."""
    trainer.log_metrics(split, values)
    trainer.save_metrics(split, values)


# Fine-tune, then save the model, the training metrics and the trainer state
train_results = trainer.train()
trainer.save_model()
_record_metrics("train", train_results.metrics)
trainer.save_state()

# Final evaluation on the validation split of the selected dataset
metrics = trainer.evaluate(dataset_to_use['validation'])
_record_metrics("eval", metrics)


  1%|          | 10/1600 [00:03<07:45,  3.41it/s]

{'loss': 2.4939, 'grad_norm': 20.47650146484375, 'learning_rate': 5e-05, 'epoch': 0.01}


  1%|▏         | 20/1600 [00:06<07:34,  3.48it/s]

{'loss': 0.7079, 'grad_norm': 21.008949279785156, 'learning_rate': 5e-05, 'epoch': 0.01}


  2%|▏         | 30/1600 [00:09<07:32,  3.47it/s]

{'loss': 0.3526, 'grad_norm': 28.467178344726562, 'learning_rate': 5e-05, 'epoch': 0.02}


  2%|▎         | 40/1600 [00:12<07:27,  3.48it/s]

{'loss': 0.27, 'grad_norm': 16.28025245666504, 'learning_rate': 5e-05, 'epoch': 0.03}


  3%|▎         | 50/1600 [00:15<07:25,  3.48it/s]

{'loss': 0.3398, 'grad_norm': 13.674135208129883, 'learning_rate': 5e-05, 'epoch': 0.03}


  4%|▍         | 60/1600 [00:17<07:26,  3.45it/s]

{'loss': 0.3794, 'grad_norm': 21.211851119995117, 'learning_rate': 5e-05, 'epoch': 0.04}


  4%|▍         | 70/1600 [00:20<07:29,  3.41it/s]

{'loss': 0.4302, 'grad_norm': 19.117023468017578, 'learning_rate': 5e-05, 'epoch': 0.04}


  5%|▌         | 80/1600 [00:23<07:23,  3.43it/s]

{'loss': 0.1778, 'grad_norm': 12.4395751953125, 'learning_rate': 5e-05, 'epoch': 0.05}


  6%|▌         | 90/1600 [00:26<07:19,  3.44it/s]

{'loss': 0.2994, 'grad_norm': 10.705686569213867, 'learning_rate': 5e-05, 'epoch': 0.06}


  6%|▋         | 100/1600 [00:29<07:24,  3.38it/s]

{'loss': 0.2181, 'grad_norm': 26.30120277404785, 'learning_rate': 5e-05, 'epoch': 0.06}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.570534348487854, 'eval_Accuracy': 0.870625, 'eval_F1': 0.8846796657381616, 'eval_runtime': 25.9305, 'eval_samples_per_second': 61.703, 'eval_steps_per_second': 7.713, 'epoch': 0.06}


  7%|▋         | 110/1600 [01:00<16:32,  1.50it/s]  

{'loss': 0.2219, 'grad_norm': 3.5058324337005615, 'learning_rate': 5e-05, 'epoch': 0.07}


  8%|▊         | 120/1600 [01:03<08:48,  2.80it/s]

{'loss': 0.3603, 'grad_norm': 20.233150482177734, 'learning_rate': 5e-05, 'epoch': 0.07}


  8%|▊         | 130/1600 [01:06<08:14,  2.98it/s]

{'loss': 0.2613, 'grad_norm': 15.28314208984375, 'learning_rate': 5e-05, 'epoch': 0.08}


  9%|▉         | 140/1600 [01:10<08:15,  2.94it/s]

{'loss': 0.2896, 'grad_norm': 1.7045447826385498, 'learning_rate': 5e-05, 'epoch': 0.09}


  9%|▉         | 150/1600 [01:13<08:30,  2.84it/s]

{'loss': 0.4431, 'grad_norm': 27.18703842163086, 'learning_rate': 5e-05, 'epoch': 0.09}


 10%|█         | 160/1600 [01:17<08:56,  2.69it/s]

{'loss': 0.3514, 'grad_norm': 11.78331470489502, 'learning_rate': 5e-05, 'epoch': 0.1}


 11%|█         | 170/1600 [01:21<08:52,  2.69it/s]

{'loss': 0.3667, 'grad_norm': 0.2357816994190216, 'learning_rate': 5e-05, 'epoch': 0.11}


 11%|█▏        | 180/1600 [01:25<08:58,  2.64it/s]

{'loss': 0.3685, 'grad_norm': 3.9313182830810547, 'learning_rate': 5e-05, 'epoch': 0.11}


 12%|█▏        | 190/1600 [01:29<09:30,  2.47it/s]

{'loss': 0.1904, 'grad_norm': 6.956241130828857, 'learning_rate': 5e-05, 'epoch': 0.12}


 12%|█▎        | 200/1600 [01:33<09:43,  2.40it/s]

{'loss': 0.2031, 'grad_norm': 18.82649040222168, 'learning_rate': 5e-05, 'epoch': 0.12}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.24563518166542053, 'eval_Accuracy': 0.921875, 'eval_F1': 0.9188838416612589, 'eval_runtime': 29.3309, 'eval_samples_per_second': 54.55, 'eval_steps_per_second': 6.819, 'epoch': 0.12}


 13%|█▎        | 210/1600 [02:07<18:01,  1.28it/s]  

{'loss': 0.3226, 'grad_norm': 41.1370849609375, 'learning_rate': 5e-05, 'epoch': 0.13}


 14%|█▍        | 220/1600 [02:11<09:49,  2.34it/s]

{'loss': 0.2232, 'grad_norm': 51.83809280395508, 'learning_rate': 5e-05, 'epoch': 0.14}


 14%|█▍        | 230/1600 [02:15<09:40,  2.36it/s]

{'loss': 0.2521, 'grad_norm': 2.7887773513793945, 'learning_rate': 5e-05, 'epoch': 0.14}


 15%|█▌        | 240/1600 [02:20<09:54,  2.29it/s]

{'loss': 0.0956, 'grad_norm': 30.034883499145508, 'learning_rate': 5e-05, 'epoch': 0.15}


 16%|█▌        | 250/1600 [02:24<09:23,  2.39it/s]

{'loss': 0.2519, 'grad_norm': 22.267929077148438, 'learning_rate': 5e-05, 'epoch': 0.16}


 16%|█▋        | 260/1600 [02:29<09:33,  2.34it/s]

{'loss': 0.2436, 'grad_norm': 44.835330963134766, 'learning_rate': 5e-05, 'epoch': 0.16}


 17%|█▋        | 270/1600 [02:33<09:23,  2.36it/s]

{'loss': 0.1931, 'grad_norm': 21.36612892150879, 'learning_rate': 5e-05, 'epoch': 0.17}


 18%|█▊        | 280/1600 [02:37<08:39,  2.54it/s]

{'loss': 0.1963, 'grad_norm': 29.197067260742188, 'learning_rate': 5e-05, 'epoch': 0.17}


 18%|█▊        | 290/1600 [02:42<09:56,  2.19it/s]

{'loss': 0.2529, 'grad_norm': 18.1140193939209, 'learning_rate': 5e-05, 'epoch': 0.18}


 19%|█▉        | 300/1600 [02:46<09:25,  2.30it/s]

{'loss': 0.3978, 'grad_norm': 8.945883750915527, 'learning_rate': 5e-05, 'epoch': 0.19}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.21707499027252197, 'eval_Accuracy': 0.933125, 'eval_F1': 0.9348752282410225, 'eval_runtime': 31.012, 'eval_samples_per_second': 51.593, 'eval_steps_per_second': 6.449, 'epoch': 0.19}


 19%|█▉        | 310/1600 [03:22<17:23,  1.24it/s]  

{'loss': 0.1853, 'grad_norm': 2.3474984169006348, 'learning_rate': 5e-05, 'epoch': 0.19}


 20%|██        | 320/1600 [03:26<09:23,  2.27it/s]

{'loss': 0.1746, 'grad_norm': 10.37153434753418, 'learning_rate': 5e-05, 'epoch': 0.2}


 21%|██        | 330/1600 [03:31<09:39,  2.19it/s]

{'loss': 0.376, 'grad_norm': 16.285429000854492, 'learning_rate': 5e-05, 'epoch': 0.21}


 21%|██▏       | 340/1600 [03:36<11:46,  1.78it/s]

{'loss': 0.3729, 'grad_norm': 1.572410225868225, 'learning_rate': 5e-05, 'epoch': 0.21}


 22%|██▏       | 350/1600 [03:40<09:55,  2.10it/s]

{'loss': 0.1665, 'grad_norm': 21.591901779174805, 'learning_rate': 5e-05, 'epoch': 0.22}


 22%|██▎       | 360/1600 [03:45<10:21,  1.99it/s]

{'loss': 0.3351, 'grad_norm': 39.1898078918457, 'learning_rate': 5e-05, 'epoch': 0.23}


 23%|██▎       | 370/1600 [03:50<09:29,  2.16it/s]

{'loss': 0.1335, 'grad_norm': 23.09355354309082, 'learning_rate': 5e-05, 'epoch': 0.23}


 24%|██▍       | 380/1600 [03:55<09:55,  2.05it/s]

{'loss': 0.0697, 'grad_norm': 5.833469867706299, 'learning_rate': 5e-05, 'epoch': 0.24}


 24%|██▍       | 390/1600 [03:59<09:50,  2.05it/s]

{'loss': 0.3142, 'grad_norm': 98.21681213378906, 'learning_rate': 5e-05, 'epoch': 0.24}


 25%|██▌       | 400/1600 [04:05<10:05,  1.98it/s]

{'loss': 0.1867, 'grad_norm': 31.628881454467773, 'learning_rate': 5e-05, 'epoch': 0.25}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.19846372306346893, 'eval_Accuracy': 0.935, 'eval_F1': 0.9340937896070975, 'eval_runtime': 34.1785, 'eval_samples_per_second': 46.813, 'eval_steps_per_second': 5.852, 'epoch': 0.25}


 26%|██▌       | 410/1600 [04:44<17:46,  1.12it/s]  

{'loss': 0.2359, 'grad_norm': 12.115625381469727, 'learning_rate': 5e-05, 'epoch': 0.26}


 26%|██▋       | 420/1600 [04:49<09:56,  1.98it/s]

{'loss': 0.2816, 'grad_norm': 33.48851013183594, 'learning_rate': 5e-05, 'epoch': 0.26}


 27%|██▋       | 430/1600 [04:54<10:05,  1.93it/s]

{'loss': 0.2054, 'grad_norm': 17.612457275390625, 'learning_rate': 5e-05, 'epoch': 0.27}


 28%|██▊       | 440/1600 [04:59<09:42,  1.99it/s]

{'loss': 0.1571, 'grad_norm': 30.737253189086914, 'learning_rate': 5e-05, 'epoch': 0.28}


 28%|██▊       | 450/1600 [05:04<09:21,  2.05it/s]

{'loss': 0.1933, 'grad_norm': 0.02521095797419548, 'learning_rate': 5e-05, 'epoch': 0.28}


 29%|██▉       | 460/1600 [05:09<10:04,  1.88it/s]

{'loss': 0.1397, 'grad_norm': 7.842732906341553, 'learning_rate': 5e-05, 'epoch': 0.29}


 29%|██▉       | 470/1600 [05:14<09:17,  2.03it/s]

{'loss': 0.2162, 'grad_norm': 0.8537887930870056, 'learning_rate': 5e-05, 'epoch': 0.29}


 30%|███       | 480/1600 [05:19<09:14,  2.02it/s]

{'loss': 0.4505, 'grad_norm': 97.40890502929688, 'learning_rate': 5e-05, 'epoch': 0.3}


 31%|███       | 490/1600 [05:24<09:42,  1.91it/s]

{'loss': 0.0496, 'grad_norm': 2.248506546020508, 'learning_rate': 5e-05, 'epoch': 0.31}


 31%|███▏      | 500/1600 [05:30<09:22,  1.95it/s]

{'loss': 0.1457, 'grad_norm': 14.08669376373291, 'learning_rate': 5e-05, 'epoch': 0.31}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.22597812116146088, 'eval_Accuracy': 0.938125, 'eval_F1': 0.9398907103825137, 'eval_runtime': 34.4259, 'eval_samples_per_second': 46.477, 'eval_steps_per_second': 5.81, 'epoch': 0.31}


 32%|███▏      | 510/1600 [06:10<16:37,  1.09it/s]  

{'loss': 0.2856, 'grad_norm': 30.043643951416016, 'learning_rate': 5e-05, 'epoch': 0.32}


 32%|███▎      | 520/1600 [06:15<09:18,  1.93it/s]

{'loss': 0.2688, 'grad_norm': 0.29972368478775024, 'learning_rate': 5e-05, 'epoch': 0.33}


 33%|███▎      | 530/1600 [06:20<08:45,  2.03it/s]

{'loss': 0.1904, 'grad_norm': 0.742117702960968, 'learning_rate': 5e-05, 'epoch': 0.33}


 34%|███▍      | 540/1600 [06:25<08:42,  2.03it/s]

{'loss': 0.0452, 'grad_norm': 0.01265481486916542, 'learning_rate': 5e-05, 'epoch': 0.34}


 34%|███▍      | 550/1600 [06:30<08:19,  2.10it/s]

{'loss': 0.4955, 'grad_norm': 39.546810150146484, 'learning_rate': 5e-05, 'epoch': 0.34}


 35%|███▌      | 560/1600 [06:35<07:48,  2.22it/s]

{'loss': 0.0278, 'grad_norm': 0.005064561031758785, 'learning_rate': 5e-05, 'epoch': 0.35}


 36%|███▌      | 570/1600 [06:40<07:42,  2.23it/s]

{'loss': 0.7406, 'grad_norm': 6.256653308868408, 'learning_rate': 5e-05, 'epoch': 0.36}


 36%|███▋      | 580/1600 [06:45<08:19,  2.04it/s]

{'loss': 0.106, 'grad_norm': 0.32242488861083984, 'learning_rate': 5e-05, 'epoch': 0.36}


 37%|███▋      | 590/1600 [06:49<08:09,  2.06it/s]

{'loss': 0.1725, 'grad_norm': 1.364424467086792, 'learning_rate': 5e-05, 'epoch': 0.37}


 38%|███▊      | 600/1600 [06:54<07:33,  2.21it/s]

{'loss': 0.1236, 'grad_norm': 19.102014541625977, 'learning_rate': 5e-05, 'epoch': 0.38}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.280678391456604, 'eval_Accuracy': 0.93625, 'eval_F1': 0.93625, 'eval_runtime': 32.2349, 'eval_samples_per_second': 49.636, 'eval_steps_per_second': 6.204, 'epoch': 0.38}


 38%|███▊      | 610/1600 [07:32<15:08,  1.09it/s]  

{'loss': 0.531, 'grad_norm': 25.50810432434082, 'learning_rate': 5e-05, 'epoch': 0.38}


 39%|███▉      | 620/1600 [07:37<08:00,  2.04it/s]

{'loss': 0.1711, 'grad_norm': 127.9442138671875, 'learning_rate': 5e-05, 'epoch': 0.39}


 39%|███▉      | 630/1600 [07:42<08:00,  2.02it/s]

{'loss': 0.1447, 'grad_norm': 6.657816410064697, 'learning_rate': 5e-05, 'epoch': 0.39}


 40%|████      | 640/1600 [07:47<07:50,  2.04it/s]

{'loss': 0.2757, 'grad_norm': 18.185190200805664, 'learning_rate': 5e-05, 'epoch': 0.4}


 41%|████      | 650/1600 [07:52<07:42,  2.05it/s]

{'loss': 0.2492, 'grad_norm': 0.027209272608160973, 'learning_rate': 5e-05, 'epoch': 0.41}


 41%|████▏     | 660/1600 [07:57<07:58,  1.96it/s]

{'loss': 0.2623, 'grad_norm': 1.8138127326965332, 'learning_rate': 5e-05, 'epoch': 0.41}


 42%|████▏     | 670/1600 [08:02<08:19,  1.86it/s]

{'loss': 0.1303, 'grad_norm': 0.8884987235069275, 'learning_rate': 5e-05, 'epoch': 0.42}


 42%|████▎     | 680/1600 [08:08<08:36,  1.78it/s]

{'loss': 0.3974, 'grad_norm': 0.3171619176864624, 'learning_rate': 5e-05, 'epoch': 0.42}


 43%|████▎     | 690/1600 [08:13<09:51,  1.54it/s]

{'loss': 0.2658, 'grad_norm': 0.1838996708393097, 'learning_rate': 5e-05, 'epoch': 0.43}


 44%|████▍     | 700/1600 [08:19<08:31,  1.76it/s]

{'loss': 0.1063, 'grad_norm': 0.10839460790157318, 'learning_rate': 5e-05, 'epoch': 0.44}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.17901666462421417, 'eval_Accuracy': 0.944375, 'eval_F1': 0.945830797321972, 'eval_runtime': 35.4357, 'eval_samples_per_second': 45.152, 'eval_steps_per_second': 5.644, 'epoch': 0.44}


 44%|████▍     | 710/1600 [09:00<14:39,  1.01it/s]  

{'loss': 0.3252, 'grad_norm': 14.516400337219238, 'learning_rate': 5e-05, 'epoch': 0.44}


 45%|████▌     | 720/1600 [09:06<08:11,  1.79it/s]

{'loss': 0.2429, 'grad_norm': 0.21417754888534546, 'learning_rate': 5e-05, 'epoch': 0.45}


 46%|████▌     | 730/1600 [09:11<07:50,  1.85it/s]

{'loss': 0.3102, 'grad_norm': 43.12645721435547, 'learning_rate': 5e-05, 'epoch': 0.46}


 46%|████▋     | 740/1600 [09:17<08:30,  1.69it/s]

{'loss': 0.3206, 'grad_norm': 6.2892937660217285, 'learning_rate': 5e-05, 'epoch': 0.46}


 47%|████▋     | 750/1600 [09:23<07:54,  1.79it/s]

{'loss': 0.1496, 'grad_norm': 3.042630434036255, 'learning_rate': 5e-05, 'epoch': 0.47}


 48%|████▊     | 760/1600 [09:27<06:58,  2.01it/s]

{'loss': 0.3373, 'grad_norm': 0.5417025089263916, 'learning_rate': 5e-05, 'epoch': 0.47}


 48%|████▊     | 770/1600 [09:33<06:48,  2.03it/s]

{'loss': 0.152, 'grad_norm': 8.319583892822266, 'learning_rate': 5e-05, 'epoch': 0.48}


 49%|████▉     | 780/1600 [09:37<06:43,  2.03it/s]

{'loss': 0.201, 'grad_norm': 69.26130676269531, 'learning_rate': 5e-05, 'epoch': 0.49}


 49%|████▉     | 790/1600 [09:43<07:27,  1.81it/s]

{'loss': 0.1381, 'grad_norm': 0.030685966834425926, 'learning_rate': 5e-05, 'epoch': 0.49}


 50%|█████     | 800/1600 [09:48<06:36,  2.02it/s]

{'loss': 0.1662, 'grad_norm': 0.1318170726299286, 'learning_rate': 5e-05, 'epoch': 0.5}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2635065019130707, 'eval_Accuracy': 0.929375, 'eval_F1': 0.9331756357185098, 'eval_runtime': 34.5254, 'eval_samples_per_second': 46.343, 'eval_steps_per_second': 5.793, 'epoch': 0.5}


 51%|█████     | 810/1600 [10:29<12:20,  1.07it/s]  

{'loss': 0.1217, 'grad_norm': 1.1271823644638062, 'learning_rate': 5e-05, 'epoch': 0.51}


 51%|█████▏    | 820/1600 [10:34<07:39,  1.70it/s]

{'loss': 0.1957, 'grad_norm': 62.46738815307617, 'learning_rate': 5e-05, 'epoch': 0.51}


 52%|█████▏    | 830/1600 [10:39<07:01,  1.83it/s]

{'loss': 0.1748, 'grad_norm': 1.3250154256820679, 'learning_rate': 5e-05, 'epoch': 0.52}


 52%|█████▎    | 840/1600 [10:43<06:10,  2.05it/s]

{'loss': 0.134, 'grad_norm': 1.0522265434265137, 'learning_rate': 5e-05, 'epoch': 0.53}


 53%|█████▎    | 850/1600 [10:48<05:49,  2.15it/s]

{'loss': 0.4122, 'grad_norm': 0.09814392775297165, 'learning_rate': 5e-05, 'epoch': 0.53}


 54%|█████▍    | 860/1600 [10:53<06:05,  2.02it/s]

{'loss': 0.4239, 'grad_norm': 0.360393226146698, 'learning_rate': 5e-05, 'epoch': 0.54}


 54%|█████▍    | 870/1600 [10:58<05:41,  2.14it/s]

{'loss': 0.2719, 'grad_norm': 44.34648895263672, 'learning_rate': 5e-05, 'epoch': 0.54}


 55%|█████▌    | 880/1600 [11:03<05:40,  2.11it/s]

{'loss': 0.2851, 'grad_norm': 12.26350212097168, 'learning_rate': 5e-05, 'epoch': 0.55}


 56%|█████▌    | 890/1600 [11:08<05:45,  2.05it/s]

{'loss': 0.217, 'grad_norm': 2.9973645210266113, 'learning_rate': 5e-05, 'epoch': 0.56}


 56%|█████▋    | 900/1600 [11:12<05:14,  2.23it/s]

{'loss': 0.1068, 'grad_norm': 0.13181167840957642, 'learning_rate': 5e-05, 'epoch': 0.56}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.1847003549337387, 'eval_Accuracy': 0.950625, 'eval_F1': 0.9494561740243123, 'eval_runtime': 33.6112, 'eval_samples_per_second': 47.603, 'eval_steps_per_second': 5.95, 'epoch': 0.56}


 57%|█████▋    | 910/1600 [11:52<11:07,  1.03it/s]  

{'loss': 0.0868, 'grad_norm': 0.8552695512771606, 'learning_rate': 5e-05, 'epoch': 0.57}


 57%|█████▊    | 920/1600 [11:58<06:17,  1.80it/s]

{'loss': 0.2608, 'grad_norm': 7.275665283203125, 'learning_rate': 5e-05, 'epoch': 0.57}


 58%|█████▊    | 930/1600 [12:03<05:55,  1.89it/s]

{'loss': 0.5129, 'grad_norm': 8.238337516784668, 'learning_rate': 5e-05, 'epoch': 0.58}


 59%|█████▉    | 940/1600 [12:09<06:17,  1.75it/s]

{'loss': 0.3766, 'grad_norm': 18.281652450561523, 'learning_rate': 5e-05, 'epoch': 0.59}


 59%|█████▉    | 950/1600 [12:14<05:56,  1.82it/s]

{'loss': 0.1879, 'grad_norm': 0.05690150707960129, 'learning_rate': 5e-05, 'epoch': 0.59}


 60%|██████    | 960/1600 [12:19<05:37,  1.90it/s]

{'loss': 0.4086, 'grad_norm': 14.114982604980469, 'learning_rate': 5e-05, 'epoch': 0.6}


 61%|██████    | 970/1600 [12:25<05:30,  1.91it/s]

{'loss': 0.126, 'grad_norm': 10.11025333404541, 'learning_rate': 5e-05, 'epoch': 0.61}


 61%|██████▏   | 980/1600 [12:30<06:12,  1.67it/s]

{'loss': 0.0951, 'grad_norm': 13.997246742248535, 'learning_rate': 5e-05, 'epoch': 0.61}


 62%|██████▏   | 990/1600 [12:36<05:25,  1.87it/s]

{'loss': 0.3566, 'grad_norm': 0.6872771382331848, 'learning_rate': 5e-05, 'epoch': 0.62}


 62%|██████▎   | 1000/1600 [12:41<05:15,  1.90it/s]

{'loss': 0.1919, 'grad_norm': 2.587712049484253, 'learning_rate': 5e-05, 'epoch': 0.62}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2964196503162384, 'eval_Accuracy': 0.93125, 'eval_F1': 0.9276315789473685, 'eval_runtime': 36.8464, 'eval_samples_per_second': 43.424, 'eval_steps_per_second': 5.428, 'epoch': 0.62}


 63%|██████▎   | 1010/1600 [13:24<09:51,  1.00s/it]  

{'loss': 0.162, 'grad_norm': 5.995521545410156, 'learning_rate': 5e-05, 'epoch': 0.63}


 64%|██████▍   | 1020/1600 [13:30<05:04,  1.90it/s]

{'loss': 0.0512, 'grad_norm': 1.0877139568328857, 'learning_rate': 5e-05, 'epoch': 0.64}


 64%|██████▍   | 1030/1600 [13:35<04:50,  1.96it/s]

{'loss': 0.3491, 'grad_norm': 0.5845423936843872, 'learning_rate': 5e-05, 'epoch': 0.64}


 65%|██████▌   | 1040/1600 [13:40<04:55,  1.89it/s]

{'loss': 0.1901, 'grad_norm': 0.4885694086551666, 'learning_rate': 5e-05, 'epoch': 0.65}


 66%|██████▌   | 1050/1600 [13:45<04:55,  1.86it/s]

{'loss': 0.1881, 'grad_norm': 3.529782772064209, 'learning_rate': 5e-05, 'epoch': 0.66}


 66%|██████▋   | 1060/1600 [13:51<04:46,  1.89it/s]

{'loss': 0.1409, 'grad_norm': 76.33988952636719, 'learning_rate': 5e-05, 'epoch': 0.66}


 67%|██████▋   | 1070/1600 [13:56<04:38,  1.90it/s]

{'loss': 0.0721, 'grad_norm': 5.905324935913086, 'learning_rate': 5e-05, 'epoch': 0.67}


 68%|██████▊   | 1080/1600 [14:01<04:26,  1.95it/s]

{'loss': 0.2389, 'grad_norm': 1.0170139074325562, 'learning_rate': 5e-05, 'epoch': 0.68}


 68%|██████▊   | 1090/1600 [14:07<04:45,  1.78it/s]

{'loss': 0.2368, 'grad_norm': 18.140989303588867, 'learning_rate': 5e-05, 'epoch': 0.68}


 69%|██████▉   | 1100/1600 [14:12<04:21,  1.91it/s]

{'loss': 0.4241, 'grad_norm': 84.53778076171875, 'learning_rate': 5e-05, 'epoch': 0.69}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2588144838809967, 'eval_Accuracy': 0.9275, 'eval_F1': 0.9318448883666275, 'eval_runtime': 32.6067, 'eval_samples_per_second': 49.07, 'eval_steps_per_second': 6.134, 'epoch': 0.69}


 69%|██████▉   | 1110/1600 [14:50<07:06,  1.15it/s]  

{'loss': 0.2509, 'grad_norm': 37.51469039916992, 'learning_rate': 5e-05, 'epoch': 0.69}


 70%|███████   | 1120/1600 [14:55<04:08,  1.93it/s]

{'loss': 0.2553, 'grad_norm': 1.3292624950408936, 'learning_rate': 5e-05, 'epoch': 0.7}


 71%|███████   | 1130/1600 [15:00<03:44,  2.10it/s]

{'loss': 0.0615, 'grad_norm': 0.03908376023173332, 'learning_rate': 5e-05, 'epoch': 0.71}


 71%|███████▏  | 1140/1600 [15:05<03:29,  2.20it/s]

{'loss': 0.304, 'grad_norm': 45.926490783691406, 'learning_rate': 5e-05, 'epoch': 0.71}


 72%|███████▏  | 1150/1600 [15:10<04:08,  1.81it/s]

{'loss': 0.1315, 'grad_norm': 3.952631950378418, 'learning_rate': 5e-05, 'epoch': 0.72}


 72%|███████▎  | 1160/1600 [15:15<03:44,  1.96it/s]

{'loss': 0.3698, 'grad_norm': 0.047121014446020126, 'learning_rate': 5e-05, 'epoch': 0.72}


 73%|███████▎  | 1170/1600 [15:19<03:22,  2.12it/s]

{'loss': 0.3782, 'grad_norm': 0.20432572066783905, 'learning_rate': 5e-05, 'epoch': 0.73}


 74%|███████▍  | 1180/1600 [15:24<03:29,  2.01it/s]

{'loss': 0.254, 'grad_norm': 0.3924778997898102, 'learning_rate': 5e-05, 'epoch': 0.74}


 74%|███████▍  | 1190/1600 [15:29<03:17,  2.08it/s]

{'loss': 0.315, 'grad_norm': 35.76231384277344, 'learning_rate': 5e-05, 'epoch': 0.74}


 75%|███████▌  | 1200/1600 [15:34<03:14,  2.06it/s]

{'loss': 0.3073, 'grad_norm': 0.11875338852405548, 'learning_rate': 5e-05, 'epoch': 0.75}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2529813051223755, 'eval_Accuracy': 0.91375, 'eval_F1': 0.9068825910931174, 'eval_runtime': 37.0922, 'eval_samples_per_second': 43.136, 'eval_steps_per_second': 5.392, 'epoch': 0.75}


 76%|███████▌  | 1210/1600 [16:18<06:36,  1.02s/it]  

{'loss': 0.1874, 'grad_norm': 2.4701762199401855, 'learning_rate': 5e-05, 'epoch': 0.76}


 76%|███████▋  | 1220/1600 [16:23<03:38,  1.74it/s]

{'loss': 0.2, 'grad_norm': 10.305328369140625, 'learning_rate': 5e-05, 'epoch': 0.76}


 77%|███████▋  | 1230/1600 [16:29<03:23,  1.81it/s]

{'loss': 0.2079, 'grad_norm': 15.526440620422363, 'learning_rate': 5e-05, 'epoch': 0.77}


 78%|███████▊  | 1240/1600 [16:34<03:34,  1.68it/s]

{'loss': 0.0999, 'grad_norm': 0.1910015195608139, 'learning_rate': 5e-05, 'epoch': 0.78}


 78%|███████▊  | 1250/1600 [16:39<03:02,  1.92it/s]

{'loss': 0.0703, 'grad_norm': 0.04336484894156456, 'learning_rate': 5e-05, 'epoch': 0.78}


 79%|███████▉  | 1260/1600 [16:45<03:01,  1.87it/s]

{'loss': 0.0903, 'grad_norm': 0.04593074321746826, 'learning_rate': 5e-05, 'epoch': 0.79}


 79%|███████▉  | 1270/1600 [16:50<03:02,  1.81it/s]

{'loss': 0.0593, 'grad_norm': 0.5425251722335815, 'learning_rate': 5e-05, 'epoch': 0.79}


 80%|████████  | 1280/1600 [16:56<02:51,  1.87it/s]

{'loss': 0.2794, 'grad_norm': 0.2510324716567993, 'learning_rate': 5e-05, 'epoch': 0.8}


 81%|████████  | 1290/1600 [17:01<02:40,  1.93it/s]

{'loss': 0.3029, 'grad_norm': 2.245424509048462, 'learning_rate': 5e-05, 'epoch': 0.81}


 81%|████████▏ | 1300/1600 [17:06<02:40,  1.86it/s]

{'loss': 0.1307, 'grad_norm': 60.36738586425781, 'learning_rate': 5e-05, 'epoch': 0.81}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.1936429738998413, 'eval_Accuracy': 0.9425, 'eval_F1': 0.9408740359897172, 'eval_runtime': 35.2109, 'eval_samples_per_second': 45.44, 'eval_steps_per_second': 5.68, 'epoch': 0.81}


 82%|████████▏ | 1310/1600 [17:47<04:32,  1.06it/s]

{'loss': 0.1558, 'grad_norm': 34.91121292114258, 'learning_rate': 5e-05, 'epoch': 0.82}


 82%|████████▎ | 1320/1600 [17:53<02:37,  1.78it/s]

{'loss': 0.0459, 'grad_norm': 3.134268283843994, 'learning_rate': 5e-05, 'epoch': 0.82}


 83%|████████▎ | 1330/1600 [17:58<02:28,  1.82it/s]

{'loss': 0.1065, 'grad_norm': 1.4359033107757568, 'learning_rate': 5e-05, 'epoch': 0.83}


 84%|████████▍ | 1340/1600 [18:03<02:14,  1.93it/s]

{'loss': 0.0828, 'grad_norm': 0.006574345286935568, 'learning_rate': 5e-05, 'epoch': 0.84}


 84%|████████▍ | 1350/1600 [18:09<02:13,  1.87it/s]

{'loss': 0.0783, 'grad_norm': 0.045287344604730606, 'learning_rate': 5e-05, 'epoch': 0.84}


 85%|████████▌ | 1360/1600 [18:14<02:09,  1.85it/s]

{'loss': 0.1991, 'grad_norm': 1.4066131114959717, 'learning_rate': 5e-05, 'epoch': 0.85}


 86%|████████▌ | 1370/1600 [18:19<02:02,  1.87it/s]

{'loss': 0.2739, 'grad_norm': 16.729639053344727, 'learning_rate': 5e-05, 'epoch': 0.86}


 86%|████████▋ | 1380/1600 [18:24<01:49,  2.00it/s]

{'loss': 0.1181, 'grad_norm': 0.17160114645957947, 'learning_rate': 5e-05, 'epoch': 0.86}


 87%|████████▋ | 1390/1600 [18:30<02:06,  1.66it/s]

{'loss': 0.2287, 'grad_norm': 7.082812309265137, 'learning_rate': 5e-05, 'epoch': 0.87}


 88%|████████▊ | 1400/1600 [18:35<01:38,  2.04it/s]

{'loss': 0.1052, 'grad_norm': 0.15221285820007324, 'learning_rate': 5e-05, 'epoch': 0.88}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.16104045510292053, 'eval_Accuracy': 0.9475, 'eval_F1': 0.9494584837545126, 'eval_runtime': 32.1628, 'eval_samples_per_second': 49.747, 'eval_steps_per_second': 6.218, 'epoch': 0.88}


 88%|████████▊ | 1410/1600 [19:12<02:39,  1.19it/s]

{'loss': 0.1762, 'grad_norm': 18.07054328918457, 'learning_rate': 5e-05, 'epoch': 0.88}


 89%|████████▉ | 1420/1600 [19:17<01:27,  2.06it/s]

{'loss': 0.5351, 'grad_norm': 54.100433349609375, 'learning_rate': 5e-05, 'epoch': 0.89}


 89%|████████▉ | 1430/1600 [19:22<01:24,  2.00it/s]

{'loss': 0.3012, 'grad_norm': 21.405620574951172, 'learning_rate': 5e-05, 'epoch': 0.89}


 90%|█████████ | 1440/1600 [19:27<01:25,  1.88it/s]

{'loss': 0.3098, 'grad_norm': 12.579462051391602, 'learning_rate': 5e-05, 'epoch': 0.9}


 91%|█████████ | 1450/1600 [19:32<01:22,  1.82it/s]

{'loss': 0.1153, 'grad_norm': 0.9451889991760254, 'learning_rate': 5e-05, 'epoch': 0.91}


 91%|█████████▏| 1460/1600 [19:37<01:11,  1.97it/s]

{'loss': 0.3302, 'grad_norm': 47.68869400024414, 'learning_rate': 5e-05, 'epoch': 0.91}


 92%|█████████▏| 1470/1600 [19:42<01:09,  1.87it/s]

{'loss': 0.2349, 'grad_norm': 2.3205649852752686, 'learning_rate': 5e-05, 'epoch': 0.92}


 92%|█████████▎| 1480/1600 [19:47<01:01,  1.94it/s]

{'loss': 0.1223, 'grad_norm': 0.14261868596076965, 'learning_rate': 5e-05, 'epoch': 0.93}


 93%|█████████▎| 1490/1600 [19:53<01:02,  1.77it/s]

{'loss': 0.0845, 'grad_norm': 24.995637893676758, 'learning_rate': 5e-05, 'epoch': 0.93}


 94%|█████████▍| 1500/1600 [19:58<00:54,  1.84it/s]

{'loss': 0.1385, 'grad_norm': 0.41831889748573303, 'learning_rate': 5e-05, 'epoch': 0.94}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.22803020477294922, 'eval_Accuracy': 0.95125, 'eval_F1': 0.9493506493506494, 'eval_runtime': 35.7594, 'eval_samples_per_second': 44.743, 'eval_steps_per_second': 5.593, 'epoch': 0.94}


 94%|█████████▍| 1510/1600 [20:40<01:25,  1.05it/s]

{'loss': 0.1252, 'grad_norm': 68.01288604736328, 'learning_rate': 5e-05, 'epoch': 0.94}


 95%|█████████▌| 1520/1600 [20:45<00:45,  1.78it/s]

{'loss': 0.0284, 'grad_norm': 0.17135672271251678, 'learning_rate': 5e-05, 'epoch': 0.95}


 96%|█████████▌| 1530/1600 [20:51<00:43,  1.60it/s]

{'loss': 0.0075, 'grad_norm': 16.33950424194336, 'learning_rate': 5e-05, 'epoch': 0.96}


 96%|█████████▋| 1540/1600 [20:56<00:30,  1.94it/s]

{'loss': 0.1959, 'grad_norm': 3.991551160812378, 'learning_rate': 5e-05, 'epoch': 0.96}


 97%|█████████▋| 1550/1600 [21:01<00:25,  1.96it/s]

{'loss': 0.3071, 'grad_norm': 0.07547168433666229, 'learning_rate': 5e-05, 'epoch': 0.97}


 98%|█████████▊| 1560/1600 [21:07<00:20,  1.95it/s]

{'loss': 0.2357, 'grad_norm': 21.5242919921875, 'learning_rate': 5e-05, 'epoch': 0.97}


 98%|█████████▊| 1570/1600 [21:12<00:15,  1.97it/s]

{'loss': 0.2929, 'grad_norm': 16.514799118041992, 'learning_rate': 5e-05, 'epoch': 0.98}


 99%|█████████▉| 1580/1600 [21:17<00:10,  1.89it/s]

{'loss': 0.1265, 'grad_norm': 20.905071258544922, 'learning_rate': 5e-05, 'epoch': 0.99}


 99%|█████████▉| 1590/1600 [21:23<00:05,  1.90it/s]

{'loss': 0.242, 'grad_norm': 11.392570495605469, 'learning_rate': 5e-05, 'epoch': 0.99}


100%|██████████| 1600/1600 [21:28<00:00,  1.84it/s]

{'loss': 0.2704, 'grad_norm': 12.915433883666992, 'learning_rate': 5e-05, 'epoch': 1.0}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.12240426242351532, 'eval_Accuracy': 0.9575, 'eval_F1': 0.9584859584859585, 'eval_runtime': 35.1206, 'eval_samples_per_second': 45.557, 'eval_steps_per_second': 5.695, 'epoch': 1.0}


100%|██████████| 1600/1600 [22:04<00:00,  1.21it/s]
Non-default generation parameters: {'max_length': 128}


{'train_runtime': 1324.5287, 'train_samples_per_second': 9.664, 'train_steps_per_second': 1.208, 'train_loss': 0.2481603025691584, 'epoch': 1.0}
***** train metrics *****
  epoch                    =         1.0
  total_flos               = 933951802GF
  train_loss               =      0.2482
  train_runtime            =  0:22:04.52
  train_samples_per_second =       9.664
  train_steps_per_second   =       1.208


100%|██████████| 200/200 [00:33<00:00,  6.05it/s]

***** eval metrics *****
  epoch                   =        1.0
  eval_Accuracy           =     0.9575
  eval_F1                 =     0.9585
  eval_loss               =     0.1224
  eval_runtime            = 0:00:33.22
  eval_samples_per_second =     48.151
  eval_steps_per_second   =      6.019



