### Imports

In [1]:
# https://huggingface.co/Organika/sdxl-detector#validation-metrics
# The code below fine-tunes the model on the art, faces, or mixed dataset (one at a time, chosen in the Parameters section) and evaluates it on that dataset's validation set
# https://huggingface.co/blog/fine-tune-vit

import os
import torch
from transformers import AutoImageProcessor, SwinForImageClassification, TrainingArguments, Trainer
import evaluate
from datasets import load_dataset
import numpy as np

# Run from the repository root so the relative dataset paths used later resolve.
# Set the THESIS_ROOT environment variable to point at your own checkout; the
# default below is the original author's location, so existing behaviour is
# unchanged when the variable is not set.
PROJECT_ROOT = os.environ.get("THESIS_ROOT", "C:/Users/metet/OneDrive/Documents/GitHub/thesis")
os.chdir(PROJECT_ROOT)  # absolute target, so re-running this cell is idempotent (unlike os.chdir(".."))
print(os.getcwd())

# CUDA check: use the GPU when PyTorch can see one, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")  # uncomment to force CPU
print(device)

  from .autonotebook import tqdm as notebook_tqdm


C:\Users\metet\OneDrive\Documents\GitHub\thesis
cuda


### Load Datasets and Processor

In [2]:
# Image processor (resize / normalisation settings) that ships with the checkpoint
processor = AutoImageProcessor.from_pretrained("Organika/sdxl-detector")

# Dataset folders, relative to the current working directory
art_dataset_path, faces_dataset_path, mixed_dataset_path = (
    'archive/datasets/art_512x512',
    'archive/datasets/faces_512x512',
    'archive/datasets/mixed_512x512',
)

# Load each folder with the "imagefolder" builder (same order as before: art, faces, mixed)
art_ds, faces_ds, mixed_ds = (
    load_dataset("imagefolder", data_dir=folder)
    for folder in (art_dataset_path, faces_dataset_path, mixed_dataset_path)
)

# Show the split layout of one of the datasets
print(faces_ds)

Downloading data: 100%|██████████| 12800/12800 [00:00<00:00, 80986.75files/s] 
Downloading data: 100%|██████████| 1600/1600 [00:00<00:00, 23762.67files/s]
Downloading data: 100%|██████████| 1600/1600 [00:00<00:00, 23438.41files/s]
Generating train split: 12800 examples [00:00, 17079.88 examples/s]
Generating validation split: 1600 examples [00:00, 17171.43 examples/s]
Generating test split: 1600 examples [00:00, 18336.01 examples/s]


DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 12800
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
})


### Setting Up

In [3]:
# Transform images to model input
def transform(image_batch):
    """Convert a batch of dataset rows into model inputs.

    Parameters
    ----------
    image_batch : dict
        Batch with an 'image' entry (handed to the image processor) and a
        'label' entry (turned into a tensor).

    Returns
    -------
    The processor output containing 'pixel_values', with a 'labels' tensor added.

    Notes
    -----
    Tensors are deliberately left on the CPU. The Hugging Face Trainer moves
    each batch to the training device itself; creating CUDA tensors inside a
    dataset transform prevents pinned-memory loading and dataloader worker
    processes from being used.
    """
    inputs = processor(images=image_batch['image'], return_tensors="pt")
    inputs['labels'] = torch.tensor(image_batch['label'])  # Ensure labels are tensors
    return inputs

# Attach the on-the-fly transform to every dataset (order: art, faces, mixed)
art_ds_transformed, faces_ds_transformed, mixed_ds_transformed = (
    dataset.with_transform(transform) for dataset in (art_ds, faces_ds, mixed_ds)
)

In [4]:
# Turn dicts into tensors
def collate_fn(batch):
    """Stack a list of per-example dicts into one batch dict.

    Parameters
    ----------
    batch : list of dict
        Each item provides 'pixel_values' (a tensor) and 'labels' (an int or a
        single-element tensor).

    Returns
    -------
    dict
        'pixel_values': the per-example tensors stacked along a new first
        dimension; 'labels': a 1-D integer tensor with one entry per example.

    Notes
    -----
    No explicit device transfer happens here: the Hugging Face Trainer moves
    the collated batch to the training device, and a collator that returns
    CUDA tensors cannot be used with pinned memory or dataloader workers.
    """
    return {
        'pixel_values': torch.stack([example['pixel_values'] for example in batch]),
        # int() accepts plain ints as well as 0-d tensors on any device
        'labels': torch.tensor([int(example['labels']) for example in batch]),
    }

In [5]:
# Metrics reported at each evaluation: accuracy and F1
acc_metric, f1_metric = (evaluate.load(metric_name) for metric_name in ('accuracy', 'f1'))


def compute_metrics(p):
    """Score one evaluation pass.

    `p.predictions` holds the per-class scores and `p.label_ids` the reference
    labels; the predicted class is the argmax over axis 1.

    Returns a dict with the keys "Accuracy" and "F1".
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    accuracy = acc_metric.compute(predictions=predicted_classes, references=p.label_ids)["accuracy"]
    f1_score = f1_metric.compute(predictions=predicted_classes, references=p.label_ids)["f1"]
    return {"Accuracy": accuracy, "F1": f1_score}

### Parameters

In [6]:
# Pick ONE dataset via `chosen_dataset`; the matching output directory follows
# automatically, so the dataset and its checkpoint directory can no longer be
# switched independently by mistake.
# NOTE(review): the faces/art/mixed <-> directory pairing follows the order of the
# original commented-out alternatives — confirm it matches earlier runs.
dataset_choices = {
    "faces": (faces_ds_transformed, "./sdxl-fine-tune"),
    "art": (art_ds_transformed, "./sdxl-fine-tune-art"),
    "mixed": (mixed_ds_transformed, "./sdxl-fine-tune-mixed"),
}
chosen_dataset = "mixed"
dataset_to_use, output_dir_name = dataset_choices[chosen_dataset]

# Candidate learning rates; `chosen_lr` is an INDEX into this list (1 -> 5e-5)
lr_values = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3]
chosen_lr = 1

### Load Model

In [7]:
# Class names taken from the 'label' feature of the training split
labels = dataset_to_use['train'].features['label'].names
print(labels[:2])

# Load the pre-trained checkpoint with label mappings derived from the dataset
# (ids are passed as strings, exactly as before), then move it to the chosen device
model = SwinForImageClassification.from_pretrained(
    "Organika/sdxl-detector",
    num_labels=len(labels),
    id2label=dict((str(idx), name) for idx, name in enumerate(labels)),
    label2id=dict((name, str(idx)) for idx, name in enumerate(labels)),
)
model = model.to(device)

['0', '1']


In [8]:
# TrainingArguments: only the learning rate is varied between runs; every other
# setting below stays fixed.
training_args = TrainingArguments(
    # --- where results go ---
    output_dir=output_dir_name,
    report_to="tensorboard",
    push_to_hub=False,
    # --- optimisation ---
    learning_rate=lr_values[chosen_lr],
    lr_scheduler_type="constant",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    fp16=False,  # True leads to runtime errors on CUDA
    # --- logging / evaluation / checkpointing cadence (in steps) ---
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- data handling ---
    remove_unused_columns=False,
    dataloader_pin_memory=False,  # otherwise it doesn't work with CUDA
)




In [9]:
# Wire model, data, collator and metrics together.
# NOTE(review): the warning printed under this cell may be about the `tokenizer=`
# argument, which newer transformers releases replace with `processing_class=` —
# confirm against the installed version before changing it.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=processor,
    train_dataset=dataset_to_use["train"],
    eval_dataset=dataset_to_use["validation"],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


### Training and Evaluation

In [10]:
# Fine-tune the model, then persist the model, the training metrics and the trainer state.
# The order matters: metrics and state only exist once train() has returned.
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

# Final evaluation on the validation split of the selected dataset.
# load_best_model_at_end=True is set in the training arguments, so this presumably
# scores the best checkpoint rather than the last step — the logged eval_loss matches
# the lowest value seen during training.
metrics = trainer.evaluate(dataset_to_use['validation'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)


  1%|          | 10/1600 [00:04<08:41,  3.05it/s]

{'loss': 2.4939, 'grad_norm': 13.925188064575195, 'learning_rate': 5e-05, 'epoch': 0.01}


  1%|▏         | 20/1600 [00:07<08:10,  3.22it/s]

{'loss': 0.7266, 'grad_norm': 17.344152450561523, 'learning_rate': 5e-05, 'epoch': 0.01}


  2%|▏         | 30/1600 [00:10<08:20,  3.14it/s]

{'loss': 0.6119, 'grad_norm': 4.185383319854736, 'learning_rate': 5e-05, 'epoch': 0.02}


  2%|▎         | 40/1600 [00:13<07:56,  3.28it/s]

{'loss': 0.5497, 'grad_norm': 23.416959762573242, 'learning_rate': 5e-05, 'epoch': 0.03}


  3%|▎         | 50/1600 [00:17<07:58,  3.24it/s]

{'loss': 0.5726, 'grad_norm': 12.55972957611084, 'learning_rate': 5e-05, 'epoch': 0.03}


  4%|▍         | 60/1600 [00:20<07:46,  3.30it/s]

{'loss': 0.5134, 'grad_norm': 11.52746868133545, 'learning_rate': 5e-05, 'epoch': 0.04}


  4%|▍         | 70/1600 [00:23<07:39,  3.33it/s]

{'loss': 0.6388, 'grad_norm': 6.0908002853393555, 'learning_rate': 5e-05, 'epoch': 0.04}


  5%|▌         | 80/1600 [00:26<07:35,  3.34it/s]

{'loss': 0.5372, 'grad_norm': 4.773994445800781, 'learning_rate': 5e-05, 'epoch': 0.05}


  6%|▌         | 90/1600 [00:29<07:34,  3.33it/s]

{'loss': 0.4827, 'grad_norm': 8.961148262023926, 'learning_rate': 5e-05, 'epoch': 0.06}


  6%|▋         | 100/1600 [00:32<07:33,  3.31it/s]

{'loss': 0.5239, 'grad_norm': 4.520536422729492, 'learning_rate': 5e-05, 'epoch': 0.06}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4232608675956726, 'eval_Accuracy': 0.779375, 'eval_F1': 0.8022408963585435, 'eval_runtime': 24.4983, 'eval_samples_per_second': 65.311, 'eval_steps_per_second': 8.164, 'epoch': 0.06}


  7%|▋         | 110/1600 [01:00<15:35,  1.59it/s]  

{'loss': 0.611, 'grad_norm': 31.996339797973633, 'learning_rate': 5e-05, 'epoch': 0.07}


  8%|▊         | 120/1600 [01:04<07:46,  3.17it/s]

{'loss': 0.4829, 'grad_norm': 76.7015151977539, 'learning_rate': 5e-05, 'epoch': 0.07}


  8%|▊         | 130/1600 [01:07<07:27,  3.28it/s]

{'loss': 0.5501, 'grad_norm': 12.816143989562988, 'learning_rate': 5e-05, 'epoch': 0.08}


  9%|▉         | 140/1600 [01:10<07:36,  3.20it/s]

{'loss': 0.4466, 'grad_norm': 7.349215507507324, 'learning_rate': 5e-05, 'epoch': 0.09}


  9%|▉         | 150/1600 [01:13<07:31,  3.21it/s]

{'loss': 0.4987, 'grad_norm': 10.939946174621582, 'learning_rate': 5e-05, 'epoch': 0.09}


 10%|█         | 160/1600 [01:16<07:25,  3.23it/s]

{'loss': 0.4189, 'grad_norm': 12.188812255859375, 'learning_rate': 5e-05, 'epoch': 0.1}


 11%|█         | 170/1600 [01:19<07:28,  3.19it/s]

{'loss': 0.363, 'grad_norm': 9.298203468322754, 'learning_rate': 5e-05, 'epoch': 0.11}


 11%|█▏        | 180/1600 [01:22<07:16,  3.25it/s]

{'loss': 0.4514, 'grad_norm': 9.233771324157715, 'learning_rate': 5e-05, 'epoch': 0.11}


 12%|█▏        | 190/1600 [01:25<07:08,  3.29it/s]

{'loss': 0.3842, 'grad_norm': 48.94574737548828, 'learning_rate': 5e-05, 'epoch': 0.12}


 12%|█▎        | 200/1600 [01:28<07:05,  3.29it/s]

{'loss': 0.5595, 'grad_norm': 30.600263595581055, 'learning_rate': 5e-05, 'epoch': 0.12}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.31133976578712463, 'eval_Accuracy': 0.85875, 'eval_F1': 0.8522875816993464, 'eval_runtime': 24.0585, 'eval_samples_per_second': 66.504, 'eval_steps_per_second': 8.313, 'epoch': 0.12}


 13%|█▎        | 210/1600 [01:57<14:31,  1.59it/s]  

{'loss': 0.5422, 'grad_norm': 16.30881690979004, 'learning_rate': 5e-05, 'epoch': 0.13}


 14%|█▍        | 220/1600 [02:00<07:13,  3.18it/s]

{'loss': 0.4361, 'grad_norm': 9.528964042663574, 'learning_rate': 5e-05, 'epoch': 0.14}


 14%|█▍        | 230/1600 [02:03<07:12,  3.17it/s]

{'loss': 0.4182, 'grad_norm': 19.831743240356445, 'learning_rate': 5e-05, 'epoch': 0.14}


 15%|█▌        | 240/1600 [02:06<07:24,  3.06it/s]

{'loss': 0.42, 'grad_norm': 2.4823875427246094, 'learning_rate': 5e-05, 'epoch': 0.15}


 16%|█▌        | 250/1600 [02:10<07:15,  3.10it/s]

{'loss': 0.7429, 'grad_norm': 10.774359703063965, 'learning_rate': 5e-05, 'epoch': 0.16}


 16%|█▋        | 260/1600 [02:13<06:54,  3.23it/s]

{'loss': 0.4195, 'grad_norm': 3.48520565032959, 'learning_rate': 5e-05, 'epoch': 0.16}


 17%|█▋        | 270/1600 [02:16<06:55,  3.20it/s]

{'loss': 0.4861, 'grad_norm': 1.8857401609420776, 'learning_rate': 5e-05, 'epoch': 0.17}


 18%|█▊        | 280/1600 [02:19<06:44,  3.26it/s]

{'loss': 0.315, 'grad_norm': 23.483510971069336, 'learning_rate': 5e-05, 'epoch': 0.17}


 18%|█▊        | 290/1600 [02:22<06:42,  3.25it/s]

{'loss': 0.3712, 'grad_norm': 8.945442199707031, 'learning_rate': 5e-05, 'epoch': 0.18}


 19%|█▉        | 300/1600 [02:25<06:46,  3.20it/s]

{'loss': 0.3773, 'grad_norm': 7.268340110778809, 'learning_rate': 5e-05, 'epoch': 0.19}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.28302857279777527, 'eval_Accuracy': 0.878125, 'eval_F1': 0.875717017208413, 'eval_runtime': 24.1771, 'eval_samples_per_second': 66.178, 'eval_steps_per_second': 8.272, 'epoch': 0.19}


 19%|█▉        | 310/1600 [02:54<13:29,  1.59it/s]  

{'loss': 0.3047, 'grad_norm': 28.944496154785156, 'learning_rate': 5e-05, 'epoch': 0.19}


 20%|██        | 320/1600 [02:57<07:00,  3.05it/s]

{'loss': 0.4562, 'grad_norm': 6.502966403961182, 'learning_rate': 5e-05, 'epoch': 0.2}


 21%|██        | 330/1600 [03:00<06:34,  3.22it/s]

{'loss': 0.1679, 'grad_norm': 11.410440444946289, 'learning_rate': 5e-05, 'epoch': 0.21}


 21%|██▏       | 340/1600 [03:03<06:27,  3.26it/s]

{'loss': 0.3752, 'grad_norm': 30.359338760375977, 'learning_rate': 5e-05, 'epoch': 0.21}


 22%|██▏       | 350/1600 [03:06<06:44,  3.09it/s]

{'loss': 0.3213, 'grad_norm': 26.002593994140625, 'learning_rate': 5e-05, 'epoch': 0.22}


 22%|██▎       | 360/1600 [03:10<06:58,  2.96it/s]

{'loss': 0.4231, 'grad_norm': 20.15648078918457, 'learning_rate': 5e-05, 'epoch': 0.23}


 23%|██▎       | 370/1600 [03:13<06:41,  3.06it/s]

{'loss': 0.6162, 'grad_norm': 5.3670830726623535, 'learning_rate': 5e-05, 'epoch': 0.23}


 24%|██▍       | 380/1600 [03:16<06:19,  3.22it/s]

{'loss': 0.3046, 'grad_norm': 5.21773624420166, 'learning_rate': 5e-05, 'epoch': 0.24}


 24%|██▍       | 390/1600 [03:19<06:13,  3.24it/s]

{'loss': 0.3384, 'grad_norm': 14.512846946716309, 'learning_rate': 5e-05, 'epoch': 0.24}


 25%|██▌       | 400/1600 [03:22<06:01,  3.32it/s]

{'loss': 0.4736, 'grad_norm': 15.89256763458252, 'learning_rate': 5e-05, 'epoch': 0.25}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.27051153779029846, 'eval_Accuracy': 0.89375, 'eval_F1': 0.8894668400520156, 'eval_runtime': 23.7808, 'eval_samples_per_second': 67.281, 'eval_steps_per_second': 8.41, 'epoch': 0.25}


 26%|██▌       | 410/1600 [03:50<11:53,  1.67it/s]  

{'loss': 0.4105, 'grad_norm': 20.33751106262207, 'learning_rate': 5e-05, 'epoch': 0.26}


 26%|██▋       | 420/1600 [03:53<05:49,  3.37it/s]

{'loss': 0.3387, 'grad_norm': 13.257974624633789, 'learning_rate': 5e-05, 'epoch': 0.26}


 27%|██▋       | 430/1600 [03:56<05:53,  3.31it/s]

{'loss': 0.2043, 'grad_norm': 15.173238754272461, 'learning_rate': 5e-05, 'epoch': 0.27}


 28%|██▊       | 440/1600 [03:59<05:48,  3.33it/s]

{'loss': 0.2483, 'grad_norm': 19.80600929260254, 'learning_rate': 5e-05, 'epoch': 0.28}


 28%|██▊       | 450/1600 [04:02<05:39,  3.39it/s]

{'loss': 0.3901, 'grad_norm': 39.85683059692383, 'learning_rate': 5e-05, 'epoch': 0.28}


 29%|██▉       | 460/1600 [04:05<05:29,  3.46it/s]

{'loss': 0.3372, 'grad_norm': 12.234780311584473, 'learning_rate': 5e-05, 'epoch': 0.29}


 29%|██▉       | 470/1600 [04:08<05:30,  3.42it/s]

{'loss': 0.2825, 'grad_norm': 6.321074962615967, 'learning_rate': 5e-05, 'epoch': 0.29}


 30%|███       | 480/1600 [04:11<05:25,  3.44it/s]

{'loss': 0.2408, 'grad_norm': 22.072826385498047, 'learning_rate': 5e-05, 'epoch': 0.3}


 31%|███       | 490/1600 [04:14<05:22,  3.44it/s]

{'loss': 0.2958, 'grad_norm': 3.6377885341644287, 'learning_rate': 5e-05, 'epoch': 0.31}


 31%|███▏      | 500/1600 [04:17<05:18,  3.46it/s]

{'loss': 0.1018, 'grad_norm': 10.307220458984375, 'learning_rate': 5e-05, 'epoch': 0.31}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.4183339774608612, 'eval_Accuracy': 0.888125, 'eval_F1': 0.8902513795217658, 'eval_runtime': 23.4457, 'eval_samples_per_second': 68.243, 'eval_steps_per_second': 8.53, 'epoch': 0.31}


 32%|███▏      | 510/1600 [04:44<10:42,  1.70it/s]  

{'loss': 0.5471, 'grad_norm': 44.50299072265625, 'learning_rate': 5e-05, 'epoch': 0.32}


 32%|███▎      | 520/1600 [04:47<05:20,  3.37it/s]

{'loss': 1.0482, 'grad_norm': 94.87635040283203, 'learning_rate': 5e-05, 'epoch': 0.33}


 33%|███▎      | 530/1600 [04:50<05:10,  3.45it/s]

{'loss': 0.2145, 'grad_norm': 3.260503053665161, 'learning_rate': 5e-05, 'epoch': 0.33}


 34%|███▍      | 540/1600 [04:53<05:03,  3.49it/s]

{'loss': 0.273, 'grad_norm': 2.270082473754883, 'learning_rate': 5e-05, 'epoch': 0.34}


 34%|███▍      | 550/1600 [04:56<05:01,  3.48it/s]

{'loss': 0.3532, 'grad_norm': 20.731204986572266, 'learning_rate': 5e-05, 'epoch': 0.34}


 35%|███▌      | 560/1600 [04:59<04:59,  3.48it/s]

{'loss': 0.2666, 'grad_norm': 0.429451584815979, 'learning_rate': 5e-05, 'epoch': 0.35}


 36%|███▌      | 570/1600 [05:02<05:33,  3.09it/s]

{'loss': 0.3452, 'grad_norm': 2.95947527885437, 'learning_rate': 5e-05, 'epoch': 0.36}


 36%|███▋      | 580/1600 [05:05<05:25,  3.13it/s]

{'loss': 0.3825, 'grad_norm': 16.681726455688477, 'learning_rate': 5e-05, 'epoch': 0.36}


 37%|███▋      | 590/1600 [05:08<05:27,  3.08it/s]

{'loss': 0.2725, 'grad_norm': 9.169682502746582, 'learning_rate': 5e-05, 'epoch': 0.37}


 38%|███▊      | 600/1600 [05:12<05:13,  3.19it/s]

{'loss': 0.605, 'grad_norm': 4.436264514923096, 'learning_rate': 5e-05, 'epoch': 0.38}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.30898261070251465, 'eval_Accuracy': 0.87625, 'eval_F1': 0.8638239339752407, 'eval_runtime': 23.8451, 'eval_samples_per_second': 67.1, 'eval_steps_per_second': 8.387, 'epoch': 0.38}


 38%|███▊      | 610/1600 [05:40<10:03,  1.64it/s]  

{'loss': 0.2646, 'grad_norm': 27.496063232421875, 'learning_rate': 5e-05, 'epoch': 0.38}


 39%|███▉      | 620/1600 [05:43<05:08,  3.17it/s]

{'loss': 0.3874, 'grad_norm': 25.984861373901367, 'learning_rate': 5e-05, 'epoch': 0.39}


 39%|███▉      | 630/1600 [05:46<04:40,  3.46it/s]

{'loss': 0.2135, 'grad_norm': 2.5820374488830566, 'learning_rate': 5e-05, 'epoch': 0.39}


 40%|████      | 640/1600 [05:48<04:38,  3.45it/s]

{'loss': 0.1611, 'grad_norm': 7.210771083831787, 'learning_rate': 5e-05, 'epoch': 0.4}


 41%|████      | 650/1600 [05:51<04:35,  3.45it/s]

{'loss': 0.3387, 'grad_norm': 52.318153381347656, 'learning_rate': 5e-05, 'epoch': 0.41}


 41%|████▏     | 660/1600 [05:54<04:30,  3.47it/s]

{'loss': 0.4373, 'grad_norm': 54.739498138427734, 'learning_rate': 5e-05, 'epoch': 0.41}


 42%|████▏     | 670/1600 [05:57<04:28,  3.47it/s]

{'loss': 0.5922, 'grad_norm': 67.9873275756836, 'learning_rate': 5e-05, 'epoch': 0.42}


 42%|████▎     | 680/1600 [06:00<04:27,  3.44it/s]

{'loss': 0.1611, 'grad_norm': 8.063993453979492, 'learning_rate': 5e-05, 'epoch': 0.42}


 43%|████▎     | 690/1600 [06:03<04:24,  3.44it/s]

{'loss': 0.298, 'grad_norm': 1.4247015714645386, 'learning_rate': 5e-05, 'epoch': 0.43}


 44%|████▍     | 700/1600 [06:06<04:21,  3.44it/s]

{'loss': 0.3649, 'grad_norm': 20.284923553466797, 'learning_rate': 5e-05, 'epoch': 0.44}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.3796635866165161, 'eval_Accuracy': 0.888125, 'eval_F1': 0.8940201302545885, 'eval_runtime': 24.3495, 'eval_samples_per_second': 65.71, 'eval_steps_per_second': 8.214, 'epoch': 0.44}


 44%|████▍     | 710/1600 [06:34<08:52,  1.67it/s]  

{'loss': 0.245, 'grad_norm': 33.98774719238281, 'learning_rate': 5e-05, 'epoch': 0.44}


 45%|████▌     | 720/1600 [06:37<04:25,  3.31it/s]

{'loss': 0.2579, 'grad_norm': 19.202991485595703, 'learning_rate': 5e-05, 'epoch': 0.45}


 46%|████▌     | 730/1600 [06:40<04:13,  3.43it/s]

{'loss': 0.2712, 'grad_norm': 7.076483726501465, 'learning_rate': 5e-05, 'epoch': 0.46}


 46%|████▋     | 740/1600 [06:43<04:11,  3.42it/s]

{'loss': 0.2456, 'grad_norm': 223.82003784179688, 'learning_rate': 5e-05, 'epoch': 0.46}


 47%|████▋     | 750/1600 [06:46<04:06,  3.45it/s]

{'loss': 0.3423, 'grad_norm': 36.450531005859375, 'learning_rate': 5e-05, 'epoch': 0.47}


 48%|████▊     | 760/1600 [06:49<04:02,  3.46it/s]

{'loss': 0.3755, 'grad_norm': 2.0549046993255615, 'learning_rate': 5e-05, 'epoch': 0.47}


 48%|████▊     | 770/1600 [06:52<04:01,  3.44it/s]

{'loss': 0.286, 'grad_norm': 16.904888153076172, 'learning_rate': 5e-05, 'epoch': 0.48}


 49%|████▉     | 780/1600 [06:55<03:58,  3.44it/s]

{'loss': 0.3329, 'grad_norm': 23.49013328552246, 'learning_rate': 5e-05, 'epoch': 0.49}


 49%|████▉     | 790/1600 [06:58<03:52,  3.48it/s]

{'loss': 0.2143, 'grad_norm': 135.09742736816406, 'learning_rate': 5e-05, 'epoch': 0.49}


 50%|█████     | 800/1600 [07:01<03:52,  3.44it/s]

{'loss': 0.3095, 'grad_norm': 30.65654754638672, 'learning_rate': 5e-05, 'epoch': 0.5}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.21605069935321808, 'eval_Accuracy': 0.92375, 'eval_F1': 0.9251533742331288, 'eval_runtime': 23.6853, 'eval_samples_per_second': 67.553, 'eval_steps_per_second': 8.444, 'epoch': 0.5}


 51%|█████     | 810/1600 [07:28<07:52,  1.67it/s]  

{'loss': 0.2805, 'grad_norm': 17.58245849609375, 'learning_rate': 5e-05, 'epoch': 0.51}


 51%|█████▏    | 820/1600 [07:31<03:56,  3.29it/s]

{'loss': 0.194, 'grad_norm': 17.896427154541016, 'learning_rate': 5e-05, 'epoch': 0.51}


 52%|█████▏    | 830/1600 [07:34<03:46,  3.40it/s]

{'loss': 0.434, 'grad_norm': 13.689750671386719, 'learning_rate': 5e-05, 'epoch': 0.52}


 52%|█████▎    | 840/1600 [07:37<03:40,  3.44it/s]

{'loss': 0.3501, 'grad_norm': 15.066142082214355, 'learning_rate': 5e-05, 'epoch': 0.53}


 53%|█████▎    | 850/1600 [07:40<03:43,  3.36it/s]

{'loss': 0.2254, 'grad_norm': 3.0854132175445557, 'learning_rate': 5e-05, 'epoch': 0.53}


 54%|█████▍    | 860/1600 [07:43<03:36,  3.41it/s]

{'loss': 0.2236, 'grad_norm': 43.55888366699219, 'learning_rate': 5e-05, 'epoch': 0.54}


 54%|█████▍    | 870/1600 [07:46<03:35,  3.38it/s]

{'loss': 0.5714, 'grad_norm': 2.2536611557006836, 'learning_rate': 5e-05, 'epoch': 0.54}


 55%|█████▌    | 880/1600 [07:49<03:32,  3.38it/s]

{'loss': 0.2105, 'grad_norm': 30.37362289428711, 'learning_rate': 5e-05, 'epoch': 0.55}


 56%|█████▌    | 890/1600 [07:52<03:27,  3.42it/s]

{'loss': 0.318, 'grad_norm': 51.22108459472656, 'learning_rate': 5e-05, 'epoch': 0.56}


 56%|█████▋    | 900/1600 [07:55<03:25,  3.40it/s]

{'loss': 0.214, 'grad_norm': 46.44112014770508, 'learning_rate': 5e-05, 'epoch': 0.56}


                                                  
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.20512565970420837, 'eval_Accuracy': 0.92375, 'eval_F1': 0.9263285024154589, 'eval_runtime': 23.9154, 'eval_samples_per_second': 66.903, 'eval_steps_per_second': 8.363, 'epoch': 0.56}


 57%|█████▋    | 910/1600 [08:23<06:53,  1.67it/s]  

{'loss': 0.1274, 'grad_norm': 12.40531063079834, 'learning_rate': 5e-05, 'epoch': 0.57}


 57%|█████▊    | 920/1600 [08:26<03:25,  3.31it/s]

{'loss': 0.2533, 'grad_norm': 17.65537452697754, 'learning_rate': 5e-05, 'epoch': 0.57}


 58%|█████▊    | 930/1600 [08:29<03:13,  3.45it/s]

{'loss': 0.2904, 'grad_norm': 25.103790283203125, 'learning_rate': 5e-05, 'epoch': 0.58}


 59%|█████▉    | 940/1600 [08:32<03:12,  3.43it/s]

{'loss': 0.0826, 'grad_norm': 0.06988082081079483, 'learning_rate': 5e-05, 'epoch': 0.59}


 59%|█████▉    | 950/1600 [08:35<03:13,  3.36it/s]

{'loss': 0.1203, 'grad_norm': 0.07358479499816895, 'learning_rate': 5e-05, 'epoch': 0.59}


 60%|██████    | 960/1600 [08:38<03:12,  3.32it/s]

{'loss': 0.1152, 'grad_norm': 0.8529381155967712, 'learning_rate': 5e-05, 'epoch': 0.6}


 61%|██████    | 970/1600 [08:41<03:12,  3.27it/s]

{'loss': 0.1588, 'grad_norm': 0.9264557361602783, 'learning_rate': 5e-05, 'epoch': 0.61}


 61%|██████▏   | 980/1600 [08:44<03:03,  3.37it/s]

{'loss': 0.2004, 'grad_norm': 52.9069709777832, 'learning_rate': 5e-05, 'epoch': 0.61}


 62%|██████▏   | 990/1600 [08:47<02:59,  3.40it/s]

{'loss': 0.3781, 'grad_norm': 2.362891912460327, 'learning_rate': 5e-05, 'epoch': 0.62}


 62%|██████▎   | 1000/1600 [08:50<03:01,  3.31it/s]

{'loss': 0.2727, 'grad_norm': 33.64863967895508, 'learning_rate': 5e-05, 'epoch': 0.62}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.22475643455982208, 'eval_Accuracy': 0.94, 'eval_F1': 0.9382239382239382, 'eval_runtime': 23.9012, 'eval_samples_per_second': 66.942, 'eval_steps_per_second': 8.368, 'epoch': 0.62}


 63%|██████▎   | 1010/1600 [09:18<05:53,  1.67it/s]  

{'loss': 0.4645, 'grad_norm': 12.781903266906738, 'learning_rate': 5e-05, 'epoch': 0.63}


 64%|██████▍   | 1020/1600 [09:21<03:02,  3.17it/s]

{'loss': 0.2731, 'grad_norm': 40.1346321105957, 'learning_rate': 5e-05, 'epoch': 0.64}


 64%|██████▍   | 1030/1600 [09:24<02:52,  3.29it/s]

{'loss': 0.1103, 'grad_norm': 16.299917221069336, 'learning_rate': 5e-05, 'epoch': 0.64}


 65%|██████▌   | 1040/1600 [09:27<02:44,  3.40it/s]

{'loss': 0.2865, 'grad_norm': 65.38807678222656, 'learning_rate': 5e-05, 'epoch': 0.65}


 66%|██████▌   | 1050/1600 [09:30<02:42,  3.39it/s]

{'loss': 0.3831, 'grad_norm': 34.36532974243164, 'learning_rate': 5e-05, 'epoch': 0.66}


 66%|██████▋   | 1060/1600 [09:33<02:42,  3.32it/s]

{'loss': 0.345, 'grad_norm': 12.605340003967285, 'learning_rate': 5e-05, 'epoch': 0.66}


 67%|██████▋   | 1070/1600 [09:36<02:38,  3.35it/s]

{'loss': 0.3811, 'grad_norm': 54.09650802612305, 'learning_rate': 5e-05, 'epoch': 0.67}


 68%|██████▊   | 1080/1600 [09:39<02:39,  3.25it/s]

{'loss': 0.0796, 'grad_norm': 18.762699127197266, 'learning_rate': 5e-05, 'epoch': 0.68}


 68%|██████▊   | 1090/1600 [09:42<02:38,  3.23it/s]

{'loss': 0.2631, 'grad_norm': 28.536617279052734, 'learning_rate': 5e-05, 'epoch': 0.68}


 69%|██████▉   | 1100/1600 [09:45<02:31,  3.29it/s]

{'loss': 0.3203, 'grad_norm': 49.52091979980469, 'learning_rate': 5e-05, 'epoch': 0.69}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.33766159415245056, 'eval_Accuracy': 0.8925, 'eval_F1': 0.9020501138952164, 'eval_runtime': 24.0954, 'eval_samples_per_second': 66.403, 'eval_steps_per_second': 8.3, 'epoch': 0.69}


 69%|██████▉   | 1110/1600 [10:13<04:56,  1.65it/s]  

{'loss': 0.2454, 'grad_norm': 46.88303756713867, 'learning_rate': 5e-05, 'epoch': 0.69}


 70%|███████   | 1120/1600 [10:16<02:26,  3.28it/s]

{'loss': 0.2352, 'grad_norm': 0.6582671403884888, 'learning_rate': 5e-05, 'epoch': 0.7}


 71%|███████   | 1130/1600 [10:19<02:18,  3.39it/s]

{'loss': 0.1709, 'grad_norm': 6.01000452041626, 'learning_rate': 5e-05, 'epoch': 0.71}


 71%|███████▏  | 1140/1600 [10:23<02:16,  3.37it/s]

{'loss': 0.5931, 'grad_norm': 0.2548207938671112, 'learning_rate': 5e-05, 'epoch': 0.71}


 72%|███████▏  | 1150/1600 [10:26<02:14,  3.34it/s]

{'loss': 0.1634, 'grad_norm': 25.127058029174805, 'learning_rate': 5e-05, 'epoch': 0.72}


 72%|███████▎  | 1160/1600 [10:29<02:12,  3.33it/s]

{'loss': 0.1808, 'grad_norm': 1.4642853736877441, 'learning_rate': 5e-05, 'epoch': 0.72}


 73%|███████▎  | 1170/1600 [10:32<02:06,  3.39it/s]

{'loss': 0.1418, 'grad_norm': 2.262974739074707, 'learning_rate': 5e-05, 'epoch': 0.73}


 74%|███████▍  | 1180/1600 [10:35<02:20,  2.99it/s]

{'loss': 0.0887, 'grad_norm': 17.84630012512207, 'learning_rate': 5e-05, 'epoch': 0.74}


 74%|███████▍  | 1190/1600 [10:38<02:01,  3.38it/s]

{'loss': 0.1897, 'grad_norm': 39.81299591064453, 'learning_rate': 5e-05, 'epoch': 0.74}


 75%|███████▌  | 1200/1600 [10:41<02:16,  2.93it/s]

{'loss': 0.6032, 'grad_norm': 37.229915618896484, 'learning_rate': 5e-05, 'epoch': 0.75}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.13484030961990356, 'eval_Accuracy': 0.958125, 'eval_F1': 0.9584109248913718, 'eval_runtime': 24.856, 'eval_samples_per_second': 64.371, 'eval_steps_per_second': 8.046, 'epoch': 0.75}


 76%|███████▌  | 1210/1600 [11:10<04:01,  1.61it/s]

{'loss': 0.0697, 'grad_norm': 26.869001388549805, 'learning_rate': 5e-05, 'epoch': 0.76}


 76%|███████▋  | 1220/1600 [11:13<01:54,  3.33it/s]

{'loss': 0.1742, 'grad_norm': 24.89160919189453, 'learning_rate': 5e-05, 'epoch': 0.76}


 77%|███████▋  | 1230/1600 [11:16<01:47,  3.43it/s]

{'loss': 0.0834, 'grad_norm': 27.75754737854004, 'learning_rate': 5e-05, 'epoch': 0.77}


 78%|███████▊  | 1240/1600 [11:19<01:46,  3.37it/s]

{'loss': 0.0945, 'grad_norm': 1.3320624828338623, 'learning_rate': 5e-05, 'epoch': 0.78}


 78%|███████▊  | 1250/1600 [11:22<01:43,  3.40it/s]

{'loss': 0.2617, 'grad_norm': 57.199195861816406, 'learning_rate': 5e-05, 'epoch': 0.78}


 79%|███████▉  | 1260/1600 [11:25<01:39,  3.42it/s]

{'loss': 0.4401, 'grad_norm': 7.72467041015625, 'learning_rate': 5e-05, 'epoch': 0.79}


 79%|███████▉  | 1270/1600 [11:28<01:36,  3.43it/s]

{'loss': 0.2158, 'grad_norm': 13.536377906799316, 'learning_rate': 5e-05, 'epoch': 0.79}


 80%|████████  | 1280/1600 [11:31<01:34,  3.39it/s]

{'loss': 0.2449, 'grad_norm': 18.19223403930664, 'learning_rate': 5e-05, 'epoch': 0.8}


 81%|████████  | 1290/1600 [11:34<01:31,  3.40it/s]

{'loss': 0.2183, 'grad_norm': 0.05869515240192413, 'learning_rate': 5e-05, 'epoch': 0.81}


 81%|████████▏ | 1300/1600 [11:37<01:31,  3.29it/s]

{'loss': 0.2561, 'grad_norm': 0.6556193828582764, 'learning_rate': 5e-05, 'epoch': 0.81}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.14793890714645386, 'eval_Accuracy': 0.9525, 'eval_F1': 0.9526184538653366, 'eval_runtime': 23.3909, 'eval_samples_per_second': 68.403, 'eval_steps_per_second': 8.55, 'epoch': 0.81}


 82%|████████▏ | 1310/1600 [12:04<02:55,  1.65it/s]

{'loss': 0.1217, 'grad_norm': 31.595930099487305, 'learning_rate': 5e-05, 'epoch': 0.82}


 82%|████████▎ | 1320/1600 [12:07<01:28,  3.18it/s]

{'loss': 0.0303, 'grad_norm': 0.1062818244099617, 'learning_rate': 5e-05, 'epoch': 0.82}


 83%|████████▎ | 1330/1600 [12:10<01:22,  3.29it/s]

{'loss': 0.3674, 'grad_norm': 0.01570832170546055, 'learning_rate': 5e-05, 'epoch': 0.83}


 84%|████████▍ | 1340/1600 [12:13<01:18,  3.33it/s]

{'loss': 0.4491, 'grad_norm': 33.253570556640625, 'learning_rate': 5e-05, 'epoch': 0.84}


 84%|████████▍ | 1350/1600 [12:17<01:16,  3.27it/s]

{'loss': 0.2068, 'grad_norm': 0.064262256026268, 'learning_rate': 5e-05, 'epoch': 0.84}


 85%|████████▌ | 1360/1600 [12:20<01:12,  3.30it/s]

{'loss': 0.3204, 'grad_norm': 19.697368621826172, 'learning_rate': 5e-05, 'epoch': 0.85}


 86%|████████▌ | 1370/1600 [12:23<01:13,  3.14it/s]

{'loss': 0.1713, 'grad_norm': 1.783613920211792, 'learning_rate': 5e-05, 'epoch': 0.86}


 86%|████████▋ | 1380/1600 [12:26<01:05,  3.37it/s]

{'loss': 0.1225, 'grad_norm': 18.568628311157227, 'learning_rate': 5e-05, 'epoch': 0.86}


 87%|████████▋ | 1390/1600 [12:29<01:04,  3.28it/s]

{'loss': 0.154, 'grad_norm': 46.515140533447266, 'learning_rate': 5e-05, 'epoch': 0.87}


 88%|████████▊ | 1400/1600 [12:32<00:59,  3.36it/s]

{'loss': 0.2245, 'grad_norm': 0.007170724682509899, 'learning_rate': 5e-05, 'epoch': 0.88}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.24180664122104645, 'eval_Accuracy': 0.930625, 'eval_F1': 0.933572710951526, 'eval_runtime': 23.6753, 'eval_samples_per_second': 67.581, 'eval_steps_per_second': 8.448, 'epoch': 0.88}


 88%|████████▊ | 1410/1600 [12:59<01:54,  1.65it/s]

{'loss': 0.5237, 'grad_norm': 5.213416576385498, 'learning_rate': 5e-05, 'epoch': 0.88}


 89%|████████▉ | 1420/1600 [13:02<00:54,  3.32it/s]

{'loss': 0.2658, 'grad_norm': 45.02799987792969, 'learning_rate': 5e-05, 'epoch': 0.89}


 89%|████████▉ | 1430/1600 [13:05<00:50,  3.37it/s]

{'loss': 0.3819, 'grad_norm': 23.574384689331055, 'learning_rate': 5e-05, 'epoch': 0.89}


 90%|█████████ | 1440/1600 [13:08<00:47,  3.39it/s]

{'loss': 0.2255, 'grad_norm': 20.384963989257812, 'learning_rate': 5e-05, 'epoch': 0.9}


 91%|█████████ | 1450/1600 [13:11<00:43,  3.41it/s]

{'loss': 0.0927, 'grad_norm': 8.812178611755371, 'learning_rate': 5e-05, 'epoch': 0.91}


 91%|█████████▏| 1460/1600 [13:15<00:40,  3.42it/s]

{'loss': 0.3095, 'grad_norm': 44.572654724121094, 'learning_rate': 5e-05, 'epoch': 0.91}


 92%|█████████▏| 1470/1600 [13:18<00:38,  3.41it/s]

{'loss': 0.2489, 'grad_norm': 36.9034423828125, 'learning_rate': 5e-05, 'epoch': 0.92}


 92%|█████████▎| 1480/1600 [13:21<00:34,  3.47it/s]

{'loss': 0.0869, 'grad_norm': 0.007206608075648546, 'learning_rate': 5e-05, 'epoch': 0.93}


 93%|█████████▎| 1490/1600 [13:23<00:32,  3.38it/s]

{'loss': 0.3813, 'grad_norm': 10.001925468444824, 'learning_rate': 5e-05, 'epoch': 0.93}


 94%|█████████▍| 1500/1600 [13:26<00:29,  3.39it/s]

{'loss': 0.2682, 'grad_norm': 4.3562188148498535, 'learning_rate': 5e-05, 'epoch': 0.94}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.2044125199317932, 'eval_Accuracy': 0.93875, 'eval_F1': 0.9398034398034398, 'eval_runtime': 23.6251, 'eval_samples_per_second': 67.725, 'eval_steps_per_second': 8.466, 'epoch': 0.94}


 94%|█████████▍| 1510/1600 [13:54<00:53,  1.68it/s]

{'loss': 0.3139, 'grad_norm': 9.698267936706543, 'learning_rate': 5e-05, 'epoch': 0.94}


 95%|█████████▌| 1520/1600 [13:57<00:24,  3.27it/s]

{'loss': 0.1325, 'grad_norm': 4.749520778656006, 'learning_rate': 5e-05, 'epoch': 0.95}


 96%|█████████▌| 1530/1600 [14:00<00:20,  3.40it/s]

{'loss': 0.1578, 'grad_norm': 7.694940090179443, 'learning_rate': 5e-05, 'epoch': 0.96}


 96%|█████████▋| 1540/1600 [14:03<00:17,  3.44it/s]

{'loss': 0.2627, 'grad_norm': 0.28473103046417236, 'learning_rate': 5e-05, 'epoch': 0.96}


 97%|█████████▋| 1550/1600 [14:06<00:14,  3.43it/s]

{'loss': 0.2622, 'grad_norm': 0.7592993974685669, 'learning_rate': 5e-05, 'epoch': 0.97}


 98%|█████████▊| 1560/1600 [14:09<00:12,  3.31it/s]

{'loss': 0.1398, 'grad_norm': 34.821685791015625, 'learning_rate': 5e-05, 'epoch': 0.97}


 98%|█████████▊| 1570/1600 [14:12<00:08,  3.40it/s]

{'loss': 0.2597, 'grad_norm': 0.02084101364016533, 'learning_rate': 5e-05, 'epoch': 0.98}


 99%|█████████▉| 1580/1600 [14:15<00:05,  3.40it/s]

{'loss': 0.083, 'grad_norm': 0.0178854838013649, 'learning_rate': 5e-05, 'epoch': 0.99}


 99%|█████████▉| 1590/1600 [14:18<00:02,  3.39it/s]

{'loss': 0.1552, 'grad_norm': 0.23818464577198029, 'learning_rate': 5e-05, 'epoch': 0.99}


100%|██████████| 1600/1600 [14:21<00:00,  3.49it/s]

{'loss': 0.0203, 'grad_norm': 6.915718078613281, 'learning_rate': 5e-05, 'epoch': 1.0}


                                                   
Non-default generation parameters: {'max_length': 128}


{'eval_loss': 0.22189074754714966, 'eval_Accuracy': 0.95, 'eval_F1': 0.9499374217772215, 'eval_runtime': 23.8825, 'eval_samples_per_second': 66.995, 'eval_steps_per_second': 8.374, 'epoch': 1.0}


100%|██████████| 1600/1600 [14:46<00:00,  1.80it/s]
Non-default generation parameters: {'max_length': 128}


{'train_runtime': 886.5052, 'train_samples_per_second': 14.439, 'train_steps_per_second': 1.805, 'train_loss': 0.33325550756417216, 'epoch': 1.0}
***** train metrics *****
  epoch                    =         1.0
  total_flos               = 933951802GF
  train_loss               =      0.3333
  train_runtime            =  0:14:46.50
  train_samples_per_second =      14.439
  train_steps_per_second   =       1.805


100%|██████████| 200/200 [00:23<00:00,  8.61it/s]

***** eval metrics *****
  epoch                   =        1.0
  eval_Accuracy           =     0.9581
  eval_F1                 =     0.9584
  eval_loss               =     0.1348
  eval_runtime            = 0:00:23.39
  eval_samples_per_second =     68.405
  eval_steps_per_second   =      8.551



