### Imports

In [66]:
# https://huggingface.co/Organika/sdxl-detector#validation-metrics
# The code below fine-tunes the model on the faces and art datasets (separately) and evaluates the model on their validation sets
# https://huggingface.co/blog/fine-tune-vit

import os
import torch
from transformers import AutoImageProcessor, SwinForImageClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
import numpy as np
import evaluate

# The dataset paths used further down are relative, so the working directory
# is switched to the project root first. The root can be overridden with the
# THESIS_ROOT environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original location is used.
print(os.getcwd())
os.chdir(os.environ.get('THESIS_ROOT', 'e:\\Projects & Temp\\GitHub\\thesis'))

# CUDA check: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

e:\Projects & Temp\GitHub\thesis


### Load Datasets and Processor

In [67]:
# Image processor that ships with the pre-trained checkpoint (the model
# itself is loaded later, once the label set is known)
processor = AutoImageProcessor.from_pretrained("Organika/sdxl-detector")

# Dataset locations, relative to the working directory
art_dataset_path = 'archive/datasets/art_512x512'
faces_dataset_path = 'archive/datasets/faces_512x512'

# Both folders are read with the generic "imagefolder" loader
art_ds, faces_ds = (
    load_dataset("imagefolder", data_dir=dataset_dir)
    for dataset_dir in (art_dataset_path, faces_dataset_path)
)

# Show the splits and row counts of the faces dataset
print(faces_ds)

  3%|▎         | 100/3200 [18:34<9:36:04, 11.15s/it]

[A

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 12800
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
})


### Setting Up

In [68]:
# Transform images to model input
def transform(image_batch):
    """Turn a batch of dataset rows into inputs for the model.

    Registered through `with_transform`, so it runs on the fly whenever rows
    are accessed.

    Args:
        image_batch: dict of columns holding an 'image' list and a 'label'
            list of equal length.

    Returns:
        The processor output (containing 'pixel_values') with an additional
        'labels' tensor. Everything is left on the CPU.
    """
    # Force three channels so grayscale / RGBA files cannot break the model
    # input. Assumes the 'image' column holds PIL images, as produced by the
    # "imagefolder" loader -- TODO confirm if the data source changes.
    rgb_images = [image.convert("RGB") for image in image_batch['image']]
    inputs = processor(images=rgb_images, return_tensors="pt")

    # No .to(device) here: the data pipeline should hand out CPU tensors.
    # CUDA tensors created inside a dataset transform cannot be pinned and
    # are unsafe with DataLoader worker processes; the Trainer transfers each
    # batch to the training device itself.
    inputs['labels'] = torch.tensor(image_batch['label'])
    return inputs

art_ds_transformed = art_ds.with_transform(transform)
faces_ds_transformed = faces_ds.with_transform(transform)

In [69]:
# Turn dicts into tensors
def collate_fn(batch):
    """Merge a list of per-example dicts into a single batch dict.

    Args:
        batch: list of dicts, each with a 'pixel_values' tensor and a
            'labels' entry (a Python int or a 0-d tensor).

    Returns:
        dict with 'pixel_values' stacked along a new leading dimension and
        'labels' as a 1-D int64 tensor with one entry per example.

    No device transfer is performed here: the collator no longer depends on
    the global `device`, and moving the batch to the training device is left
    to the Trainer.
    """
    return {
        'pixel_values': torch.stack([example['pixel_values'] for example in batch]),
        # int() accepts both plain ints and 0-d tensors, so the label tensor
        # is built from scalars rather than from a list of tensors
        'labels': torch.tensor([int(example['labels']) for example in batch], dtype=torch.long),
    }

In [70]:
# Metrics reported at evaluation time
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: prediction object exposing `predictions` (per-class scores, one
            row per example) and `label_ids` (the reference labels).

    Returns:
        dict with the keys "Accuracy" and "F1".
    """
    # The predicted class is the index of the highest score in each row
    predicted_classes = np.argmax(p.predictions, axis=1)
    accuracy_result = acc_metric.compute(predictions=predicted_classes, references=p.label_ids)
    f1_result = f1_metric.compute(predictions=predicted_classes, references=p.label_ids)
    return {
        "Accuracy": accuracy_result["accuracy"],
        "F1": f1_result["f1"],
    }

### Parameters

In [71]:
# Select the transformed dataset this run trains and evaluates on.
# Exactly one of the two assignments below should be active; swap the
# comment marker to switch between the faces and the art dataset.
dataset_to_use = faces_ds_transformed
# dataset_to_use = art_ds_transformed

### Load Model

In [72]:
# Extract the class names from the label feature of the selected dataset
labels = dataset_to_use['train'].features['label'].names
print(labels[0:2])

# Load the pre-trained model with a classification head sized to the label
# set. Both mappings use integer class ids: id2label maps id -> name and
# label2id maps name -> id (previously the ids were passed as strings, which
# left string ids in label2id).
model = SwinForImageClassification.from_pretrained(
    "Organika/sdxl-detector",
    num_labels=len(labels),
    id2label={class_id: name for class_id, name in enumerate(labels)},
    label2id={name: class_id for class_id, name in enumerate(labels)},
).to(device)

['0', '1']


RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [64]:
# Candidate learning rates; a run picks one of them by index below
lr_values = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3]

# TrainingArguments: between runs only the learning rate is varied, every
# other value listed here is kept fixed
training_args = TrainingArguments(
    # --- optimisation ---
    learning_rate=lr_values[2],
    num_train_epochs=4,
    per_device_train_batch_size=16,
    fp16=False,
    # --- evaluation and logging (step-based) ---
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    report_to="tensorboard",
    # --- checkpoints ---
    output_dir="./sdxl-fine-tune",
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
    # --- data handling ---
    # keep every dataset column so the on-the-fly transform still sees 'image'
    remove_unused_columns=False,
    # disabled because it was reported not to work with CUDA in this setup
    # NOTE(review): pinning applies to CPU tensors only, so this is likely
    # needed when batches reach the DataLoader already on the GPU -- confirm
    dataloader_pin_memory=False,
)




In [62]:
# Wire the model, the run configuration, the data and the metric function
# into a single Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    # data: the selected dataset's train split for fitting, its validation
    # split for the periodic evaluations
    train_dataset=dataset_to_use["train"],
    eval_dataset=dataset_to_use["validation"],
    data_collator=collate_fn,
    # the image processor is handed over in the tokenizer slot
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


### Training and Evaluation

In [None]:
# Run the fine-tuning, then persist the model, the training metrics and the
# trainer state
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

# Final evaluation on the validation split of the selected dataset; the
# resulting metrics are logged and saved under the "eval" prefix
metrics = trainer.evaluate(dataset_to_use['validation'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)



[A                                              

                                                  
  3%|▎         | 100/3200 [03:13<46:28,  1.11it/s]

{'loss': 1.9925, 'grad_norm': 5.362734317779541, 'learning_rate': 9.981250000000001e-05, 'epoch': 0.01}



[A                                              

                                                  
  3%|▎         | 100/3200 [03:22<46:28,  1.11it/s]

{'loss': 0.6193, 'grad_norm': 12.08908748626709, 'learning_rate': 9.95e-05, 'epoch': 0.03}



[A                                              

                                                  
  3%|▎         | 100/3200 [03:31<46:28,  1.11it/s]

{'loss': 0.7074, 'grad_norm': 11.44068717956543, 'learning_rate': 9.91875e-05, 'epoch': 0.04}



[A                                              

                                                  
  3%|▎         | 100/3200 [03:41<46:28,  1.11it/s]

{'loss': 0.5987, 'grad_norm': 7.1387619972229, 'learning_rate': 9.8875e-05, 'epoch': 0.05}



[A                                              

                                                  
  3%|▎         | 100/3200 [03:50<46:28,  1.11it/s]

{'loss': 0.5599, 'grad_norm': 6.64707612991333, 'learning_rate': 9.85625e-05, 'epoch': 0.06}



[A                                              

                                                  
  3%|▎         | 100/3200 [04:00<46:28,  1.11it/s]

{'loss': 0.6487, 'grad_norm': 20.00424575805664, 'learning_rate': 9.825e-05, 'epoch': 0.07}



[A                                              

                                                  
  3%|▎         | 100/3200 [04:09<46:28,  1.11it/s]

{'loss': 0.5206, 'grad_norm': 4.478532314300537, 'learning_rate': 9.79375e-05, 'epoch': 0.09}



[A                                              

                                                  
  3%|▎         | 100/3200 [04:18<46:28,  1.11it/s]

{'loss': 0.5054, 'grad_norm': 7.102513790130615, 'learning_rate': 9.7625e-05, 'epoch': 0.1}



[A                                              

                                                  
  3%|▎         | 100/3200 [04:28<46:28,  1.11it/s]

{'loss': 0.3012, 'grad_norm': 10.7063570022583, 'learning_rate': 9.73125e-05, 'epoch': 0.11}



[A                                               

                                                  
  3%|▎         | 100/3200 [04:37<46:28,  1.11it/s]

{'loss': 0.4001, 'grad_norm': 19.147008895874023, 'learning_rate': 9.7e-05, 'epoch': 0.12}




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

{'eval_loss': 0.5720576643943787, 'eval_Accuracy': 0.8075, 'eval_F1': 0.8345864661654135, 'eval_runtime': 40.5904, 'eval_samples_per_second': 39.418, 'eval_steps_per_second': 4.927, 'epoch': 0.12}



[A                                                 

                                                  
  3%|▎         | 100/3200 [05:35<46:28,  1.11it/s]

{'loss': 0.4917, 'grad_norm': 7.4598307609558105, 'learning_rate': 9.668750000000001e-05, 'epoch': 0.14}



[A                                               

                                                  
  3%|▎         | 100/3200 [05:46<46:28,  1.11it/s]

{'loss': 0.44, 'grad_norm': 12.901555061340332, 'learning_rate': 9.6375e-05, 'epoch': 0.15}



[A                                               

                                                  
  3%|▎         | 100/3200 [05:58<46:28,  1.11it/s]

{'loss': 0.4052, 'grad_norm': 7.933369159698486, 'learning_rate': 9.60625e-05, 'epoch': 0.16}



[A                                               

                                                  
  3%|▎         | 100/3200 [06:09<46:28,  1.11it/s]

{'loss': 0.3171, 'grad_norm': 16.652332305908203, 'learning_rate': 9.575000000000001e-05, 'epoch': 0.17}



[A                                               

                                                  
  3%|▎         | 100/3200 [06:21<46:28,  1.11it/s]

{'loss': 0.2157, 'grad_norm': 6.40401554107666, 'learning_rate': 9.54375e-05, 'epoch': 0.19}



[A                                               

                                                  
  3%|▎         | 100/3200 [06:33<46:28,  1.11it/s]

{'loss': 0.3197, 'grad_norm': 28.469350814819336, 'learning_rate': 9.512500000000001e-05, 'epoch': 0.2}



[A                                               

                                                  
  3%|▎         | 100/3200 [06:44<46:28,  1.11it/s]

{'loss': 0.1501, 'grad_norm': 7.256320953369141, 'learning_rate': 9.481250000000001e-05, 'epoch': 0.21}



[A                                               

                                                  
  3%|▎         | 100/3200 [06:56<46:28,  1.11it/s]

{'loss': 0.3141, 'grad_norm': 27.230979919433594, 'learning_rate': 9.449999999999999e-05, 'epoch': 0.23}



[A                                               

                                                  
  3%|▎         | 100/3200 [07:08<46:28,  1.11it/s]

{'loss': 0.5008, 'grad_norm': 69.35633850097656, 'learning_rate': 9.41875e-05, 'epoch': 0.24}



[A                                               

                                                  
  3%|▎         | 100/3200 [07:19<46:28,  1.11it/s]

{'loss': 0.3647, 'grad_norm': 10.145727157592773, 'learning_rate': 9.3875e-05, 'epoch': 0.25}




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

{'eval_loss': 0.1823907047510147, 'eval_Accuracy': 0.934375, 'eval_F1': 0.9311475409836065, 'eval_runtime': 40.7585, 'eval_samples_per_second': 39.256, 'eval_steps_per_second': 4.907, 'epoch': 0.25}



[A                                                 

                                                  
  3%|▎         | 100/3200 [08:16<46:28,  1.11it/s]

{'loss': 0.2883, 'grad_norm': 10.142170906066895, 'learning_rate': 9.35625e-05, 'epoch': 0.26}



[A                                               

                                                  
  3%|▎         | 100/3200 [08:27<46:28,  1.11it/s]

{'loss': 0.3068, 'grad_norm': 37.33943557739258, 'learning_rate': 9.325e-05, 'epoch': 0.28}



[A                                               

                                                  
  3%|▎         | 100/3200 [08:39<46:28,  1.11it/s]

{'loss': 0.3348, 'grad_norm': 13.122305870056152, 'learning_rate': 9.29375e-05, 'epoch': 0.29}



[A                                               

                                                  
  3%|▎         | 100/3200 [08:51<46:28,  1.11it/s]

{'loss': 0.474, 'grad_norm': 13.046664237976074, 'learning_rate': 9.2625e-05, 'epoch': 0.3}



[A                                               

                                                  
  3%|▎         | 100/3200 [09:02<46:28,  1.11it/s]

{'loss': 0.4041, 'grad_norm': 2.509150266647339, 'learning_rate': 9.23125e-05, 'epoch': 0.31}




RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
