### Imports

In [1]:
# https://huggingface.co/Organika/sdxl-detector#validation-metrics
# The code below fine-tunes the model on one dataset at a time (faces, art, or mixed) and evaluates it on that dataset's validation set
# https://huggingface.co/blog/fine-tune-vit

import os
import torch
from transformers import AutoImageProcessor, SwinForImageClassification, TrainingArguments, Trainer
import evaluate
from datasets import load_dataset
import numpy as np

# Move the working directory up one level so the relative paths used later
# in the notebook resolve from the parent folder.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell does not climb one more level on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
print(NOTEBOOK_DIR)
os.chdir(os.path.dirname(NOTEBOOK_DIR))
print(os.getcwd())

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
# (To force CPU execution, set device_name = "cpu" instead.)
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

  from .autonotebook import tqdm as notebook_tqdm


c:\Users\metet\OneDrive\Documents\GitHub\thesis\RQ2
c:\Users\metet\OneDrive\Documents\GitHub\thesis
cuda


### Load Datasets and Processor

In [2]:
# Load the image processor published with the pre-trained detector
processor = AutoImageProcessor.from_pretrained("Organika/sdxl-detector")

# Dataset folders (relative to the current working directory)
art_dataset_path = 'archive/datasets/art_512x512'
faces_dataset_path = 'archive/datasets/faces_512x512'
mixed_dataset_path = 'archive/datasets/mixed_512x512'

# Read every folder with the "imagefolder" builder, in the order art, faces, mixed
art_ds, faces_ds, mixed_ds = (
    load_dataset("imagefolder", data_dir=folder)
    for folder in (art_dataset_path, faces_dataset_path, mixed_dataset_path)
)

print(faces_ds)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 12800
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 1600
    })
})


### Setting Up

In [3]:
# Transform images to model input
def transform(image_batch):
    """Turn a batch of dataset rows into model inputs.

    Args:
        image_batch: Batch dict holding the 'image' and 'label' columns.

    Returns:
        The processor output (including 'pixel_values') with an added
        'labels' tensor built from the 'label' column.

    The tensors are deliberately left where the processor created them
    instead of being moved to the GPU here: Trainer places each batch on the
    training device itself, and CUDA tensors produced inside the data
    pipeline are incompatible with pinned memory and DataLoader workers.
    """
    inputs = processor(images=image_batch['image'], return_tensors="pt")
    inputs['labels'] = torch.tensor(image_batch['label'])  # Ensure labels are tensors
    return inputs

# Attach the on-the-fly transform to each dataset (art, faces, mixed)
art_ds_transformed, faces_ds_transformed, mixed_ds_transformed = (
    dataset_dict.with_transform(transform)
    for dataset_dict in (art_ds, faces_ds, mixed_ds)
)

# Show the container type before and after attaching the transform
for dataset_dict in (art_ds, art_ds_transformed):
    print(type(dataset_dict))

<class 'datasets.dataset_dict.DatasetDict'>
<class 'datasets.dataset_dict.DatasetDict'>


In [4]:
# Turn dicts into tensors
def collate_fn(batch):
    """Combine a list of per-example dicts into one batch dict.

    Args:
        batch: List of examples, each with a 'pixel_values' tensor and a
            'labels' value.

    Returns:
        Dict with 'pixel_values' stacked along a new leading dimension and
        'labels' gathered into a single tensor.

    No explicit device transfer happens here: Trainer moves the collated
    batch to the training device, and keeping the collator device-agnostic
    avoids creating CUDA tensors inside the DataLoader.
    """
    return {
        'pixel_values': torch.stack([example['pixel_values'] for example in batch]),
        'labels': torch.tensor([example['labels'] for example in batch]),
    }

In [5]:
# Load the accuracy and F1 metric objects used by compute_metrics
acc_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ('accuracy', 'f1')
)

def compute_metrics(p):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        p: Object exposing `predictions` (per-class scores, reduced with
            argmax over axis 1) and `label_ids` (reference labels).

    Returns:
        Dict with the keys "Accuracy" and "F1".
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    metric_inputs = {"predictions": predicted_classes, "references": p.label_ids}
    accuracy_result = acc_metric.compute(**metric_inputs)
    f1_result = f1_metric.compute(**metric_inputs)
    return {"Accuracy": accuracy_result["accuracy"], "F1": f1_result["f1"]}

### Parameters

In [6]:
# Select the run by name; the dataset and its output directory are picked
# together so the two settings cannot drift apart.
# NOTE(review): the faces -> "./sdxl-fine-tune" pairing is assumed from the
# order in which the alternatives were listed; confirm.
run_name = "mixed"  # one of: "faces", "art", "mixed"
run_options = {
    "faces": (faces_ds_transformed, "./sdxl-fine-tune"),
    "art": (art_ds_transformed, "./sdxl-fine-tune-art"),
    "mixed": (mixed_ds_transformed, "./sdxl-fine-tune-mixed"),
}
dataset_to_use, output_dir_name = run_options[run_name]

# Candidate learning rates; chosen_lr is an INDEX into this list (1 -> 5e-5)
lr_values = [1e-5, 5e-5, 1e-4, 5e-4, 1e-3]
chosen_lr = 1

### Load Model

In [7]:
# Extract the class names from the dataset's 'label' feature
labels = dataset_to_use['train'].features['label'].names
print(labels[0:2])

# Load the pre-trained model.
# id2label is keyed by integer class index and label2id maps to integer
# indices, matching the types the model config documents; string ids would
# break integer lookups such as config.id2label[predicted_index].
model = SwinForImageClassification.from_pretrained(
    "Organika/sdxl-detector",
    num_labels=len(labels),
    id2label={index: name for index, name in enumerate(labels)},
    label2id={name: index for index, name in enumerate(labels)},
).to(device)

['0', '1']


In [9]:
# TrainingArguments: the learning rate is the one setting varied between runs
# (picked from lr_values via chosen_lr); the remaining values are kept fixed.
training_args = TrainingArguments(
    # Output location and reporting
    output_dir=output_dir_name,
    report_to="tensorboard",
    push_to_hub=False,
    # Optimisation
    learning_rate=lr_values[chosen_lr],
    lr_scheduler_type="constant",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    fp16=False,  # True leads to runtime errors on CUDA
    # Evaluation, checkpointing and logging cadence (in steps)
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_steps=10,
    # Data handling
    remove_unused_columns=False,
    dataloader_pin_memory=False,  # otherwise it doesn't work with CUDA
)


In [10]:
# Build the Trainer from the model, arguments, collator, metrics and the
# train/validation splits of the selected dataset.
# The image processor is passed as `processing_class`, which replaces the
# deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=dataset_to_use["train"],
    eval_dataset=dataset_to_use["validation"],
    processing_class=processor,
)

  trainer = Trainer(


### Training and Evaluation

In [11]:
# Fine-tune, then save the model, the training metrics and the trainer state
train_results = trainer.train()
trainer.save_model()
for record in (trainer.log_metrics, trainer.save_metrics):
    record("train", train_results.metrics)
trainer.save_state()

# Evaluate on the validation split, then log and save those metrics
metrics = trainer.evaluate(dataset_to_use['validation'])
for record in (trainer.log_metrics, trainer.save_metrics):
    record("eval", metrics)


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.5239,0.423261,0.779375,0.802241
200,0.5595,0.31134,0.85875,0.852288
300,0.3773,0.283029,0.878125,0.875717
400,0.4736,0.270512,0.89375,0.889467
500,0.1018,0.418334,0.888125,0.890251
600,0.605,0.308983,0.87625,0.863824
700,0.3649,0.379664,0.888125,0.89402
800,0.3095,0.216051,0.92375,0.925153
900,0.214,0.205126,0.92375,0.926329
1000,0.2727,0.224756,0.94,0.938224


Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}
Non-default generation parameters: {'max_length': 128}


***** train metrics *****
  epoch                    =         1.0
  total_flos               = 933951802GF
  train_loss               =      0.3333
  train_runtime            =  0:13:47.75
  train_samples_per_second =      15.464
  train_steps_per_second   =       1.933


***** eval metrics *****
  epoch                   =        1.0
  eval_Accuracy           =     0.9581
  eval_F1                 =     0.9584
  eval_loss               =     0.1348
  eval_runtime            = 0:00:22.91
  eval_samples_per_second =     69.818
  eval_steps_per_second   =      8.727
