# Benchmark Vision Models on CIFake Dataset

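This notebook benchmarks the convolutional and transformer-based networks defined in `pipelines_torch/vision_models.py` on the [CIFake dataset](https://www.kaggle.com/datasets/birdy654/cifake-real-and-ai-generated-synthetic-images), a two-class (real vs. AI-generated) image classification task. Models are first trained with `BenchmarkRunner` and then evaluated on the held-out test split.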


In [1]:
import os
from pathlib import Path
import numpy as np
import torch
from torchvision import datasets, transforms
from pipelines_torch.benchmark import BenchmarkRunner
from pipelines_torch.base import SimplePredictor
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from pipelines_torch.vision_models import MODEL_REGISTRY
from utils.metrics import METRIC_REGISTRY

# Root folder (relative to the notebook's working directory) that holds the
# downloaded CIFake dataset; the later cells read its `train` and `test`
# sub-folders.
DATA_DIR = Path("cifake_data")

In [2]:
# Requires Kaggle API credentials available as environment variables
# KAGGLE_USERNAME and KAGGLE_KEY. See https://www.kaggle.com/docs/api.
if not DATA_DIR.exists():
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    !kaggle datasets download -d birdy654/cifake-real-and-ai-generated-synthetic-images -p $DATA_DIR --unzip

Dataset URL: https://www.kaggle.com/datasets/birdy654/cifake-real-and-ai-generated-synthetic-images
License(s): other
Downloading cifake-real-and-ai-generated-synthetic-images.zip to cifake_data
  0%|                                                | 0.00/105M [00:00<?, ?B/s]Downloading cifake-real-and-ai-generated-synthetic-images.zip to cifake_data
  0%|                                                | 0.00/105M [00:00<?, ?B/s]
100%|████████████████████████████████████████| 105M/105M [00:00<00:00, 2.21GB/s]

100%|████████████████████████████████████████| 105M/105M [00:00<00:00, 2.21GB/s]


In [3]:
# Preprocessing applied to every image: resize to 32x32, then convert to a tensor.
transform = transforms.Compose([transforms.Resize((32, 32)), transforms.ToTensor()])

# Folder-per-class datasets for the two on-disk splits.
train_ds = datasets.ImageFolder(DATA_DIR / 'train', transform=transform)
val_ds = datasets.ImageFolder(DATA_DIR / 'test', transform=transform)

# Class metadata is taken from the training split.
class_names = train_ds.classes
num_classes = len(class_names)

# Mini-batch loaders; only the training loader shuffles.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=64)
val_loader = DataLoader(val_ds, batch_size=64)

# Materialize the whole training split as in-memory arrays (images in X,
# integer class ids in y); these are what the benchmark cell passes to
# `runner.run(X, y)`.
y = np.array(train_ds.targets)
X = torch.stack([image for image, _label in train_ds]).numpy()

In [2]:
# Build one config dict per registered vision model for BenchmarkRunner.
# 'qwen2_vl_qlora' is left out for now: it is a complex model that might
# require special handling.
model_configs = [
    {"name": name, "class": model_class, "params": {"num_classes": 2}}
    for name, model_class in MODEL_REGISTRY.items()
    if name not in ['qwen2_vl_qlora']
]

# Metric callables looked up by name in the shared metric registry.
metrics = [METRIC_REGISTRY[metric_name] for metric_name in ("accuracy", "f1")]

In [None]:

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
benchmark_device = "cuda" if torch.cuda.is_available() else "cpu"

# Configure BenchmarkRunner.
#   augmentations=[None] -> no augmentations for now
#   epochs=10            -> reduced for faster testing
#   use_kfold=False      -> single train/val split for speed
runner = BenchmarkRunner(
    model_configs=model_configs,
    metrics=metrics,
    augmentations=[None],
    task_type="classification",
    device=benchmark_device,
    epochs=10,
    batch_size=64,
    learning_rate=1e-3,
    use_kfold=False,
    random_state=42,
)

# Summarize the run before starting it.
print(
    f"Running benchmark on {len(X)} samples with {num_classes} classes",
    f"Data shape: {X.shape}",
    f"Using device: {runner.device}",
    sep="\n",
)

# Run benchmark and show the resulting table.
results_df = runner.run(X, y)
print("\nBenchmark Results:", results_df, sep="\n")

Running benchmark on 100000 samples with 2 classes
Data shape: (100000, 3, 32, 32)
Using device: cpu


Models:   0%|          | 0/6 [00:00<?, ?it/s]


Running Model: simple_cnn | Augmentation: none
Class distribution: {np.int64(0): np.int64(50000), np.int64(1): np.int64(50000)}
Computed class weights: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(1.0)}
Computed class weights: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(1.0)}


Models:   0%|          | 0/6 [00:09<?, ?it/s]       [A



KeyboardInterrupt: 

In [3]:
from utils.utils import save_model, save_metrics, load_model
import pandas as pd
# Where result files are written, and where the held-out test split lives.
RESULTS_DIR = "results"
TEST_DATA_DIR = Path('cifake_data/test')

# Preprocessing for the test images: resize to 32x32, then convert to a tensor.
transform = transforms.Compose([transforms.Resize((32, 32)), transforms.ToTensor()])

# Materialize the test split as in-memory arrays for prediction and scoring.
test_ds = datasets.ImageFolder(TEST_DATA_DIR, transform=transform)
y_test = np.array(test_ds.targets)  # True labels for evaluation
X_test = torch.stack([image for image, _label in test_ds]).numpy()
print(f"Test Data shape: {X_test.shape}")
# Evaluate every configured model on the test set and persist the scores.
#
# NOTE(review): the saved output of this cell ends in a RuntimeError from
# load_state_dict: the checkpoint it was given is a full training-state dict
# (keys such as "model_state_dict", "optimizer_state_dict", ...) rather than
# bare model weights. That mismatch has to be resolved where checkpoints are
# written or read (e.g. by loading checkpoint["model_state_dict"]); this cell
# only passes a model name and `path_start`, so the checkpoint path is not
# known here and the problem cannot be repaired from the notebook.
test_results = []
for model_cfg in model_configs:
    model_name = model_cfg['name']
    if model_name == 'qwen2_vl_qlora':
        continue  # Skip complex models for now

    # Load trained model and wrap it for prediction.
    model = load_model(model_cfg['class'], model_name, model_cfg['params'], path_start='vision_weights')
    pipeline = SimplePredictor(
        model=model,
        task_type='classification'
    )
    y_pred = pipeline.predict(X_test)

    # Collect metrics: one column per metric, keyed by the metric's `name`
    # attribute, else its `__name__`, else its string form.
    result = {}
    for metric in metrics:
        metric_key = getattr(metric, 'name', None) or getattr(metric, '__name__', None) or str(metric)
        result[metric_key] = metric(y_test, y_pred)
    result['model'] = model_name  # kept last so the CSV column order is unchanged
    test_results.append(result)

# Save test results. The output directory is created first: `to_csv` does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs(RESULTS_DIR, exist_ok=True)
results_path = os.path.join(RESULTS_DIR, "vision_test_results.csv")
test_results_df = pd.DataFrame(test_results)
test_results_df.to_csv(results_path, index=False)
print("Test results saved to", results_path)

import matplotlib.pyplot as plt

# Grouped bar chart: one group per model, one bar per metric column.
fig, ax = plt.subplots(figsize=(12, 6))
test_results_df.set_index('model').plot(kind='bar', ax=ax)
ax.set_ylabel("Score")
ax.set_xlabel("Model")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(title="Metric")
fig.tight_layout()
plt.show()

# Leave the table as the cell's last expression so it is rendered below the plot.
test_results_df

Test Data shape: (20000, 3, 32, 32)


RuntimeError: Error(s) in loading state_dict for SimpleCNN:
	Missing key(s) in state_dict: "features.0.weight", "features.0.bias", "features.3.weight", "features.3.bias", "classifier.1.weight", "classifier.1.bias", "classifier.3.weight", "classifier.3.bias". 
	Unexpected key(s) in state_dict: "model_state_dict", "optimizer_state_dict", "scheduler_state_dict", "class_weights", "task_type", "history", "epochs", "batch_size", "device". 