<a href="https://colab.research.google.com/github/nisharani-dev/real_vs_ai_image_classifier/blob/main/real_vs_ai_work_in_prog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Kaggle API
!pip install kaggle --quiet

from google.colab import files
files.upload()  # Upload kaggle.json

# Setup kaggle.json
!mkdir -p /root/.kaggle
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

# Create directory for extra datasets
!mkdir -p extra_data

# Download datasets (note: -p is lowercase)
!kaggle datasets download -d shahzaibshazoo/detect-ai-generated-faces-high-quality-dataset -p extra_data
!kaggle datasets download -d hamzaboulahia/hardfakevsrealfaces -p extra_data
!kaggle datasets download -d dullaz/1m-ai-generated-faces-128x128 -p extra_data
!kaggle datasets download -d jessicali9530/celeba-dataset -p extra_data  # Optional real set
!kaggle datasets download -d xhlulu/140k-real-and-fake-faces


# Unzip everything
!unzip -q "extra_data/*.zip" -d extra_data


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/shahzaibshazoo/detect-ai-generated-faces-high-quality-dataset
License(s): apache-2.0
Downloading detect-ai-generated-faces-high-quality-dataset.zip to extra_data
 86% 99.0M/116M [00:00<00:00, 1.03GB/s]
100% 116M/116M [00:00<00:00, 1.06GB/s] 
Dataset URL: https://www.kaggle.com/datasets/hamzaboulahia/hardfakevsrealfaces
License(s): CC0-1.0
Downloading hardfakevsrealfaces.zip to extra_data
  0% 0.00/15.3M [00:00<?, ?B/s]
100% 15.3M/15.3M [00:00<00:00, 1.07GB/s]
Dataset URL: https://www.kaggle.com/datasets/dullaz/1m-ai-generated-faces-128x128
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading 1m-ai-generated-faces-128x128.zip to extra_data
100% 4.06G/4.08G [00:35<00:00, 130MB/s]
100% 4.08G/4.08G [00:35<00:00, 123MB/s]
Dataset URL: https://www.kaggle.com/datasets/jessicali9530/celeba-dataset
License(s): other
Downloading celeba-dataset.zip to extra_data
 97% 1.29G/1.33G [00:08<00:

In [None]:
import os
print(os.listdir("extra_data"))


['hardfakevsrealfaces.zip', 'real_vs_fake', 'AI-face-detection-Dataset', 'detect-ai-generated-faces-high-quality-dataset.zip', 'test.csv', 'hardfakevsrealfaces', '1m-ai-generated-faces-128x128', 'real', 'valid.csv', 'list_attr_celeba.csv', 'fake', 'celeba-dataset.zip', 'list_landmarks_align_celeba.csv', 'dataset.csv', 'list_eval_partition.csv', 'celeba-dataset', 'detect-ai-generated-faces-high-quality-dataset', 'img_align_celeba', '140k-real-and-fake-faces.zip', 'train.csv', 'fake_faces_dataset', 'data.csv', 'list_bbox_celeba.csv', '1m-ai-generated-faces-128x128.zip']


In [None]:
import os

# Paths to the extracted folders
extracted_folders = [
    "extra_data/hardfakevsrealfaces",
    "extra_data/detect-ai-generated-faces-high-quality-dataset",
    "extra_data/1m-ai-generated-faces-128x128",
    "extra_data/celeba-dataset",
    "extra_data/fake_faces_dataset"   # probably corresponds to 140k dataset
]

def list_classes(dataset_path):
    return [folder for folder in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, folder))]

for folder in extracted_folders:
    classes = list_classes(folder)
    print(f"Dataset '{folder}' classes: {classes}")


Dataset 'extra_data/hardfakevsrealfaces' classes: ['real', 'fake']
Dataset 'extra_data/detect-ai-generated-faces-high-quality-dataset' classes: ['AI-face-detection-Dataset']
Dataset 'extra_data/1m-ai-generated-faces-128x128' classes: ['fake_faces_dataset']
Dataset 'extra_data/celeba-dataset' classes: ['img_align_celeba']
Dataset 'extra_data/fake_faces_dataset' classes: []


In [None]:
def load_images_fixed(dataset_path, real_class=None, fake_class=None, num_images=2500):
    images, labels = [], []

    def load_from_path(path, label_val):
        if not os.path.exists(path):
            return
        # Only take files, ignore directories
        all_files = [f for f in glob(os.path.join(path, "*")) if os.path.isfile(f)]
        selected_files = random.sample(all_files, min(len(all_files), num_images))
        for img_path in selected_files:
            try:
                img = Image.open(img_path).convert('RGB')
                images.append(img)
                labels.append(label_val)
            except:
                pass  # skip corrupted files

    # Load real images
    if real_class:
        load_from_path(os.path.join(dataset_path, real_class), 0)
    elif real_class is None and fake_class is None:  # treat all images in folder as real
        load_from_path(dataset_path, 0)

    # Load fake images
    if fake_class:
        load_from_path(os.path.join(dataset_path, fake_class), 1)
    elif fake_class is None and real_class is None:  # treat all images as fake
        load_from_path(dataset_path, 1)

    return images, labels


In [None]:
# Corrected dataset info
datasets_info = [
    ("extra_data/hardfakevsrealfaces", "real", "fake"),
    ("extra_data/real_vs_fake/real-vs-fake", "train/real", "train/fake"),
    ("extra_data/celeba-dataset/img_align_celeba", None, None),
    ("extra_data/detect-ai-generated-faces-high-quality-dataset/AI-face-detection-Dataset", None, None),
    ("extra_data/1m-ai-generated-faces-128x128/fake_faces_dataset", None, None)
]

# Load all datasets
X_all, y_all = [], []

for dataset_path, real_class, fake_class in datasets_info:
    X, y = load_images_fixed(dataset_path, real_class, fake_class, num_images=2500)
    print(f"Loaded {len(X)} images from {dataset_path}")
    X_all.extend(X)
    y_all.extend(y)

print(f"\nTotal images loaded: {len(X_all)}")


Loaded 1289 images from extra_data/hardfakevsrealfaces
Loaded 5000 images from extra_data/real_vs_fake/real-vs-fake
Loaded 0 images from extra_data/celeba-dataset/img_align_celeba
Loaded 0 images from extra_data/detect-ai-generated-faces-high-quality-dataset/AI-face-detection-Dataset
Loaded 5000 images from extra_data/1m-ai-generated-faces-128x128/fake_faces_dataset

Total images loaded: 11289


In [None]:
import os
import random
from PIL import Image

def load_images_recursive(dataset_path, real_class=None, fake_class=None, num_images=2500):
    """
    Load images from a dataset, recursively searching all subfolders.
    """
    images, labels = [], []

    def find_all_images(path):
        # Recursively collect all image files
        image_files = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                    image_files.append(os.path.join(root, file))
        return image_files

    def load_from_path(path, label_val):
        all_files = find_all_images(path)
        selected_files = random.sample(all_files, min(len(all_files), num_images))
        for img_path in selected_files:
            try:
                img = Image.open(img_path).convert('RGB')
                images.append(img)
                labels.append(label_val)
            except:
                pass  # skip corrupted files

    # Load real images
    if real_class:
        load_from_path(os.path.join(dataset_path, real_class), 0)
    elif real_class is None and fake_class is None:
        # Treat all images in folder as real if no classes provided
        load_from_path(dataset_path, 0)

    # Load fake images
    if fake_class:
        load_from_path(os.path.join(dataset_path, fake_class), 1)
    elif fake_class is None and real_class is None:
        # Treat all images as fake if no classes provided
        load_from_path(dataset_path, 1)

    return images, labels

# Corrected dataset info
datasets_info = [
    ("extra_data/hardfakevsrealfaces", "real", "fake"),
    ("extra_data/real_vs_fake/real-vs-fake", "train/real", "train/fake"),
    ("extra_data/celeba-dataset/img_align_celeba", None, None),
    ("extra_data/detect-ai-generated-faces-high-quality-dataset/AI-face-detection-Dataset", None, None),
    ("extra_data/1m-ai-generated-faces-128x128/fake_faces_dataset", None, None)
]

# Load all datasets
X_all, y_all = [], []

for dataset_path, real_class, fake_class in datasets_info:
    X, y = load_images_recursive(dataset_path, real_class, fake_class, num_images=2500)
    print(f"Loaded {len(X)} images from {dataset_path}")
    X_all.extend(X)
    y_all.extend(y)

print(f"\nTotal images loaded: {len(X_all)}")


Loaded 1289 images from extra_data/hardfakevsrealfaces
Loaded 5000 images from extra_data/real_vs_fake/real-vs-fake
Loaded 5000 images from extra_data/celeba-dataset/img_align_celeba
Loaded 5000 images from extra_data/detect-ai-generated-faces-high-quality-dataset/AI-face-detection-Dataset
Loaded 5000 images from extra_data/1m-ai-generated-faces-128x128/fake_faces_dataset

Total images loaded: 21289


In [None]:
import os

def count_labels(dataset_path, real_class=None, fake_class=None):
    """
    Count number of real and fake images in a dataset folder.
    """
    def count_images(path):
        count = 0
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                    count += 1
        return count

    real_count = count_images(os.path.join(dataset_path, real_class)) if real_class else 0
    fake_count = count_images(os.path.join(dataset_path, fake_class)) if fake_class else 0

    # If both classes are None, assume all images belong to one type
    if real_class is None and fake_class is None:
        total_files = count_images(dataset_path)
        # You can decide to treat them all as real or fake
        # Here we check which dataset it is to label correctly
        dataset_name = os.path.basename(dataset_path).lower()
        if "celeba" in dataset_name:
            real_count = total_files
        else:
            fake_count = total_files

    return real_count, fake_count

# Dataset info
datasets_info = [
    ("extra_data/hardfakevsrealfaces", "real", "fake"),
    ("extra_data/real_vs_fake/real-vs-fake", "train/real", "train/fake"),
    ("extra_data/celeba-dataset/img_align_celeba", None, None),
    ("extra_data/detect-ai-generated-faces-high-quality-dataset/AI-face-detection-Dataset", None, None),
    ("extra_data/1m-ai-generated-faces-128x128/fake_faces_dataset", None, None)
]

# Count images
for dataset_path, real_class, fake_class in datasets_info:
    real_count, fake_count = count_labels(dataset_path, real_class, fake_class)
    print(f"Dataset: {dataset_path}")
    print(f"  Real images: {real_count}")
    print(f"  Fake images: {fake_count}\n")


Dataset: extra_data/hardfakevsrealfaces
  Real images: 589
  Fake images: 700

Dataset: extra_data/real_vs_fake/real-vs-fake
  Real images: 50000
  Fake images: 50000

Dataset: extra_data/celeba-dataset/img_align_celeba
  Real images: 202599
  Fake images: 0

Dataset: extra_data/detect-ai-generated-faces-high-quality-dataset/AI-face-detection-Dataset
  Real images: 0
  Fake images: 3203

Dataset: extra_data/1m-ai-generated-faces-128x128/fake_faces_dataset
  Real images: 0
  Fake images: 1120885



In [None]:
import numpy as np
from PIL import Image

rows = []
print("Extracting features for all images...")
for idx, (img, label) in tqdm(enumerate(zip(X_all, y_all)), total=len(X_all)):
    try:
        # Convert PIL images to RGB NumPy array if needed
        if isinstance(img, Image.Image):
            img = np.array(img.convert("RGB"), dtype=np.float32) / 255.0
        # Ensure shape is HxWx3
        if img.ndim != 3 or img.shape[2] != 3:
            print(f"Skipping image index {idx}: invalid shape {img.shape}")
            continue

        feats = extract_features_img(img)
    except Exception as e:
        print(f"Skipping image index {idx}:", e)
        continue

    entry = {f"f{i}": float(feats[i]) for i in range(len(feats))}
    entry['label'] = label  # 0=fake, 1=real
    entry['index'] = idx
    rows.append(entry)

feats_df = pd.DataFrame(rows)
print("Feature matrix shape:", feats_df.shape)


Extracting features for all images...


100%|██████████| 21289/21289 [02:43<00:00, 129.99it/s]


Feature matrix shape: (21289, 27)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

# 1) Prepare X and y
feat_cols = [c for c in feats_df.columns if c.startswith('f')]
X = feats_df[feat_cols].values
y = feats_df['label'].values  # 0=fake, 1=real

print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)

# 2) Split into train / validation / test (stratified)
RANDOM_SEED = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_SEED
)

print("Train shape:", X_train.shape, "Val shape:", X_val.shape, "Test shape:", X_test.shape)

# 3) Scale features
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

# 4) Train RandomForest
clf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=RANDOM_SEED,
    n_jobs=-1
)
print("Training RandomForest...")
clf.fit(X_train_s, y_train)

# 5) Evaluate
y_val_pred = clf.predict(X_val_s)
y_test_pred = clf.predict(X_test_s)

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report (Test):\n", classification_report(y_test, y_test_pred))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, y_test_pred))

# 6) Save model and scaler
MODEL_FILE = "rf_forensic_model_combined.joblib"
SCALER_FILE = "rf_forensic_scaler_combined.joblib"
joblib.dump(clf, MODEL_FILE)
joblib.dump(scaler, SCALER_FILE)
print(f"Saved model -> {MODEL_FILE} and scaler -> {SCALER_FILE}")


Feature matrix shape: (21289, 25)
Labels shape: (21289,)
Train shape: (14902, 25) Val shape: (3193, 25) Test shape: (3194, 25)
Training RandomForest...
Validation Accuracy: 0.4901346695897275
Test Accuracy: 0.4715090795241077

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.47      0.44      0.46      1589
           1       0.48      0.50      0.49      1605

    accuracy                           0.47      3194
   macro avg       0.47      0.47      0.47      3194
weighted avg       0.47      0.47      0.47      3194

Confusion Matrix (Test):
 [[706 883]
 [805 800]]
Saved model -> rf_forensic_model_combined.joblib and scaler -> rf_forensic_scaler_combined.joblib


In [None]:
from google.colab import files
import cv2
import numpy as np

# Upload an image
uploaded = files.upload()
for filename in uploaded.keys():
    print("Uploaded file:", filename)

    # Prediction
    label, prob = detect_ai_image(filename)
    print(f"Prediction: {label}, Probability of real: {prob:.4f}")


Saving ChatGPT Image Aug 12, 2025, 08_01_02 PM.png to ChatGPT Image Aug 12, 2025, 08_01_02 PM (6).png
Uploaded file: ChatGPT Image Aug 12, 2025, 08_01_02 PM (6).png
Prediction: real, Probability of real: 0.7287


In [None]:
from google.colab import files

# Upload image
uploaded = files.upload()
for filename in uploaded.keys():
    label, prob = detect_ai_image(filename)
    print(f"Prediction: {label}, Probability of real: {prob:.4f}")

Saving test3.jpeg to test3.jpeg
Prediction: fake, Probability of real: 0.4716


In [None]:
from google.colab import files

# Upload image
uploaded = files.upload()
for filename in uploaded.keys():
    label, prob = detect_ai_image(filename)
    print(f"Prediction: {label}, Probability of real: {prob:.4f}")

Saving Test2.jpg to Test2.jpg
Prediction: real, Probability of real: 0.5010
