<a href="https://colab.research.google.com/github/osherlock1/AI-Image-Detection-Model/blob/main/Initial_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Imports**

In [28]:
from google.colab import drive
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import os

from torch.utils.data import Dataset
from PIL import Image
import os

from torchvision import transforms


from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader

from torch.utils.data import ConcatDataset


## **Load in AI Dataset**

In [5]:

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
image_dir = "/content/drive/MyDrive/URI_spr25/ELE392/AI_Detection_Model/Datasets/AI_Images"

In [19]:

class AIDogDataset(Dataset):
    def __init__(self, image_dir, label=1, transform=None):
        self.image_dir = image_dir
        self.filenames = sorted([
            f for f in os.listdir(image_dir)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        ])
        self.label = label  # Fixed label for all images
        self.transform = transform

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        fname = self.filenames[idx]
        img_path = os.path.join(self.image_dir, fname)
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, self.label

In [21]:
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor()
])


dataset = AIDogDataset(image_dir=image_dir, label=1, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

images, labels = next(iter(dataloader))
print(f"Batch shape: {images.shape}. Labels: {labels[:5]}")

Batch shape: torch.Size([32, 3, 224, 224]). Labels: tensor([1, 1, 1, 1, 1])


## **Real Dog Dataset**

In [25]:
real_dogs_path = "/content/drive/MyDrive/URI_spr25/ELE392/AI_Detection_Model/Datasets/Images"  # Update if different

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

real_dataset = ImageFolder(root=real_dogs_path, transform=transform)

In [26]:
print(f"Total real images: {len(real_dataset)}")


Total real images: 20580


In [32]:
# Path to Stanford Dogs (update if needed)


class FixedLabelDataset(Dataset):
    def __init__(self, base_dataset, fixed_label):
        self.base = base_dataset
        self.label = fixed_label

    def __len__(self):
        return len(self.base)

    def __getitem__(self, idx):
        x, _ = self.base[idx]
        return x, self.label

In [40]:
# Subset 1000 real images from Stanford Dogs
real_subset = torch.utils.data.Subset(real_dataset, range(1200))
real_fixed = FixedLabelDataset(real_subset, fixed_label=0)

# AI dataset is already using label=1, so we keep that as is
ai_fixed = dataset  # your AI dataset with label=1

# Combine
from torch.utils.data import ConcatDataset, DataLoader

combined = ConcatDataset([real_fixed, ai_fixed])
dataloader = DataLoader(combined, batch_size=32, shuffle=True)

# Test again
images, labels = next(iter(dataloader))
print("Batch shape:", images.shape)
print("Label sample:", labels[:10])

Batch shape: torch.Size([32, 3, 224, 224])
Label sample: tensor([0, 1, 1, 0, 0, 0, 1, 1, 0, 1])


## **Combine Datasets**

In [41]:
balanced_dataset = ConcatDataset([real_dataset_1000, dataset])
dataloader = DataLoader(balanced_dataset, batch_size=32, shuffle=True)

# Preview labels
images, labels = next(iter(dataloader))
print("Batch shape:", images.shape)
print("Label sample:", labels[:10])

Batch shape: torch.Size([32, 3, 224, 224])
Label sample: tensor([0, 1, 1, 0, 1, 2, 1, 5, 1, 1])


In [42]:
len(balanced_dataset)

2260

## **Save the Dataset**

In [44]:
from torchvision.utils import save_image

# 📁 Set destination folder in Drive
save_root = "/content/drive/MyDrive/URI_spr25/ELE392/AI_Detection_Model/Datasets/Processed_Dataset"
real_dir = os.path.join(save_root, "real")
ai_dir = os.path.join(save_root, "ai")
os.makedirs(real_dir, exist_ok=True)
os.makedirs(ai_dir, exist_ok=True)

# 💾 Save images into appropriate folders
for i, (img, label) in enumerate(combined):
    label = int(label)
    folder = real_dir if label == 0 else ai_dir
    filename = f"{'real' if label == 0 else 'ai'}_{i:04}.png"
    save_path = os.path.join(folder, filename)
    save_image(img, save_path)

print(f"✅ Saved {len(combined)} images to {save_root}")

✅ Saved 2460 images to /content/drive/MyDrive/URI_spr25/ELE392/AI_Detection_Model/Datasets/Processed_Dataset
