In [None]:
import os
import random
import shutil
import urllib.parse
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import torch
from google.colab import drive
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [None]:
# Mount Google Drive at /content/drive; the Drive-backed input and output
# directories configured in this notebook live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Directory layout: the QuickDraw raw/processed folders are relative to the
# working directory, while the extra images and the final dataset are on the
# mounted Google Drive.
RAW_DATA_DIR = "quickdraw_raw"
PROCESSED_DATA_DIR = "quickdraw_processed"
NEW_DATA_DIR = "/content/drive/MyDrive/processed_new_data/"
OUTPUT_DIR = "/content/drive/MyDrive/final_dataset/"

# Create every directory up front; exist_ok keeps this cell safe to re-run.
for _directory in (RAW_DATA_DIR, PROCESSED_DATA_DIR, NEW_DATA_DIR, OUTPUT_DIR):
    os.makedirs(_directory, exist_ok=True)

In [None]:
# QuickDraw categories used for this dataset. The position of a name in this
# list is the integer label assigned to its samples.
CLASSES = [
    "apple",
    "bee",
    "cat",
    "eyeglasses",
    "fish",
    "flower",
    "house",
    "pencil",
    "pizza",
]

# Number of QuickDraw drawings taken per class (tunable).
SAMPLES_PER_CLASS = 500

In [None]:
# Function to download (once) and load QuickDraw data for a single class
def load_quickdraw_class(class_name, num_samples):
    """Return the first ``num_samples`` entries of a QuickDraw ``.npy`` class file.

    The file is cached as ``<RAW_DATA_DIR>/<class_name>.npy`` and downloaded
    only when that cache file is missing or empty.

    Args:
        class_name: QuickDraw category name (e.g. ``"apple"``); it is
            URL-quoted, so names containing spaces work as well.
        num_samples: Number of leading entries to return.

    Returns:
        A NumPy array holding a copy of the first ``num_samples`` entries
        along the first axis (fewer if the file is shorter).

    Raises:
        urllib.error.URLError: If the download fails (includes HTTP errors).
        urllib.error.ContentTooShortError: If the download is truncated.
    """
    url = f"https://storage.googleapis.com/quickdraw_dataset/full/numpy_bitmap/{urllib.parse.quote(class_name)}.npy"
    file_path = os.path.join(RAW_DATA_DIR, f"{class_name}.npy")

    # An empty file is what an earlier failed download leaves behind; treat it
    # as missing so the cache cannot get stuck on a broken file.
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        partial_path = file_path + ".part"
        try:
            # Download to a side file and rename only on success, so an
            # interrupted transfer never looks like a valid cached file.
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Memory-map the file and copy just the requested slice: returning a view
    # of a fully loaded array would keep the whole class file in memory.
    data = np.load(file_path, mmap_mode="r")
    return np.array(data[:num_samples])

# Fetch the first SAMPLES_PER_CLASS drawings of every selected class,
# keyed by class name.
quickdraw_data = {
    class_name: load_quickdraw_class(class_name, SAMPLES_PER_CLASS)
    for class_name in CLASSES
}

# Keep a backup of the sampled raw data next to the downloaded files.
# NOTE(review): np.save receives a dict here, so the file presumably stores a
# pickled object array and needs allow_pickle=True to be read back — confirm.
backup_path = os.path.join(RAW_DATA_DIR, "quickdraw_raw.npy")
np.save(backup_path, quickdraw_data)

In [None]:
# Function to load new processed images
def load_new_images(data_dir):
    """Load the per-class image folders under ``data_dir`` as 28x28 arrays.

    Every name in ``CLASSES`` is looked up as a sub-directory of ``data_dir``;
    classes without a sub-directory are skipped. Each file is converted to
    grayscale, resized to 28x28, converted to a tensor and normalized with
    mean 0.5 / std 0.5. Files that fail to load are reported and skipped.

    Args:
        data_dir: Directory containing one sub-directory per class name.

    Returns:
        ``(images, labels)`` where ``images`` is a float32 array of shape
        ``(N, 28, 28)`` and ``labels`` an int64 array of shape ``(N,)`` with
        each image's index in ``CLASSES``. When nothing is loaded the arrays
        are empty but keep those shapes and dtypes, so they can still be
        concatenated with other ``(M, 28, 28)`` data.
    """
    transform = transforms.Compose([
        transforms.Grayscale(),
        transforms.Resize((28, 28)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    images = []
    labels = []
    for idx, cls in enumerate(CLASSES):
        class_dir = os.path.join(data_dir, cls)
        if not os.path.isdir(class_dir):
            continue
        # Sorted listing: os.listdir order is arbitrary, and the sample order
        # feeds the seeded train/val/test split later on.
        for img_file in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, img_file)
            try:
                # Context manager closes the underlying file handle promptly.
                with Image.open(img_path) as raw_image:
                    image = transform(raw_image.convert("L"))  # Convert to grayscale
                images.append(image.squeeze(0).numpy())  # Remove extra channel
                labels.append(idx)
            except Exception as e:
                # Best effort: a single unreadable file must not abort the load.
                print(f"Error loading {img_file}: {e}")

    if not images:
        # np.array([]) would have shape (0,) and float64 labels, which breaks
        # concatenation with (N, 28, 28) images and integer labels.
        return np.empty((0, 28, 28), dtype=np.float32), np.empty((0,), dtype=np.int64)

    return (
        np.stack(images).astype(np.float32, copy=False),
        np.array(labels, dtype=np.int64),
    )

In [None]:
# Load the additional processed images from NEW_DATA_DIR together with their
# labels (each label is the image's class index in CLASSES).
new_images, new_labels = load_new_images(NEW_DATA_DIR)

In [None]:
# Convert QuickDraw data to image/label arrays on the same value scale as the
# new images. load_new_images() applies ToTensor + Normalize((0.5,), (0.5,)),
# so the raw QuickDraw bitmaps get the equivalent scaling here; otherwise the
# combined dataset would mix raw intensities with normalized values.
# NOTE(review): assumes the QuickDraw bitmaps are 8-bit (0-255) intensities — confirm.
# NOTE(review): stroke/background polarity of the two sources is not checked here.
quickdraw_images = []
quickdraw_labels = []

for idx, cls in enumerate(CLASSES):
    images = quickdraw_data[cls].reshape(-1, 28, 28)  # Reshape for consistency
    # Same arithmetic as ToTensor (divide by 255) followed by Normalize(0.5, 0.5).
    images = (images.astype(np.float32) / 255.0 - 0.5) / 0.5
    quickdraw_images.append(images)
    quickdraw_labels.extend([idx] * len(images))

quickdraw_images = np.vstack(quickdraw_images)  # Stack into one array
quickdraw_labels = np.array(quickdraw_labels, dtype=np.int64)

In [None]:
# Merge both sources along the sample axis: QuickDraw samples first,
# followed by the new processed images.
combined_images = np.concatenate([quickdraw_images, new_images], axis=0)
combined_labels = np.concatenate([quickdraw_labels, new_labels], axis=0)

In [None]:
# Report how many samples the merged dataset contains
print(f"Total dataset size: {len(combined_images)} images")

Total dataset size: 5463 images


In [None]:
# Split dataset into Train (60%), Val (20%), Test (20%).
# Both splits are stratified on the labels so every subset keeps the class
# proportions of the combined dataset; a plain random split can leave a class
# under-represented, especially in the smaller val/test sets.
# The fixed random_state keeps the split reproducible.
# Note: stratification needs at least two samples of every class per split.
train_images, temp_images, train_labels, temp_labels = train_test_split(
    combined_images,
    combined_labels,
    test_size=0.4,
    random_state=42,
    stratify=combined_labels,
)

# Halve the held-out 40% into the validation and test sets.
val_images, test_images, val_labels, test_labels = train_test_split(
    temp_images,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

In [None]:
# Persist every split to OUTPUT_DIR in .npy format, images and labels as
# separate files.
split_files = [
    ("train_images.npy", train_images),
    ("train_labels.npy", train_labels),
    ("val_images.npy", val_images),
    ("val_labels.npy", val_labels),
    ("test_images.npy", test_images),
    ("test_labels.npy", test_labels),
]
for file_name, split_array in split_files:
    np.save(os.path.join(OUTPUT_DIR, file_name), split_array)

# Summarize the size of each split.
print(f"Train size: {len(train_images)}, Val size: {len(val_images)}, Test size: {len(test_images)}")

Train size: 3277, Val size: 1093, Test size: 1093
