<a href="https://colab.research.google.com/github/lorenzrossi/MachineLearning-CatsDogs-NN/blob/master/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download the Cats and Dogs Dataset

The dataset used is the famous **Dogs vs Cats** dataset from Kaggle:
- **Kaggle Competition**: https://www.kaggle.com/c/dogs-vs-cats
- **Direct Dataset Link**: https://www.kaggle.com/c/dogs-vs-cats/data

The dataset contains 25,000 images of cats and dogs (12,500 each).

### Option 1: Download using Kaggle API (Recommended)

To use this method, you need to:
1. Go to https://www.kaggle.com/account and create an API token
2. Upload the downloaded `kaggle.json` file when the cell below prompts for it (Google Colab), or place it at `~/.kaggle/kaggle.json` (local systems)


In [None]:
# Environment probe: the google.colab package can only be imported on Colab
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

# Decide whether a download can be attempted at all. This runs BEFORE the
# kaggle package is installed or imported.
import os
DOWNLOAD_DATASET = False
kaggle_json_path = None

if not IN_COLAB:
    # Local run: the credentials file has to be on disk already
    kaggle_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
    kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
    os.makedirs(kaggle_dir, exist_ok=True)

    DOWNLOAD_DATASET = os.path.exists(kaggle_json_path)
    if DOWNLOAD_DATASET:
        print(f"✓ Found Kaggle credentials at: {kaggle_json_path}")
    else:
        # Explain how to obtain the token, then fall through without downloading
        print("\n".join((
            "=" * 60,
            "⚠️  Kaggle API credentials not found!",
            "=" * 60,
            "\nTo download the dataset, you need to:",
            "1. Go to https://www.kaggle.com/account",
            "2. Scroll down to 'API' section",
            "3. Click 'Create New API Token' to download kaggle.json",
            f"4. Place it at: {kaggle_json_path}",
            "\nAlternatively, you can:",
            "- Skip this cell and manually download the dataset",
            "- Use the pre-prepared pickle files from Google Drive (see README.md)",
            "=" * 60,
            "\n⚠️  Skipping dataset download. You can process existing data in the next cell.",
        )))
else:
    # Colab run: the file is uploaded later in this cell, so only record
    # where it will end up and assume the upload is going to happen
    kaggle_json_path = os.path.join(os.path.expanduser('~'), '.kaggle', 'kaggle.json')
    DOWNLOAD_DATASET = True

# Only install/use Kaggle if we have credentials or are in Colab
if DOWNLOAD_DATASET:
    # Install Kaggle API
    if IN_COLAB:
        !pip install -q kaggle
        # Upload your kaggle.json file (run this cell and upload the file when prompted)
        from google.colab import files
        uploaded = files.upload()
        
        # Move kaggle.json to the correct location
        !mkdir -p ~/.kaggle
        !mv kaggle.json ~/.kaggle/
        !chmod 600 ~/.kaggle/kaggle.json
    else:
        import subprocess
        import sys
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "kaggle"], 
                                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except:
            pass  # Kaggle might already be installed

# Download the dataset (only if credentials are available)
if DOWNLOAD_DATASET:
    try:
        if IN_COLAB:
            !kaggle competitions download -c dogs-vs-cats
            download_dir = '/content'
            base_dir = '/content/drive/MyDrive/CatsDogs'
        else:
            import subprocess
            result = subprocess.run(['kaggle', 'competitions', 'download', '-c', 'dogs-vs-cats'], 
                                  capture_output=True, text=True)
            if result.returncode != 0:
                print(f"Error downloading dataset: {result.stderr}")
                raise Exception("Kaggle download failed")
            # Use current directory for local systems
            download_dir = os.getcwd()
            base_dir = os.path.join(os.getcwd(), 'CatsDogs')
        
        # Unzip the training data
        import zipfile
        import shutil
        
        # Unzip dogs-vs-cats.zip
        zip_path = os.path.join(download_dir, 'dogs-vs-cats.zip')
        if os.path.exists(zip_path):
            print("Extracting dogs-vs-cats.zip...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(download_dir)
        
        # Unzip train.zip
        train_zip_path = os.path.join(download_dir, 'train.zip')
        train_dir = os.path.join(download_dir, 'train')
        if os.path.exists(train_zip_path):
            print("Extracting train.zip...")
            with zipfile.ZipFile(train_zip_path, 'r') as zip_ref:
                zip_ref.extractall(download_dir)
        
        # Create the directory structure expected by the code
        os.makedirs(os.path.join(base_dir, 'Cats'), exist_ok=True)
        os.makedirs(os.path.join(base_dir, 'Dogs'), exist_ok=True)
        os.makedirs(os.path.join(base_dir, 'Pickles'), exist_ok=True)
        
        # Organize images into Cats and Dogs folders
        import glob
        
        if os.path.exists(train_dir):
            print("Organizing images into Cats and Dogs folders...")
            # Move cat images (files starting with 'cat.')
            cat_files = glob.glob(os.path.join(train_dir, 'cat.*'))
            for file in cat_files:
                shutil.move(file, os.path.join(base_dir, 'Cats'))
            
            # Move dog images (files starting with 'dog.')
            dog_files = glob.glob(os.path.join(train_dir, 'dog.*'))
            for file in dog_files:
                shutil.move(file, os.path.join(base_dir, 'Dogs'))
            
            print("Dataset downloaded and organized successfully!")
            print(f"Base directory: {base_dir}")
            print(f"Number of cat images: {len(os.listdir(os.path.join(base_dir, 'Cats')))}")
            print(f"Number of dog images: {len(os.listdir(os.path.join(base_dir, 'Dogs')))}")
        else:
            print(f"⚠️  Training directory not found at {train_dir}")
    except Exception as e:
        print(f"⚠️  Error during download: {e}")
        print("You can skip this step and process existing data in the next cell.")
else:
    # Set base directory even if not downloading
    if IN_COLAB:
        base_dir = '/content/drive/MyDrive/CatsDogs'
    else:
        base_dir = os.path.join(os.getcwd(), 'CatsDogs')
    
    # Create directory structure
    os.makedirs(os.path.join(base_dir, 'Cats'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'Dogs'), exist_ok=True)
    os.makedirs(os.path.join(base_dir, 'Pickles'), exist_ok=True)
    print(f"\nBase directory set to: {base_dir}")
    print("You can process existing data in the next cell.")


### Option 2: Direct Download (Alternative method)

If you prefer not to use Kaggle API, you can download the dataset manually:
1. Visit https://www.kaggle.com/c/dogs-vs-cats/data
2. Download `train.zip` (requires Kaggle account)
3. Extract it and organize images:
   - **For Google Colab**: Extract to `/content/drive/MyDrive/CatsDogs/` and organize into `Cats/` and `Dogs/` subdirectories
   - **For Local Systems**: Extract to a `CatsDogs/` folder in your current directory and organize into `Cats/` and `Dogs/` subdirectories

**Note**: The dataset requires a Kaggle account to download. If you already have the dataset organized, you can skip the download cells above.


In [None]:
import os
from os import listdir
import pandas as pd
# Basic packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pickle

# Image preprocessing
import pathlib
import PIL
import PIL.Image
import cv2

# Work out where the dataset lives. On Colab, Google Drive is mounted and the
# data folder sits inside it; anywhere else (Windows, Mac, Linux) a CatsDogs
# folder inside the current working directory is used.
try:
    import google.colab
    IN_COLAB = True
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    IN_COLAB = False
    base_dir = os.path.join(os.getcwd(), 'CatsDogs')
else:
    base_dir = '/content/drive/MyDrive/CatsDogs'

# Make sure the root folder and its three sub-folders are present
os.makedirs(base_dir, exist_ok=True)
for subfolder in ('Cats', 'Dogs', 'Pickles'):
    os.makedirs(os.path.join(base_dir, subfolder), exist_ok=True)

print(f"Working directory: {base_dir}")
print(f"Running in Colab: {IN_COLAB}")

# Image loading settings

# Every image is resized to the same width x height so the samples are uniform.
# channels = 1 goes together with cv2.IMREAD_GRAYSCALE; for colour images set
# channels to 3, which makes the loader use cv2.IMREAD_COLOR instead.
# Pixel values are scaled to the [0, 1] range (division by 255.0) further down.
img_width, img_height = 100, 100
img_size = (img_width, img_height)
channels = 1  # 1 for grayscale, 3 for RGB

# Container that receives one [image_array, label] entry per loaded image
pets = []

# The two classes; each name is also the name of a sub-directory of base_dir
categories = ['Cats', 'Dogs']

# I create a function in order to load the data, transform them into arrays, assign them to a class and store them in the pets list
def create_data():
    """Load every cat and dog image into the module-level ``pets`` list.

    For each entry of ``categories`` the sub-directory of ``base_dir`` with the
    same name is scanned for ``.png``/``.jpg``/``.jpeg`` files. Each readable
    image is loaded with OpenCV (grayscale when ``channels == 1``, colour
    otherwise), resized to ``img_size`` and appended to ``pets`` as
    ``[image_array, label]``. The label is the position of the category in
    ``categories`` (0 for 'Cats', 1 for 'Dogs').

    A missing category folder is reported and skipped. Images that cannot be
    read or resized are skipped as well, and the number of skipped files is
    printed per category so that data loss does not go unnoticed.

    Returns:
        None. The results are appended to ``pets`` in place.
    """
    # Determine OpenCV read mode based on channels
    read_mode = cv2.IMREAD_GRAYSCALE if channels == 1 else cv2.IMREAD_COLOR

    print(f"Loading images from {base_dir}...")
    print(f"Image size: {img_size}, Channels: {channels}")

    # The index of each category doubles as its class label
    for pet_class, category in enumerate(categories):
        path = os.path.join(base_dir, category)

        # A plain file with the category's name is as unusable as a missing folder
        if not os.path.isdir(path):
            print(f"Warning: Directory {path} does not exist. Please download the dataset first.")
            continue

        # Filter image files by extension (only process valid image files)
        image_files = [f for f in os.listdir(path)
                       if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        print(f"Processing {len(image_files)} {category.lower()} images...")

        skipped = 0
        for img_file in image_files:
            img_path = os.path.join(path, img_file)
            try:
                img_array = cv2.imread(img_path, read_mode)
                # cv2.imread signals an unreadable file by returning None
                if img_array is None:
                    skipped += 1
                    continue
                resized = cv2.resize(img_array, img_size)
            except cv2.error:
                # Broken image data: skip the file but keep count of it
                skipped += 1
                continue
            pets.append([resized, pet_class])

        if skipped:
            print(f"Skipped {skipped} unreadable {category.lower()} images.")

# Load all images into the pets list
create_data()

print(f"Total number of images loaded: {len(pets)}")

# Stop here with a clear message when nothing was loaded; otherwise the
# statistics below fail with an obscure "zero-size array" error.
if not pets:
    raise RuntimeError(
        f"No images were loaded from {base_dir}. "
        "Download the dataset or place the images in the Cats/ and Dogs/ folders first."
    )

# create_data() walks the two sub-directories one after the other, so the
# entries are shuffled to mix cats and dogs
print("Shuffling data...")
random.shuffle(pets)

# Split the image arrays and the labels for the classification task and
# convert them to numpy arrays with proper dtypes
X = np.array([image for image, _ in pets], dtype=np.float64)
X = X.reshape(-1, img_height, img_width, channels)
y = np.array([label for _, label in pets], dtype=np.int32)

# Normalize pixel values to [0, 1] range (important for neural networks).
# The division is done in place to avoid a second full-size copy of X.
print("Normalizing pixel values to [0, 1] range...")
X /= 255.0

print(f"Data shape: X = {X.shape}, y = {y.shape}")
print(f"X dtype: {X.dtype}, y dtype: {y.dtype}")
print(f"X range: [{X.min():.3f}, {X.max():.3f}]")

# Save the processed data as pickle files
pickles_dir = os.path.join(base_dir, 'Pickles')
x_pickle_path = os.path.join(pickles_dir, 'X.pickle')
y_pickle_path = os.path.join(pickles_dir, 'y.pickle')

print(f"\nSaving X to {x_pickle_path}...")
with open(x_pickle_path, 'wb') as f:
    pickle.dump(X, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Saving y to {y_pickle_path}...")
with open(y_pickle_path, 'wb') as f:
    pickle.dump(y, f, protocol=pickle.HIGHEST_PROTOCOL)

# Report the size of the saved files
file_size_x = os.path.getsize(x_pickle_path) / (1024 * 1024)  # MB
file_size_y = os.path.getsize(y_pickle_path) / (1024 * 1024)  # MB

print("\nPickle files saved successfully!")
print(f"X.pickle size: {file_size_x:.2f} MB")
print(f"y.pickle size: {file_size_y:.2f} MB")

# Load the pickled data back and compare it with what was saved
print("\nVerifying saved files...")
with open(x_pickle_path, 'rb') as f:
    X_loaded = pickle.load(f)

with open(y_pickle_path, 'rb') as f:
    y_loaded = pickle.load(f)

# Only claim success when the round trip really reproduced the arrays
if not (np.array_equal(X, X_loaded) and np.array_equal(y, y_loaded)):
    raise RuntimeError("Verification failed: the reloaded pickle data does not match the saved arrays.")

print("Verification successful!")
print(f"Loaded data: X = {X_loaded.shape}, y = {y_loaded.shape}")
print(f"X dtype: {X_loaded.dtype}, y dtype: {y_loaded.dtype}")
print(f"X range: [{X_loaded.min():.3f}, {X_loaded.max():.3f}]")

Mounted at /content/drive
24978
0
0
0
1
0
0
0
1
0
