<a href="https://colab.research.google.com/github/lorenzrossi/MachineLearning-CatsDogs-NN/blob/master/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download the Cats and Dogs Dataset

The dataset used is the famous **Dogs vs Cats** dataset from Kaggle:
- **Kaggle Competition**: https://www.kaggle.com/c/dogs-vs-cats
- **Direct Dataset Link**: https://www.kaggle.com/c/dogs-vs-cats/data

The training set contains 25,000 labelled images of cats and dogs (12,500 of each).

### Option 1: Download using the Kaggle API (Recommended)

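To use this method, you need to:
1. Go to https://www.kaggle.com/account and create an API token (this downloads a `kaggle.json` file)
2. Accept the competition rules at https://www.kaggle.com/c/dogs-vs-cats/rules, otherwise the API download is refused
3. Upload the `kaggle.json` file to your Colab session when the cell below prompts for it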


In [None]:
# Detect if running in Google Colab
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# Install Kaggle API
if IN_COLAB:
    !pip install -q kaggle
else:
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "kaggle"])

# Setup Kaggle API credentials
import os
if IN_COLAB:
    # Upload your kaggle.json file (run this cell and upload the file when prompted)
    from google.colab import files
    uploaded = files.upload()
    
    # Move kaggle.json to the correct location
    !mkdir -p ~/.kaggle
    !mv kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
else:
    # For local systems, ensure kaggle.json is in ~/.kaggle/ directory
    kaggle_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
    os.makedirs(kaggle_dir, exist_ok=True)
    print(f"Please ensure your kaggle.json file is located at: {os.path.join(kaggle_dir, 'kaggle.json')}")

# Download the dataset
if IN_COLAB:
    !kaggle competitions download -c dogs-vs-cats
    download_dir = '/content'
    base_dir = '/content/drive/MyDrive/CatsDogs'
else:
    import subprocess
    subprocess.run(['kaggle', 'competitions', 'download', '-c', 'dogs-vs-cats'], check=True)
    # Use current directory for local systems
    download_dir = os.getcwd()
    base_dir = os.path.join(os.getcwd(), 'CatsDogs')

# Extract the downloaded archives
import zipfile
import shutil

# Outer competition archive, the training archive it may contain, and the
# folder the training images are expected in after extraction
zip_path = os.path.join(download_dir, 'dogs-vs-cats.zip')
train_zip_path = os.path.join(download_dir, 'train.zip')
train_dir = os.path.join(download_dir, 'train')

# Extract in this order: train.zip may only appear once the outer archive has
# been unpacked, so each existence check happens right before its extraction.
# An archive that is not present is skipped.
for archive_path in (zip_path, train_zip_path):
    if not os.path.exists(archive_path):
        continue
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(download_dir)

# Create the directory structure expected by the rest of the notebook
os.makedirs(os.path.join(base_dir, 'Cats'), exist_ok=True)
os.makedirs(os.path.join(base_dir, 'Dogs'), exist_ok=True)
os.makedirs(os.path.join(base_dir, 'Pickles'), exist_ok=True)

# Organize images into Cats and Dogs folders
import glob

# The destination is always given as a full file path, not just the folder:
# shutil.move(src, folder) raises shutil.Error when a file with the same name
# already exists there, which made re-running this cell fail.

# Move cat images (files starting with 'cat.')
cat_files = glob.glob(os.path.join(train_dir, 'cat.*'))
for file in cat_files:
    shutil.move(file, os.path.join(base_dir, 'Cats', os.path.basename(file)))

# Move dog images (files starting with 'dog.')
dog_files = glob.glob(os.path.join(train_dir, 'dog.*'))
for file in dog_files:
    shutil.move(file, os.path.join(base_dir, 'Dogs', os.path.basename(file)))

print("Dataset downloaded and organized successfully!")
print(f"Base directory: {base_dir}")
print(f"Number of cat images: {len(os.listdir(os.path.join(base_dir, 'Cats')))}")
print(f"Number of dog images: {len(os.listdir(os.path.join(base_dir, 'Dogs')))}")


### Option 2: Direct Download (Alternative method)

If you prefer not to use the Kaggle API, you can download the dataset manually:
1. Visit https://www.kaggle.com/c/dogs-vs-cats/data
2. Download `train.zip` (requires Kaggle account)
3. Extract it and organize images:
   - **For Google Colab**: Extract to `/content/drive/MyDrive/CatsDogs/` and organize into `Cats/` and `Dogs/` subdirectories
   - **For Local Systems**: Extract to a `CatsDogs/` folder in your current directory and organize into `Cats/` and `Dogs/` subdirectories

**Note**: A Kaggle account is required to download the dataset. If you already have the dataset organized as described above, you can skip the download cell above.


In [None]:
import os
from os import listdir
import pandas as pd
# Basic packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pickle

# Image preprocessing
import pathlib
import PIL
import PIL.Image
import cv2

# Pick the data root depending on where the notebook runs.
# In Google Colab the google.colab package is importable: mount Google Drive
# and keep the data there. Anywhere else (Windows, Mac, Linux) the import
# fails and the data lives in a folder under the current directory.
try:
    import google.colab
    from google.colab import drive
    IN_COLAB = True
    drive.mount('/content/drive')
    base_dir = '/content/drive/MyDrive/CatsDogs'
except ImportError:
    IN_COLAB = False
    base_dir = os.path.join(os.getcwd(), 'CatsDogs')

# Make sure the data root and its three sub-folders exist
os.makedirs(base_dir, exist_ok=True)
for subfolder in ('Cats', 'Dogs', 'Pickles'):
    os.makedirs(os.path.join(base_dir, subfolder), exist_ok=True)

print(f"Working directory: {base_dir}")
print(f"Running in Colab: {IN_COLAB}")

# Import of Data

# Parameters used to bring every image to the same size.
# img_size is the (width, height) pair handed to cv2.resize by create_data().
# channels = 1 matches the grayscale loading (cv2.IMREAD_GRAYSCALE) used below;
# for RGB images set channels to 3 and load with cv2.IMREAD_COLOR instead.
img_width = 100
img_height = 100
img_size = (img_width, img_height)
channels = 1  # number of colour channels per image: 1 = grayscale, 3 = RGB

# pets collects every loaded sample as a two-element list [image_array, class_index]
pets = []

# The two classes. Each name is also the name of the sub-directory of base_dir
# holding that class's images, and its position in this list is the numeric
# label (Cats -> 0, Dogs -> 1).
categories = ['Cats','Dogs']

# Load the images, turn them into arrays, label them and store them in the pets list
def create_data():
    """Load every image of every category into the module-level ``pets`` list.

    For each name in ``categories`` the folder ``base_dir/<name>`` is scanned.
    Each file is read with OpenCV as a grayscale image, resized to
    ``img_size`` and appended to ``pets`` as ``[image_array, class_index]``,
    where ``class_index`` is the position of the category in ``categories``
    (0 for cats, 1 for dogs).

    A missing category folder produces a warning and is skipped. Files that
    OpenCV cannot read or resize are skipped as well; their number is reported
    per folder so that dropped images do not go unnoticed.

    Returns:
        None. The result is the side effect on ``pets``.
    """
    # the index of the category doubles as the numeric class label
    for pet_class, category in enumerate(categories):
        path = os.path.join(base_dir, category)

        # Nothing to load if the folder is missing
        if not os.path.isdir(path):
            print(f"Warning: Directory {path} does not exist. Please download the dataset first.")
            continue

        skipped = 0
        for img in os.listdir(path):
            try:
                # IMREAD_GRAYSCALE yields a single-channel image;
                # IMREAD_COLOR would keep the colours instead.
                img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
                # imread signals an unreadable file by returning None
                if img_array is None:
                    skipped += 1
                    continue
                new_array = cv2.resize(img_array, img_size)
            except cv2.error:
                # OpenCV could not process this (apparently broken) image
                skipped += 1
                continue
            pets.append([new_array, pet_class])

        if skipped:
            print(f"Skipped {skipped} unreadable file(s) in {path}")

# Fill the pets list from disk
create_data()

print(f"Total number of images loaded: {len(pets)}")

# create_data() walks the two sub-directories one after the other, so the
# samples arrive grouped by class; shuffle them in place to mix the classes
random.shuffle(pets)

# Show the labels of the first nine samples as a quick check of the shuffle
for _, pet_label in pets[:9]:
    print(pet_label)

# Split the samples into image arrays (X) and labels (y) for the classification task.
# Each entry of pets is a two-element [image_array, class_index] list.
X = [image for image, _ in pets]
y = [label for _, label in pets]

# Stack the images into one array of shape (n_samples, height, width, channels).
# cv2.resize is given (width, height) but returns arrays shaped (height, width),
# so the height has to come first here; using (width, height) would scramble
# the pixels as soon as the two sizes differ.
X = np.array(X).reshape(-1, img_height, img_width, channels)

# Save the processed data as pickle files inside base_dir/Pickles
pickles_dir = os.path.join(base_dir, 'Pickles')
x_pickle_path = os.path.join(pickles_dir, 'X.pickle')
y_pickle_path = os.path.join(pickles_dir, 'y.pickle')

# 'with' closes every file even if dump/load raises, so the data is flushed
# to disk and no handle is left open (the read handles were never closed before).
with open(x_pickle_path, 'wb') as pickle_out:
    pickle.dump(X, pickle_out)

with open(y_pickle_path, 'wb') as pickle_out:
    pickle.dump(y, pickle_out)

# Load the pickled data back to verify that the files are readable.
# NOTE: pickle.load must only be used on files you created yourself, as here;
# unpickling data from an untrusted source can execute arbitrary code.
with open(x_pickle_path, 'rb') as pickle_in:
    X = pickle.load(pickle_in)

with open(y_pickle_path, 'rb') as pickle_in:
    y = pickle.load(pickle_in)

print(f"Data shape: X = {X.shape}, y = {len(y)} labels")

Mounted at /content/drive
24978
0
0
0
1
0
0
0
1
0
