The dataset can be downloaded here:

https://www.kaggle.com/c/dogs-vs-cats/data

In [15]:
import os
import glob
import shutil

import numpy as np

In [12]:
INP_TRAIN_DIR = "data/download/train"
INP_TEST_DIR = "data/download/test1"

DATASET_DIR = "data/dataset_"

In [8]:
train_dog_files = glob.glob(os.path.join(INP_TRAIN_DIR, "dog*.jpg"))
train_cat_files = glob.glob(os.path.join(INP_TRAIN_DIR, "cat*.jpg"))

np.random.seed(1)
np.random.shuffle(train_dog_files)
np.random.shuffle(train_cat_files)


test_files = glob.glob(os.path.join(INP_TEST_DIR, "*.jpg"))

len(train_dog_files), len(train_cat_files), len(test_files)

(12500, 12500, 12500)

In [9]:
train_dog_files[:5]

['data/download/train/dog.1094.jpg',
 'data/download/train/dog.10705.jpg',
 'data/download/train/dog.784.jpg',
 'data/download/train/dog.669.jpg',
 'data/download/train/dog.1588.jpg']

In [10]:
train_cat_files[:5]

['data/download/train/cat.5044.jpg',
 'data/download/train/cat.6924.jpg',
 'data/download/train/cat.3709.jpg',
 'data/download/train/cat.5157.jpg',
 'data/download/train/cat.9790.jpg']

In [11]:
test_files[:5]

['data/download/test1/10447.jpg',
 'data/download/test1/3337.jpg',
 'data/download/test1/7556.jpg',
 'data/download/test1/5646.jpg',
 'data/download/test1/10264.jpg']

Since the true labels for test set are not available from Kaggle page, we will split the original train set into `train`, `validation` and `test` sets.

In [13]:
TRAIN_SIZE = 20_000
VAL_SIZE = 2500
TEST_SIZE = 2500

TRAIN_SIZE_HALF = TRAIN_SIZE // 2
VAL_SIZE_HALF = VAL_SIZE // 2
TEST_SIZE_HALF = TEST_SIZE // 2

In [14]:
train_dir = os.path.join(DATASET_DIR, "train")
valid_dir = os.path.join(DATASET_DIR, "validation")
test_dir = os.path.join(DATASET_DIR, "test")

train_cats_dir = os.path.join(train_dir, "cats")
train_dogs_dir = os.path.join(train_dir, "dogs")
os.makedirs(train_cats_dir, exist_ok=True)
os.makedirs(train_dogs_dir, exist_ok=True)

valid_cats_dir = os.path.join(valid_dir, "cats")
valid_dogs_dir = os.path.join(valid_dir, "dogs")
os.makedirs(valid_cats_dir, exist_ok=True)
os.makedirs(valid_dogs_dir, exist_ok=True)

test_cats_dir = os.path.join(test_dir, "cats")
test_dogs_dir = os.path.join(test_dir, "dogs")
os.makedirs(test_cats_dir, exist_ok=True)
os.makedirs(test_dogs_dir, exist_ok=True)

In [16]:
# copy files for train/dogs and train/cats
for src in train_dog_files[: TRAIN_SIZE_HALF]:
    fname = os.path.basename(src)
    dst = os.path.join(train_dogs_dir, fname)
    shutil.copyfile(src, dst)
    
for src in train_cat_files[: TRAIN_SIZE_HALF]:
    fname = os.path.basename(src)
    dst = os.path.join(train_cats_dir, fname)
    shutil.copyfile(src, dst)

In [18]:
# copy files for validation/dogs and validation/cats
for src in train_dog_files[TRAIN_SIZE_HALF: TRAIN_SIZE_HALF + VAL_SIZE_HALF]:
    fname = os.path.basename(src)
    dst = os.path.join(valid_dogs_dir, fname)
    shutil.copyfile(src, dst)
    
for src in train_cat_files[TRAIN_SIZE_HALF: TRAIN_SIZE_HALF + VAL_SIZE_HALF]:
    fname = os.path.basename(src)
    dst = os.path.join(valid_cats_dir, fname)
    shutil.copyfile(src, dst)

In [19]:
# copy files for test/dogs and test/cats
for src in train_dog_files[TRAIN_SIZE_HALF + VAL_SIZE_HALF: ]:
    fname = os.path.basename(src)
    dst = os.path.join(test_dogs_dir, fname)
    shutil.copyfile(src, dst)
    
for src in train_cat_files[TRAIN_SIZE_HALF + VAL_SIZE_HALF: ]:
    fname = os.path.basename(src)
    dst = os.path.join(test_cats_dir, fname)
    shutil.copyfile(src, dst)