In [37]:
import os
import pathlib
import pandas as pd
import shutil

# Input side: root of the raw data and the folder pictures are copied from.
RAW_DATA_PATH = pathlib.Path("../data/raw/")
RAW_IMAGES_PATH = RAW_DATA_PATH.joinpath("JPEGImages")

# Output side: root of the data produced by this notebook and its pictures folder.
FINAL_DATA_PATH = pathlib.Path("../data/final/")
FINAL_IMAGES_PATH = FINAL_DATA_PATH.joinpath("images")

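We want to build a dataset that contains only pictures of dogs and cats, taken from the raw image collection, together with a file giving the label of each picture.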
# Identification of pictures

In [28]:
def read_positive_images(image_set_file):
    """Return the file names of the pictures flagged "1" in an image-set file.

    Every non-blank line is split on whitespace: the first field is taken as the
    picture identifier and the second as its flag. Identifiers whose flag is
    exactly "1" are kept and ".jpg" is appended to turn them into file names.

    Parameters
    ----------
    image_set_file : path-like
        Text file to read.

    Returns
    -------
    set of str
        File names of the flagged pictures.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two fields.
    """
    positives = set()
    with open(image_set_file) as file:
        # Iterate lazily over the file instead of loading it all with readlines().
        for line in file:
            fields = line.split()
            if not fields:
                # A blank line (e.g. at the end of the file) carries no entry.
                continue
            if fields[1] == "1":
                positives.add(fields[0] + ".jpg")
    return positives


# Same parsing for both animals; only the image-set file differs.
cat_pictures = read_positive_images(RAW_DATA_PATH / "ImageSets/Main/cat_trainval.txt")
dog_pictures = read_positive_images(RAW_DATA_PATH / "ImageSets/Main/dog_trainval.txt")



In [29]:
# Report how many pictures were flagged for each animal.
counts_message = f"{len(cat_pictures)} pictures of cats and {len(dog_pictures)} pictures of dogs."
print(counts_message)

337 pictures of cats and 421 pictures of dogs.


In [30]:
# Show the pictures that appear in both the cat set and the dog set.
common_message = f"{cat_pictures.intersection(dog_pictures)} pictures in common"
print(common_message)

{'007417.jpg', '006026.jpg', '002268.jpg', '009763.jpg', '001211.jpg', '004035.jpg', '001940.jpg', '002034.jpg'} pictures in common


In [31]:
# Remove from each set the pictures that also belong to the other one, so that
# every remaining picture is in exactly one of the two sets. Both differences are
# evaluated before either name is rebound.
cat_pictures, dog_pictures = (
    cat_pictures - dog_pictures,
    dog_pictures - cat_pictures,
)

In [32]:
# Report the per-animal counts again, now that shared pictures have been removed.
counts_message = f"{len(cat_pictures)} pictures of cats and {len(dog_pictures)} pictures of dogs."
print(counts_message)

329 pictures of cats and 413 pictures of dogs.


# Making the new database
## Pictures dataset

In [34]:
# All the file names to keep: cat pictures and dog pictures together.
images = cat_pictures | dog_pictures

In [39]:
# Create the output folder together with any missing parent folders.
# exist_ok=True makes a re-run a no-op and removes the gap between a separate
# exists() check and the creation; it still raises if the path is not a directory.
FINAL_IMAGES_PATH.mkdir(parents=True, exist_ok=True)

# Copy every selected picture, keeping its original file name.
for image in images:
    shutil.copy(RAW_IMAGES_PATH / image, FINAL_IMAGES_PATH / image)

## Labels

In [40]:
# Turn the collections into sorted lists. Set iteration order for strings can
# change from one interpreter run to the next, so sorting is what makes the row
# order of the table reproducible.
cat_pictures = sorted(cat_pictures)
dog_pictures = sorted(dog_pictures)

# One row per picture: its file name and its "cat" / "dog" label, cats first.
# The table is built in a single constructor call because DataFrame.append was
# removed in pandas 2.0; this also gives a unique 0..n-1 index rather than the
# repeated index labels that appending two frames produced.
labels = pd.DataFrame(
    {
        "path": cat_pictures + dog_pictures,
        "label": ["cat"] * len(cat_pictures) + ["dog"] * len(dog_pictures),
    }
)

In [48]:
# Shuffle the rows so that cats and dogs are interleaved. A fixed seed applies the
# same permutation on every run; change SHUFFLE_SEED to draw a different order.
SHUFFLE_SEED = 42
labels = labels.sample(frac=1, random_state=SHUFFLE_SEED)

In [50]:
# Write the labels table next to the pictures folder, without the index column.
labels_file = FINAL_DATA_PATH / "labels.csv"
labels.to_csv(labels_file, index=False)