# Data Management

*This notebook is to manage data from our two sources and create train/eval/test samples.*

**⚠️ Please start by making a copy of downloaded images to avoid any problem!**

In [1]:
import os
import random
import warnings
import pandas as pd
from tqdm import tqdm
import PIL.Image

In [2]:
DATA_PATH = "../../data/"
SOURCES = ["google_image", "open_images"]

## 1 - Remove errors

First step is to remove all images with potentials errors.

In [3]:
def remove_errors(sources):
    warnings.filterwarnings("error")  # to catch warnings as errors
    for source in sources:
        good, errors = 0, 0
        source_path = DATA_PATH + source
        labels = set(os.listdir(source_path)) - set([".DS_Store"])
        for label in tqdm(labels):
            for image in os.listdir(f"{source_path}/{label}"):
                image_path = f"{source_path}/{label}/{image}"
                try:
                    with PIL.Image.open(image_path) as img:
                        img.convert("RGB")
                        exif_data = img._getexif()
                        good += 1
                except:
                    os.remove(image_path)
                    errors += 1
        print(f"{source}: {errors} images removed - {good} kept.")
    warnings.resetwarnings()

In [4]:
remove_errors(SOURCES)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [01:24<00:00,  2.82s/it]


google_image: 181 images removed - 37852 kept.


100%|██████████| 30/30 [02:29<00:00,  5.00s/it]

open_images: 0 images removed - 81558 kept.





## 2 - Distributions

In [5]:
distribution = pd.DataFrame()

for source in SOURCES:
    image_path = DATA_PATH + source
    labels = set(os.listdir(image_path)) - set([".DS_Store"])
    for label in labels:
        label_path = f"{image_path}/{label}"
        distribution.loc[label, source] = len(os.listdir(label_path))

distribution.sort_index()
distribution.loc["total"] = distribution[SOURCES].sum()
distribution["total"] = distribution[SOURCES].sum(axis=1)
distribution = distribution.astype(int)

distribution

Unnamed: 0,google_image,open_images,total
rainy,1489,444,1933
cooking,1238,8123,9361
nature,1279,6708,7987
reading,1904,3038,4942
art,1320,524,1844
running,1234,2861,4095
gym,1513,1811,3324
board_games,1146,1124,2270
bedroom,1552,2724,4276
beer,939,3062,4001


## 3 - Train/valid/test split

In [6]:
TRAIN_PATH = DATA_PATH + "train/"
VALID_PATH = DATA_PATH + "valid/"
TEST_PATH = DATA_PATH + "test/"

In [7]:
class_limit = 4000  # Maximum number of images per class

abbr = {"google_image": "GOOGLE", "open_images": "OPEN"}

labels = set(os.listdir(DATA_PATH + "google_image")) - set([".DS_Store"])
for label in tqdm(labels):

    # Creates repositories
    if not os.path.exists(TRAIN_PATH + label):
        os.makedirs(TRAIN_PATH + label)
    if not os.path.exists(VALID_PATH + label):
        os.makedirs(VALID_PATH + label)
    if not os.path.exists(TEST_PATH + label):
        os.makedirs(TEST_PATH + label)

    n_google = len(os.listdir(DATA_PATH + f"google_image/{label}"))
    n_open = len(os.listdir(DATA_PATH + f"open_images/{label}"))

    for source in SOURCES:

        label_path = DATA_PATH + f"{source}/{label}"
        source_images = os.listdir(label_path)

        # Try to keep balanced datasets
        if (source == "open_images") & (n_google + n_open > class_limit):
            source_images = random.sample(source_images, class_limit - n_google)

        # Move images to the right directory
        for image_idx, image_name in enumerate(source_images):
            new_name = f"{abbr[source]}_{image_name}"
            if image_idx % 13 == 0:
                os.rename(f"{label_path}/{image_name}", VALID_PATH + f"{label}/{new_name}")
            elif image_idx % 13 == 1:
                os.rename(f"{label_path}/{image_name}", TEST_PATH + f"{label}/{new_name}")
            else:
                os.rename(f"{label_path}/{image_name}", TRAIN_PATH + f"{label}/{new_name}")

100%|██████████| 30/30 [00:06<00:00,  4.35it/s]


In [8]:
distribution = pd.DataFrame()

for sample in ["train", "valid", "test"]:

    sample_path = DATA_PATH + sample
    labels = set(os.listdir(sample_path)) - set([".DS_Store"])
    for label in labels:
        label_path = f"{sample_path}/{label}"
        distribution.loc[label, sample] = len(os.listdir(label_path))

distribution.sort_index()
distribution["total"] = distribution.sum(axis=1)
distribution.loc["total"] = distribution.sum()
distribution = distribution.astype(int)

distribution

Unnamed: 0,train,valid,test,total
rainy,1633,150,150,1933
cooking,3382,309,309,4000
nature,3382,309,309,4000
reading,3382,309,309,4000
art,1558,143,143,1844
running,3384,308,308,4000
gym,2810,257,257,3324
board_games,1918,176,176,2270
bedroom,3382,309,309,4000
beer,3382,309,309,4000
