# Data Acquisition - 17 and 102 Flowers Datasets

Execute this notebook to download both datasets and organize them into the folders required for training, validation, and testing.

## Original datasets

### Download

In [None]:
import os

# One download directory per dataset; existing directories are left untouched.
for dataset_dir in ("data/17flowers", "data/102flowers"):
    os.makedirs(dataset_dir, exist_ok=True)

Download datasets

In [None]:
# 17 Flowers: image archive, split definitions, and trimaps archive.
!wget --directory-prefix=data/17flowers https://thor.robots.ox.ac.uk/datasets/flowers-17/17flowers.tgz
!wget --directory-prefix=data/17flowers https://thor.robots.ox.ac.uk/datasets/flowers-17/datasplits.mat
!wget --directory-prefix=data/17flowers https://thor.robots.ox.ac.uk/datasets/flowers-17/trimaps.tgz

In [None]:
# 102 Flowers: image archive, per-image labels, and split ids.
!wget --directory-prefix=data/102flowers https://thor.robots.ox.ac.uk/datasets/flowers-102/102flowers.tgz
!wget --directory-prefix=data/102flowers https://thor.robots.ox.ac.uk/datasets/flowers-102/imagelabels.mat
!wget --directory-prefix=data/102flowers https://thor.robots.ox.ac.uk/datasets/flowers-102/setid.mat

Extract

In [None]:
# Unpack each gzip-compressed image archive into its own dataset directory.
!tar --extract --gzip --file=data/17flowers/17flowers.tgz --directory=data/17flowers
!tar --extract --gzip --file=data/102flowers/102flowers.tgz --directory=data/102flowers

Clean-up

In [None]:
# The image archives are no longer needed once extracted.
!rm data/102flowers/102flowers.tgz data/17flowers/17flowers.tgz

### Train/Val/Test splits

Imports

In [None]:
import scipy.io
import pathlib
import shutil
import re  # Regex
import os

Utility functions

In [None]:
def create_subfolders(path, labels):
    """
    Create train/val/test subfolders with label folders.

    Args:
        path: Dataset root directory. A trailing slash is optional.
        labels: Class names; one folder per label is created inside each split.
    """
    # Materialize once so a one-shot iterable still yields every label for
    # all three splits.
    labels = list(labels)
    for split_name in ("train", "val", "test"):
        for label in labels:
            # os.path.join supplies the separators, so `path` works with or
            # without a trailing slash.
            os.makedirs(os.path.join(path, split_name, label), exist_ok=True)
    print(f"All subfolders created at {path}.")

def move_images_to_subfolders_17f(path, labels, train_ids, val_ids, test_ids, images_per_label=80):
    """
    Copy images from `path/jpg` to `path` subfolders train/val/test and their labels.

    The source files are copied, not moved, so `path/jpg` is left intact.

    Args:
        path: Dataset root holding the `jpg/` folder and the split/label
            subfolders. A trailing slash is optional.
        labels: Class names, ordered so that each consecutive run of
            `images_per_label` image ids belongs to the next label.
        train_ids: Image ids belonging to the training split.
        val_ids: Image ids belonging to the validation split.
        test_ids: Image ids belonging to the test split.
        images_per_label: Number of consecutive image ids per class.

    Files whose id is in none of the splits, or whose name contains no
    digits, are reported and skipped.
    """
    src_path = os.path.join(path, "jpg")

    # Build the id -> split lookup once. `setdefault` keeps the first match,
    # preserving the train > val > test precedence of an if/elif chain.
    split_of = {}
    for split_name, ids in (("train", train_ids), ("val", val_ids), ("test", test_ids)):
        for image_id in ids:
            split_of.setdefault(int(image_id), split_name)

    for filename in os.listdir(src_path):
        if not filename.endswith(".jpg"):
            continue

        # Get the id of the image from its filename
        digits = re.findall(r'\d+', filename)
        if not digits:
            print(f"{filename} has no numeric id; skipping.")
            continue
        file_id = int(digits[0])

        # Check which split the file belongs to. Skip unassigned files so
        # they are never copied into a split left over from a previous file.
        split = split_of.get(file_id)
        if split is None:
            print(f"{filename} isn't associated with any splits.")
            continue

        # File ids start from 1, so subtract one before bucketing by label.
        subfolder_id = (file_id - 1) // images_per_label
        dst_path = os.path.join(path, split, labels[subfolder_id])

        # Copy the image into the subfolder
        shutil.copy(os.path.join(src_path, filename), os.path.join(dst_path, filename))
    print(f"Images copied successfully to {path} train/val/test subfolders.")

# Copy the images into their split/class subfolders
def move_images_to_subfolders_102f(path, labels, mat_labels, train_ids, val_ids, test_ids):
    """
    Copy images from `path/jpg` to their train/val/test class subfolders.

    The source files are copied, not moved, so `path/jpg` is left intact.

    Args:
        path: Dataset root holding the `jpg/` folder and the split/label
            subfolders. A trailing slash is optional.
        labels: Class names, indexed by (class number - 1).
        mat_labels: 1-based class number of each image, indexed by
            (image id - 1).
        train_ids: Image ids belonging to the training split.
        val_ids: Image ids belonging to the validation split.
        test_ids: Image ids belonging to the test split.

    Files whose id is in none of the splits, or whose name contains no
    digits, are reported and skipped.
    """
    src_path = os.path.join(path, "jpg")

    # Build the id -> split lookup once. `setdefault` keeps the first match,
    # preserving the train > val > test precedence of an if/elif chain.
    split_of = {}
    for split_name, ids in (("train", train_ids), ("val", val_ids), ("test", test_ids)):
        for image_id in ids:
            split_of.setdefault(int(image_id), split_name)

    for filename in os.listdir(src_path):
        if not filename.endswith(".jpg"):
            continue

        # Get the id of the image from its filename
        digits = re.findall(r'\d+', filename)
        if not digits:
            print(f"{filename} has no numeric id; skipping.")
            continue
        file_id = int(digits[0])

        # Get the class for the image. int() avoids fixed-width integer
        # arithmetic when `mat_labels` is a NumPy array.
        file_class = labels[int(mat_labels[file_id - 1]) - 1]

        # Check which split the file belongs to. Skip unassigned files so
        # they are never copied into a split left over from a previous file.
        split = split_of.get(file_id)
        if split is None:
            print(f"{filename} isn't associated with any splits.")
            continue

        dst_path = os.path.join(path, split, file_class)

        # Copy the image into the subfolder
        shutil.copy(os.path.join(src_path, filename), os.path.join(dst_path, filename))
    print(f"Images copied successfully to {path}.")

Organize 17 Flowers into train/val/test:

In [None]:
# list of class labels
with open("17flowers_labels.txt", "r") as f:
    flower_labels_17 = [line.strip() for line in f]

# Splits file
mat = scipy.io.loadmat('./data/17flowers/datasplits.mat')

# The image ids for each split
train_ids = mat["trn1"][0]
val_ids = mat["val1"][0]
test_ids = mat["tst1"][0]

path_to_data = "data/17flowers/"

create_subfolders(path_to_data, flower_labels_17)
move_images_to_subfolders_17f(path_to_data, flower_labels_17, train_ids, val_ids, test_ids)

# Split directories
train_dir = pathlib.Path(path_to_data + "train")
val_dir = pathlib.Path(path_to_data + "val")
test_dir = pathlib.Path(path_to_data + "test")

# Expected size of each split (from the split file) versus the number of
# images actually found in the label folders of that split.
train_size = len(train_ids)
val_size = len(val_ids)
test_size = len(test_ids)
train_count = len(list(train_dir.glob('*/*.jpg')))
val_count = len(list(val_dir.glob('*/*.jpg')))
test_count = len(list(test_dir.glob('*/*.jpg')))

print(f"Number of images at {path_to_data}: {train_count}/{train_size} (train), {val_count}/{val_size} (val), {test_count}/{test_size} (test)")

# Assertions: each message reports the expected size of its own split.
assert train_count == train_size, f"Expected {train_size} images, but {train_dir} has {train_count}"
assert val_count == val_size, f"Expected {val_size} images, but {val_dir} has {val_count}"
assert test_count == test_size, f"Expected {test_size} images, but {test_dir} has {test_count}"

Organize 102 Flowers into train/val/test:

In [None]:
# set the path to the folder containing the images
path_to_data = "data/102flowers/"

# Load splits. The ids get 102-specific names so that the 17 Flowers
# `train_ids`/`val_ids`/`test_ids` are not overwritten; a later cell reuses
# those together with the 17 Flowers labels and split sizes.
mat_splits = scipy.io.loadmat('./data/102flowers/setid.mat')
train_ids_102 = mat_splits["trnid"][0]
val_ids_102 = mat_splits["valid"][0]
test_ids_102 = mat_splits["tstid"][0]

# list of class labels
with open("102flowers_labels.txt", "r") as f:
    flower_labels_102 = [line.strip() for line in f]

# Class labels
mat_labels = scipy.io.loadmat('./data/102flowers/imagelabels.mat')["labels"][0]

create_subfolders(path_to_data, flower_labels_102)
move_images_to_subfolders_102f(path_to_data, flower_labels_102, mat_labels, train_ids_102, val_ids_102, test_ids_102)

# Same sanity check as for 17 Flowers: every split folder must contain
# exactly as many images as the split file lists.
for split_name_102, split_ids_102 in (("train", train_ids_102), ("val", val_ids_102), ("test", test_ids_102)):
    split_dir_102 = pathlib.Path(path_to_data + split_name_102)
    split_count_102 = len(list(split_dir_102.glob('*/*.jpg')))
    print(f"Number of images at {split_dir_102}: {split_count_102}/{len(split_ids_102)}")
    assert split_count_102 == len(split_ids_102), f"Expected {len(split_ids_102)} images, but {split_dir_102} has {split_count_102}"

## Color constancy dataset using fc4

Be sure to initialize and update the submodules:

```sh
git submodule init
git submodule update
```

In [None]:
import os

# The FC4 outputs are later moved into data/<dataset>/cc/jpg and organized
# under data/<dataset>/cc/, so those are the directories that must exist.
for cc_jpg_dir in ("data/17flowers/cc/jpg", "data/102flowers/cc/jpg"):
    os.makedirs(cc_jpg_dir, exist_ok=True)

### 17flowers

Run FC4 to produce the color-corrected (CC) version of the dataset.

In [None]:
# Switch the working directory to the FC4 checkout; the following cells use
# paths relative to it (e.g. `../../data/...` and `cc_outputs`).
%cd thirdparty/fc4-python3

In [None]:
import subprocess
import sys
import cv2
import os

from fcn import FCN
from config import *
from utils import get_session


def run_fc4(src_path, fcn):
    for filename in os.listdir(src_path):
        if filename.endswith(".jpg"):
            path = 'data/17flowers/jpg/'+ filename
            base = os.path.dirname(os.path.abspath('./../')) # 実行ファイルのディレクトリ名
            target_path = os.path.join(base, path) # パスの連結
            img = cv2.imread(target_path)
            # reverse gamma correction for sRGB
            img = (img / 255.0) ** 2.2 * 65536
            images = [img]
            fcn.test_external(images=images, fns=[target_path])


# Process the 17 Flowers images with an FCN built from the ColorChecker
# checkpoint file.
src_path = '../../data/17flowers/jpg/'
with get_session() as sess:
    fcn = FCN(sess=sess, name='../../pretrained_colorchecker/colorchecker_fold1and2.ckpt')
    fcn.load_absolute('../../pretrained_colorchecker/colorchecker_fold1and2.ckpt')
    run_fc4(src_path, fcn)


Transfer files to the correct directory and clean-up: 

In [None]:
# `mv` requires the destination directory to exist; create it first
# (a no-op when it is already there).
!mkdir -p ../../data/17flowers/cc/jpg
!mv cc_outputs/* ../../data/17flowers/cc/jpg
!rm -rf cc_outputs

### 102 Flowers

In [None]:
# Same FC4 pass as above, this time over the 102 Flowers images.
with get_session() as sess:
    fcn = FCN(sess=sess, name='../../pretrained_colorchecker/colorchecker_fold1and2.ckpt')
    fcn.load_absolute('../../pretrained_colorchecker/colorchecker_fold1and2.ckpt')
    src_path = '../../data/102flowers/jpg/'
    run_fc4(src_path, fcn)

Transfer and clean-up:

In [None]:
# `mv` requires the destination directory to exist; create it first
# (a no-op when it is already there).
!mkdir -p ../../data/102flowers/cc/jpg
!mv cc_outputs/* ../../data/102flowers/cc/jpg
!rm -rf cc_outputs

Organize the directories.

In [None]:
# Go back up two levels, undoing the earlier `%cd thirdparty/fc4-python3`,
# so the `data/...` paths below resolve again.
%cd ../..

In [None]:
# Each colour-corrected dataset needs its own labels, split ids, and copy
# routine. The split files are reloaded here so this cell does not depend on
# whichever dataset last assigned the global `train_ids`/`val_ids`/`test_ids`.
splits_17_cc = scipy.io.loadmat('./data/17flowers/datasplits.mat')
splits_102_cc = scipy.io.loadmat('./data/102flowers/setid.mat')
image_labels_102_cc = scipy.io.loadmat('./data/102flowers/imagelabels.mat')["labels"][0]

cc_datasets = [
    # (root, labels, (train ids, val ids, test ids), per-image class numbers or None)
    ("data/17flowers/cc/", flower_labels_17,
     (splits_17_cc["trn1"][0], splits_17_cc["val1"][0], splits_17_cc["tst1"][0]), None),
    ("data/102flowers/cc/", flower_labels_102,
     (splits_102_cc["trnid"][0], splits_102_cc["valid"][0], splits_102_cc["tstid"][0]), image_labels_102_cc),
]

# NOTE(review): this assumes FC4 writes .jpg files that keep the numeric id of
# the input image in their name; confirm against the contents of cc/jpg.
for path_to_data_cc, labels_cc, (train_ids_cc, val_ids_cc, test_ids_cc), image_labels_cc in cc_datasets:
    create_subfolders(path_to_data_cc, labels_cc)
    if image_labels_cc is None:
        # 17 Flowers: the class follows from the image id alone.
        move_images_to_subfolders_17f(path_to_data_cc, labels_cc, train_ids_cc, val_ids_cc, test_ids_cc)
    else:
        # 102 Flowers: the class comes from the per-image label array.
        move_images_to_subfolders_102f(path_to_data_cc, labels_cc, image_labels_cc, train_ids_cc, val_ids_cc, test_ids_cc)

    train_dir_cc = pathlib.Path(path_to_data_cc + "train")
    val_dir_cc = pathlib.Path(path_to_data_cc + "val")
    test_dir_cc = pathlib.Path(path_to_data_cc + "test")

    train_count_cc = len(list(train_dir_cc.glob('*/*.jpg')))
    val_count_cc = len(list(val_dir_cc.glob('*/*.jpg')))
    test_count_cc = len(list(test_dir_cc.glob('*/*.jpg')))

    # Expected sizes come from this dataset's own split file.
    train_size_cc = len(train_ids_cc)
    val_size_cc = len(val_ids_cc)
    test_size_cc = len(test_ids_cc)

    print(f"Number of images at {path_to_data_cc}: {train_count_cc}/{train_size_cc} (train), {val_count_cc}/{val_size_cc} (val), {test_count_cc}/{test_size_cc} (test)")

    assert train_count_cc == train_size_cc, f"Expected {train_size_cc} images, but {train_dir_cc} has {train_count_cc}"
    assert val_count_cc == val_size_cc, f"Expected {val_size_cc} images, but {val_dir_cc} has {val_count_cc}"
    assert test_count_cc == test_size_cc, f"Expected {test_size_cc} images, but {test_dir_cc} has {test_count_cc}"

## Clean-up

In [None]:
# Remove the flat jpg folders (original and colour-corrected).
!rm -r data/17flowers/jpg
!rm -r data/102flowers/jpg
!rm -r data/17flowers/cc/jpg
!rm -r data/102flowers/cc/jpg
# Remove the split definition files.
!rm data/17flowers/datasplits.mat
!rm data/102flowers/setid.mat