### Prepare csvs for autoencoder data loader

Creates a csv of a dataset for hyperspectral images to be fed into the autoencoder. Each row contains the following information:

1. type of data (exr or png)
2. path to directory containing the data
3. root name of the data file (if png) or the name of the exr file (if exr)
4. Row of upper left corner of patch
5. Col of upper left corner of patch
6. Final side length of patch (should be square)
7. scale factor (1, 0.5, or 2, according to whether the patch is to be used as-is, downsampled by 0.5, or upscaled by 2)
8. normal/flipped - whether the image is to be flipped across the vertical axis.

See implementation of HyperspectralDataset for the way to use the resulting .csv files.

In [14]:
import csv
import numpy as np
import os
import os.path
from collections import defaultdict

In [15]:
# KAIST
import OpenEXR as exr
KAISTdir = "data/KAIST"
# File names of the reflectance scenes, numbered 01 through 30.
KAISTimages = list(map("scene{:02}_reflectance.exr".format, range(1, 31)))

In [20]:
# CAVE
from PIL import Image
CAVEdir = "data/CAVE"
# The first item yielded by os.walk describes CAVEdir itself; its directory
# list holds the immediate subdirectories, each treated as one image.
g = os.walk(CAVEdir)
_, CAVEimages, _ = next(g)
# Directory entries come back in arbitrary, filesystem-dependent order.
# Sort them so that a fixed random seed yields the same samples and the
# same train/val/test split on every machine.
CAVEimages = sorted(CAVEimages)

In [21]:
def generate_random_samples(samples_per_image, imagetype, imagedir, imagename, side_length, nrows, ncols):
    """Generate randomly placed patch descriptors for one image.

    For each scale factor (1, 0.5, 2) and each flip flag (False, True),
    ``samples_per_image`` patches are drawn from NumPy's global RNG, so the
    result holds ``6 * samples_per_image`` entries.

    Args:
        samples_per_image: number of patches per (scale, flip) combination.
        imagetype: stored under "type" (callers in this file pass "exr" or "png").
        imagedir: stored under "dir".
        imagename: stored under "name".
        side_length: final side length of the square patch, stored under "side".
        nrows: number of rows in the source image.
        ncols: number of columns in the source image.

    Returns:
        A list of dicts with the keys "type", "dir", "name", "side", "row",
        "col", "scale" and "flip".

    Raises:
        ValueError: raised by ``np.random.randint`` when the image leaves no
            room for a patch at one of the scale factors.
    """
    data = []
    for scale in [1, 0.5, 2]:
        # A patch that is side_length wide after rescaling by `scale` covers
        # side_length / scale pixels of the source image, so the top left
        # corner has to stay that far away from the bottom/right border.
        # The bound is derived from the side_length argument (not from a
        # module-level variable) and made an explicit integer for randint.
        row_limit = int(nrows - side_length / scale)
        col_limit = int(ncols - side_length / scale)
        for flip in [False, True]:
            for _ in range(samples_per_image):
                # Sample a random coordinate for the top left corner.
                row = np.random.randint(0, row_limit)
                col = np.random.randint(0, col_limit)
                data.append({
                    "type": imagetype,
                    "dir": imagedir,
                    "name": imagename,
                    "side": side_length,
                    "row": row,
                    "col": col,
                    "scale": scale,
                    "flip": flip,
                })
    return data

In [22]:
# Function for splitting into training, val, and test datasets
def split_train_val_test(ntotal, train, val):
    """Randomly partition the indices 0..ntotal-1 into three lists.

    ntotal: int - number of examples
    train: float - fraction of ntotal that goes to the training split
    val: float - fraction of ntotal that goes to the validation split
    Everything past the train and val cut-offs forms the test split.

    Returns a tuple (train_indices, val_indices, test_indices) of lists.
    The shuffle uses NumPy's global random state.
    """
    shuffled = np.random.permutation(range(ntotal))
    train_end = int(ntotal * train)
    val_end = int(ntotal * (train + val))

    train_part = shuffled[:train_end]
    val_part = shuffled[train_end:val_end]
    test_part = shuffled[val_end:]
    return train_part.tolist(), val_part.tolist(), test_part.tolist()

# Function for writing a subset of a dataset to a file
def write_dataset(alldata, fieldnames, selection, filepath):
    """Write the selected groups of rows to a csv file, preceded by a header.

    alldata: sequence of groups; each group is an iterable of dicts, and
        every dict becomes one csv row.
    fieldnames: column names, in the order they are written.
    selection: iterable of indices into alldata; the groups are written in
        this order.
    filepath: path of the csv file; an existing file is overwritten.
    """
    # The csv module writes its own line terminators, so the file must be
    # opened with newline="" - otherwise platforms that translate "\n"
    # (Windows) end up with an empty line after every row.
    with open(filepath, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames)
        writer.writeheader()
        for i in selection:
            writer.writerows(alldata[i])

In [49]:
# Sampling parameters
# Side length of the square patches; the sampling cells below pass it to
# generate_random_samples as side_length.
side = 96
# Seed NumPy's global random number generator.
np.random.seed(0)

In [50]:
# Generate patch samples for the KAIST images
np.random.seed(0) # Seeds NumPy's global RNG, which the patch sampling below draws from
KAISTdata = [] # List of lists: one list of sample dicts per image
samples_per_image = 120
fieldnames = ["type", "dir", "name", "row", "col", "side", "scale", "flip"]
for image in KAISTimages:
    # Load image and extract dimensions:
    file = exr.InputFile(os.path.join(KAISTdir, image))
    try:
        header = file.header()
        # The window maxima are inclusive pixel coordinates, hence the +1.
        ncols = header["displayWindow"].max.x+1
        nrows = header["displayWindow"].max.y+1
    finally:
        # Release the file handle even if reading the header fails.
        file.close()
    # generate_random_samples is the sampler defined in this notebook.
    KAISTdata.append(generate_random_samples(samples_per_image, "exr", KAISTdir, image, side, nrows, ncols))
                
# train, val, test = split_train_val_test(len(KAISTdata), 0.8, 0.1)

# # Train
# write_dataset(KAISTdata, fieldnames, train, "data/kaist_set/kaist_train_large.csv")
# # Val
# write_dataset(KAISTdata, fieldnames, val, "data/kaist_set/kaist_val_large.csv")
# # Test
# write_dataset(KAISTdata, fieldnames, test, "data/kaist_set/kaist_test_large.csv")


In [51]:
CAVEdata = [] # One list of sample dicts per image
# Images with weird behavior:
blacklist = ["watercolors_ms"]
samples_per_image = 30
fieldnames = ["type", "dir", "name", "row", "col", "side", "scale", "flip"]
for image in CAVEimages:
    if image in blacklist:
        continue
    # Load image and extract dimensions:
    imagedir = os.path.join(CAVEdir, image, image)
    # The first png of the image ("<name>_01.png") is opened to read the size.
    imagefile = "{}_01.png".format(image)
    # PIL reports size as (width, height), i.e. (columns, rows). The context
    # manager closes the file once the size has been read.
    with Image.open(os.path.join(imagedir, imagefile)) as img:
        ncols, nrows = img.size
    CAVEdata.append(generate_random_samples(samples_per_image, "png", imagedir, image, side, nrows, ncols))
               
# with open("data/cave_data.csv", "w") as f:     
#     writer = csv.DictWriter(f, fieldnames)
#     writer.writeheader()
#     for entry in CAVEdata:
#         writer.writerow(entry)


In [52]:
# Pool the per-image sample lists of both datasets. The split below works on
# whole images, since every element of alldata is one image's sample list.
alldata = KAISTdata + CAVEdata

train, val, test = split_train_val_test(len(alldata), 0.8, 0.1)

# Train, Val, Test - written in that order
for selection, filepath in [
    (train, "data/train_large.csv"),
    (val, "data/val_large.csv"),
    (test, "data/test_large.csv"),
]:
    write_dataset(alldata, fieldnames, selection, filepath)