# Create data file structures and sets:

This script creates csv files with file names and labels for all used data sets.
The tensorflow pipeline reads the csv files later and imports images in batches.

Used data sets:
    - Intrinsic Images in the Wild (iiw)
    - MPI Sintel data set
    - MIT data set

In [2]:
import os
import glob
import pandas as pd

In [15]:
# Directory the csv files are written to. The image and label paths stored inside the
# csv files are made relative to this directory (os.path.relpath(..., save_csvs)).
# The trailing slash is required: csv file names are appended by plain string concatenation.
save_csvs = 'data/'

In [3]:
def create_datasets(df, p_train, p_valid, p_test):
    """
    Splits a data set df into disjoint training, validation and testing data sets with relative
    cardinality p_train, p_valid and p_test, respectively.

    The testing set is drawn first, the validation set is drawn from the remaining rows, and
    everything that is left (in its original order) becomes the training set. Testing and
    validation sizes are int(p * number_of_rows), i.e. they are truncated, so rounding
    remainders end up in the training set. Both draws use the fixed seed random_state=42,
    which makes the split reproducible for a given row order of df.
    The split is done on row positions, so it also works if df.index contains duplicate labels.
    df itself is not modified.

    :param df: complete data set which should be split into training, validation and testing sets
    :type df: pd.DataFrame()
    :param p_train: relative cardinality of training data set
    :type p_train: float (\\elem [0,1])
    :param p_valid: relative cardinality of validation data set
    :type p_valid: float (\\elem [0,1])
    :param p_test: relative cardinality of testing data set
    :type p_test: float (\\elem [0,1])
    :return: training, validation and testing data sets
    :type: [pd.DataFrame, pd.DataFrame, pd.DataFrame]
    :raises AssertionError: if p_train, p_valid and p_test do not add up to 1
    """
    # make sure we have consistency. Compare with a tolerance instead of '==': sums of
    # binary floats such as 0.7 + 0.2 + 0.1 evaluate to 0.9999999999999999, not exactly 1.
    assert abs(p_train + p_valid + p_test - 1) < 1e-9, 'p_train, p_valid, p_test must add up to 1'
    n_rows = df.shape[0]
    # bookkeeping of row positions (index == value == position in df). Working on positions
    # instead of df.index labels keeps the three sets disjoint even for duplicate labels:
    positions = pd.Series(range(n_rows), dtype='int64')
    # sampling positions to get testing set:
    test_pos = positions.sample(n=int(p_test * n_rows), replace=False, random_state=42)
    # drop these sampled test positions (we do not want them in the other data sets):
    remaining = positions.drop(test_pos.index)
    # sampling positions to get validation set:
    valid_pos = remaining.sample(n=int(p_valid * n_rows), replace=False, random_state=42)
    # whatever is left over is the training set:
    train_pos = remaining.drop(valid_pos.index)
    return df.iloc[train_pos.values], df.iloc[valid_pos.values], df.iloc[test_pos.values]

## Data set iiw:

In [16]:
# Directory that holds the iiw images (*.png). The label path of an image is derived from
# the image path by swapping the extension for '.json':
data_dir_iiw = 'data/iiw-dataset/data/'

In [69]:
# Collect one row per iiw image: numeric file id (base name of the png), image path and
# label path (same path with '.json' extension). Both paths are relative to save_csvs.
iiw_rows = []
for png_file in glob.glob(data_dir_iiw + '/*.png'):
    rel_png = os.path.relpath(png_file, save_csvs)
    file_id = int(os.path.splitext(os.path.basename(png_file))[0])
    iiw_rows.append([file_id, rel_png, os.path.splitext(rel_png)[0] + '.json'])
df_iiw = pd.DataFrame(iiw_rows, columns=['file_id', 'image_path', 'label_path'])
# Order rows by file id and renumber the index (sorting is harmless because the data
# is shuffled during training in tf anyway):
df_iiw = df_iiw.sort_values(by='file_id').reset_index(drop=True)

In [71]:
# Split the iiw data into 80 % training, 10 % validation and 10 % testing data:
df_iiw_train, df_iiw_valid, df_iiw_test = create_datasets(
    df=df_iiw,
    p_train=0.8,
    p_valid=0.1,
    p_test=0.1,
)

In [73]:
# Write the complete iiw data set and its training / validation / testing splits to
# separate csv files (one 'image_path,label_path' row per image, no header, no index):
for csv_name, iiw_frame in [('data_iiw_complete.csv', df_iiw),
                            ('data_iiw_train.csv', df_iiw_train),
                            ('data_iiw_valid.csv', df_iiw_valid),
                            ('data_iiw_test.csv', df_iiw_test)]:
    iiw_frame.to_csv(path_or_buf=save_csvs + csv_name, sep=',',
                     columns=['image_path', 'label_path'], index=False, header=False)

## Sintel data set:

In [9]:
# Root directory of the MPI Sintel data. The cells below read the images from its
# 'training/clean/' and 'test/clean/' sub-directories:
data_dir_sintel = 'data/mpi-sintel-complete/'

In [83]:
# use 'clean pass' images (see narihira2015: p.3: "'final images' [...] are the result of additional computer
# graphics tricks which distract from our application."):
# glob returns files in arbitrary (file system dependent) order, so the file list is sorted
# to keep the row order -- and with it the seeded split of create_datasets -- reproducible.
# NOTE: without recursive=True the '**' in the pattern behaves like '*' (exactly one directory level).
sintel_rows = []
for png_file in sorted(glob.glob(data_dir_sintel + 'training/clean/**/*.png')):
    rel_png = os.path.relpath(png_file, save_csvs)
    # the albedo label lives under the same relative path with 'clean' replaced by 'albedo'
    # (str.replace substitutes every occurrence of 'clean' in the path):
    sintel_rows.append([rel_png, rel_png.replace('clean', 'albedo')])
df_sintel = pd.DataFrame(sintel_rows, columns=['image_path', 'label_path'])

In [85]:
# Split the mpi-sintel data into 80 % training, 10 % validation and 10 % testing data:
df_sintel_train, df_sintel_valid, df_sintel_test = create_datasets(
    df=df_sintel,
    p_train=0.8,
    p_valid=0.1,
    p_test=0.1,
)

In [87]:
# Write the complete sintel data set and its training / validation / testing splits to
# separate csv files (one 'image_path,label_path' row per image, no header, no index):
for csv_name, sintel_frame in [('data_sintel_complete.csv', df_sintel),
                               ('data_sintel_train.csv', df_sintel_train),
                               ('data_sintel_valid.csv', df_sintel_valid),
                               ('data_sintel_test.csv', df_sintel_test)]:
    sintel_frame.to_csv(path_or_buf=save_csvs + csv_name, sep=',',
                        columns=['image_path', 'label_path'], index=False, header=False)

In [98]:
# also save (unknown) test files, i.e. the images of 'test/clean/' for which no label path
# is recorded (label_path is None and ends up as an empty csv field).
# The file list is sorted because glob returns files in arbitrary order; this keeps the
# row order of the csv file reproducible.
sintel_unknown_rows = [[os.path.relpath(png_file, save_csvs), None]
                       for png_file in sorted(glob.glob(data_dir_sintel + 'test/clean/**/*.png'))]
df_sintel_test_unknown = pd.DataFrame(sintel_unknown_rows, columns=['image_path', 'label_path'])
df_sintel_test_unknown.to_csv(path_or_buf=save_csvs + 'data_sintel_test_unknown.csv', sep=',',
                              columns=['image_path', 'label_path'], index=False, header=False)

## MIT data set:

In [89]:
# Root directory of the MIT intrinsic images data. Its sub-directories are searched for
# 'original.png' files in the cell below:
data_dir_mit = 'data/mit_intrinsic/data/'

In [103]:
# Collect one row per MIT image ('original.png') together with its albedo and shading label
# paths, all relative to save_csvs.
# glob returns files in arbitrary (file system dependent) order, so the file list is sorted
# to keep the row order -- and with it the seeded split of create_datasets -- reproducible.
# NOTE: without recursive=True the '**' in the pattern behaves like '*' (exactly one directory level).
mit_rows = []
for png_file in sorted(glob.glob(data_dir_mit + '**/original.png')):
    rel_png = os.path.relpath(png_file, save_csvs)
    # label paths are the image path with 'original' replaced by 'reflectance' (albedo)
    # or 'shading' (str.replace substitutes every occurrence of 'original' in the path):
    mit_rows.append([rel_png,
                     rel_png.replace('original', 'reflectance'),
                     rel_png.replace('original', 'shading')])
df_mit = pd.DataFrame(mit_rows, columns=['image_path', 'albedo_label_path', 'shading_label_path'])

In [104]:
# Split the mit data into 80 % training, 10 % validation and 10 % testing data:
df_mit_train, df_mit_valid, df_mit_test = create_datasets(
    df=df_mit,
    p_train=0.8,
    p_valid=0.1,
    p_test=0.1,
)

In [106]:
# Write the complete mit data set and its training / validation / testing splits to separate
# csv files (one 'image_path,albedo_label_path,shading_label_path' row per image, no header,
# no index):
for csv_name, mit_frame in [('data_mit_complete.csv', df_mit),
                            ('data_mit_train.csv', df_mit_train),
                            ('data_mit_valid.csv', df_mit_valid),
                            ('data_mit_test.csv', df_mit_test)]:
    mit_frame.to_csv(path_or_buf=save_csvs + csv_name, sep=',',
                     columns=['image_path', 'albedo_label_path', 'shading_label_path'],
                     index=False, header=False)