In [None]:
%pylab inline
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as pl
import h5py
import os

import sys
sys.path.insert(0, '../')
from gantools.data.load import load_params_dataset
from gantools.data import transformation
from gantools import utils
from gantools import plot
from gantools.metric import stats

# Create dataset

Create the dataset from the individual files. The resulting big h5 file has two datasets, called "train_maps" and "train_labels", containing the maps and their corresponding parameters respectively. Moreover, the maps have to be grouped by cosmology, i.e. all maps that share the same parameters must be stored contiguously. E.g.:

map1 param1

map2 param1

...

map100 param1

map101 param2

map102 param2

...

In [None]:
# Directory that is listed and read file-by-file by the conversion loop below.
# NOTE: `path` is rebound to the preprocessed-data directory further down in
# this notebook, so re-running the conversion cell afterwards would read from
# the wrong place unless this cell is executed again first.
path = "/store/sdsc/sd01/cosmology/KiDs450_maps/maps/"
# Destination HDF5 file that the conversion loop writes/appends to.
fileout = "/scratch/snx3000/smarcon/preprocessed_data/kids.h5" # TODO: adjust

In [None]:
def get_params(filename):
    """Extract the two numeric parameters encoded in a map file name.

    The name is split on underscores and the fields at positions 3 and 4
    (zero-based) are converted to floats.

    Parameters
    ----------
    filename : str
        File name with at least five underscore-separated fields.

    Returns
    -------
    list of float
        ``[float(field_3), float(field_4)]``.

    Raises
    ------
    IndexError
        If the name has fewer than five underscore-separated fields.
    ValueError
        If either selected field is not a valid float literal.
    """
    fields = filename.split('_')
    return [float(fields[position]) for position in (3, 4)]

In [None]:
# Convert every file found under `path` into one HDF5 file at `fileout`.
# The listing is sorted lexicographically so the files are appended in a
# stable order (the text above requires maps of the same cosmology to be
# stored contiguously).
files = sorted(os.listdir(path))
first = True  # the first write overwrites `fileout`, later writes append
for fname in files:
    # os.path.join keeps this correct whether or not `path` ends with a slash.
    maps = np.load(os.path.join(path, fname))  # Load maps
    params = get_params(fname)  # Parse parameters from the file name
    # Repeat the parameter pair once per map -> array of shape (len(maps), 2).
    params = np.tile(np.array(params), [len(maps), 1])
    utils.append_h5(fileout, maps, params=params, overwrite=first)
    first = False

# Analyse dataset

In [None]:
# Load the (unsmoothed) dataset through the gantools loader.
# NOTE(review): 'kids.h5' is given without a directory — presumably resolved
# against gantools' data path; confirm it is the file written to `fileout`.
dataset = load_params_dataset('kids.h5', batch=12000, sorted=True, shape=[128, 128])

In [None]:
# Display `dataset.N` (presumably the number of samples — confirm).
dataset.N

In [None]:
# Parameter sets present in the dataset; indexed below as a 2-column array
# (column 0 and column 1 are plotted against each other).
diff_params = dataset.get_different_params()

In [None]:
# Print the shape of `diff_params`, then its values.
print(diff_params.shape)
print(diff_params)

In [None]:
# Scatter the two parameter columns of `diff_params` against each other.
pl.scatter(diff_params[:, 0], diff_params[:, 1])
# Raw strings: '\O' and '\s' are not valid Python escape sequences, so the
# non-raw form triggers an invalid-escape warning on recent Python versions.
# The resulting label text is unchanged.
pl.xlabel(r'$\Omega_M$', fontsize=14)
pl.ylabel(r'$\sigma_8$', fontsize=14)

In [None]:
# Value range of the dataset; used below as histogram limits and as the
# colour scale of the image grid.
vmin, vmax = utils.find_minmax(dataset)

In [None]:
# Histogram of the dataset values restricted to [vmin, vmax].
histo, x = utils.produce_histogram(dataset, lim=(vmin, vmax))

In [None]:
# Plot the histogram and report the value range it was computed over.
# NOTE(review): this goes through `plot.plot.plot_histogram` while other cells
# call `plot.plot_img` / `plot.plot_single` directly on the package — confirm
# both access paths exist.
plot.plot.plot_histogram(x, histo)
print('min: {}'.format(vmin))
print('max: {}'.format(vmax))

In [None]:
# Manually override the upper colour limit used by the image grid below
# (`vmin` keeps the measured minimum). Re-running the histogram cells after
# this one would use the overridden value as the upper limit.
vmax = 0.125

In [None]:
# Show 16 sample maps in a 4x4 grid, each titled with its parameters.
# The samples are fetched with a single call so that every image is paired
# with the parameters returned alongside it; fetching images and parameters
# through two separate get_samples calls does the work twice and can pair
# mismatched entries if sampling or the transform is not deterministic.
samples = dataset.get_samples(N=16)
imgs, params = samples[0], samples[1]
fig, ax = pl.subplots(nrows=4, ncols=4, figsize=(15,15))
# `ax.flat` walks the grid row by row, matching the sample order.
for idx, col in enumerate(ax.flat):
    plot.plot_img(imgs[idx], vmin=vmin, vmax=vmax, ax=col, title=params[idx])
fig.tight_layout()

# Smoothing

In [None]:
def smoothing(x):
    """Return `transformation.smooth(x, 2)`.

    Thin wrapper with the second argument fixed to 2 so that the function can
    be passed as a one-argument `transform` callable to the dataset loader.
    """
    smoothed = transformation.smooth(x, 2)
    return smoothed

In [None]:
# Reload the same file, this time with `smoothing` passed as the transform.
# NOTE: this rebinds `dataset`; every later cell that uses `dataset` (including
# the train/test export) works on this smoothed version when run in order.
dataset = load_params_dataset('kids.h5', batch=12000, sorted=True, shape=[128, 128], transform=smoothing)

In [None]:
# Recompute the value range for the smoothed dataset (replaces the earlier
# `vmin` / `vmax`, including the manual `vmax` override).
vmin, vmax = utils.find_minmax(dataset)

In [None]:
# Histogram of the smoothed dataset values restricted to [vmin, vmax].
histo, x = utils.produce_histogram(dataset, lim=(vmin, vmax))

In [None]:
# Plot the histogram of the smoothed data and report its value range.
# NOTE(review): accessed as `plot.plot.plot_histogram`, unlike the
# `plot.plot_img` / `plot.plot_single` calls elsewhere — confirm both exist.
plot.plot.plot_histogram(x, histo)
print('min: {}'.format(vmin))
print('max: {}'.format(vmax))

In [None]:
# Manually override the upper colour limit for the smoothed image grid below.
vmax = 0.1

In [None]:
# Show 16 smoothed sample maps in a 4x4 grid, each titled with its parameters.
# A single get_samples call supplies both images and parameters, so each title
# belongs to the image it is drawn on; two separate calls repeat the work and
# can pair mismatched entries if sampling or the transform is not
# deterministic.
samples = dataset.get_samples(N=16)
imgs, params = samples[0], samples[1]
fig, ax = pl.subplots(nrows=4, ncols=4, figsize=(15,15))
# `ax.flat` walks the grid row by row, matching the sample order.
for idx, col in enumerate(ax.flat):
    plot.plot_img(imgs[idx], vmin=vmin, vmax=vmax, ax=col, title=params[idx])
fig.tight_layout()

In [None]:
# Power spectra of the maps from 2000 samples of the current dataset.
maps_for_psd = dataset.get_samples(2000)[0]
psd, k = stats.power_spectrum_batch_phys(
    maps_for_psd,
    bin_k=50,
    box_l=(5*np.pi)/180,  # 5 degrees expressed in radians
    log_sampling=False,
    multiply=True,
)

In [None]:
# Plot the power spectra against `k`, with a shaded 'std' confidence band.
plot.plot_single(k, psd, confidence='std', shade=True)

# Divide into test and training set

Manual division

In [None]:
# Hand-picked parameter pairs to hold out as the test set. Column order must
# match `diff_params`, because each row is looked up there by its printed form.
# The "extr" markers are the original author's annotation
# (presumably "extreme"/edge-of-grid points — confirm).
test_params = [[0.137, 1.23],
               [0.196, 1.225], # extr
               [0.127, 0.836], # extr
               [0.25, 0.658],
               [0.311, 0.842],
               [0.199, 0.87],
               [0.254, 0.852],
               [0.312, 0.664],
               [0.356, 0.614],
               [0.421, 0.628],
               [0.487, 0.643]] # extr
test_params = np.array(test_params)

In [None]:
# Map the printed form of every row of `diff_params` to its row index.
params_map = {str(diff_params[row]): row for row in range(len(diff_params))}

In [None]:
# Mark the row index of every hand-picked test pair that exists in `params_map`.
# NOTE(review): the lookup compares str() of array rows, so a pair only matches
# when its printed form is identical to the one produced from `diff_params`;
# pairs without a match are skipped silently.
test_dic = {
    params_map[str(p)]: True
    for p in test_params
    if str(p) in params_map
}

Random division (alternative to the manual division above: running the following cells replaces `test_dic`)

In [None]:
# Number of distinct parameter sets to draw at random for the test set.
nr_test_params = 11

In [None]:
# Pick `nr_test_params` distinct row indices of `diff_params` uniformly at
# random. Sampling without replacement raises ValueError when more test sets
# are requested than exist, whereas a draw-and-retry loop would spin forever.
# NOTE: this rebinds `test_dic`, replacing any manual selection made earlier.
# No random seed is set, so the selection differs between runs.
chosen_rows = np.random.choice(len(diff_params), size=nr_test_params, replace=False)
# Plain ints as keys, so membership tests with range() indices behave as usual.
test_dic = {int(row): True for row in chosen_rows}

Create two sets and plot

In [None]:
# Split the rows of `diff_params` by membership of their index in `test_dic`,
# keeping the original row order inside each group.
row_indices = range(len(diff_params))
test_params = np.array([diff_params[i] for i in row_indices if i in test_dic])
train_params = np.array([diff_params[i] for i in row_indices if i not in test_dic])
print(test_params.shape)
print(train_params.shape)

In [None]:
# Training parameter sets in the default colour, test parameter sets in red.
pl.scatter(train_params[:, 0], train_params[:, 1])
pl.scatter(test_params[:, 0], test_params[:, 1], color='r')
# Raw strings: '\O' and '\s' are not valid Python escape sequences, so the
# non-raw form triggers an invalid-escape warning on recent Python versions.
# The resulting label text is unchanged.
pl.xlabel(r'$\Omega_M$', fontsize=14)
pl.ylabel(r'$\sigma_8$', fontsize=14)

In [None]:
# Output directory for the files written below.
# NOTE: this rebinds `path`, which earlier pointed at the raw maps directory;
# the trailing slash matters because file names are appended by concatenation.
path = '/scratch/snx3000/smarcon/preprocessed_data/'

In [None]:
# Persist the parameter split: one HDF5 dataset per group, 'train' then 'test'.
split_file = path + 'train_test_params_kids.h5'
with h5py.File(split_file, 'w') as f:
    for group_name, group_params in (('train', train_params), ('test', test_params)):
        f.create_dataset(group_name, data=group_params)

In [None]:
# Gather the data of every test parameter set into one file. The first write
# overwrites the file, later writes append to it.
# When the notebook is run in order, `dataset` here is the one loaded with
# transform=smoothing.
test_out = path + 'kids_test_smooth.h5'
for position, p in enumerate(test_params):
    X, par = dataset.get_data_for_params(p)
    utils.append_h5(test_out, X, par, overwrite=(position == 0))

In [None]:
# Gather the data of every training parameter set into one file. The first
# write overwrites the file, later writes append to it.
# When the notebook is run in order, `dataset` here is the one loaded with
# transform=smoothing.
train_out = path + 'kids_train_smooth.h5'
for position, p in enumerate(train_params):
    X, par = dataset.get_data_for_params(p)
    utils.append_h5(train_out, X, par, overwrite=(position == 0))

# Shuffle dataset

Shuffle training set. Note: this requires a lot of memory.

In [None]:
# Shuffle the training file into a new file.
# NOTE(review): the input is 'kids_train.h5', but the export cells of this
# notebook write 'kids_train_smooth.h5'. Confirm which file is intended — as
# written, this relies on a 'kids_train.h5' produced outside this run.
shuffle_src = path + 'kids_train.h5'
shuffle_dst = path + 'kids_train_shuffled.h5'
utils.shuffle_h5(shuffle_src, shuffle_dst)

# Regressor

Prepare data for regressor

In [None]:
# Load the shuffled training file (no `sorted` flag and no transform here).
# NOTE: rebinds `dataset` again; the regressor split below iterates over it.
dataset = load_params_dataset('kids_train_shuffled.h5', batch=12000, shape=[128, 128])

In [None]:
# Number of iterated samples between two flushes to disk in the split loop.
batch_size = 12000
# Probability with which each sample is assigned to the regressor test file.
test_prob = 0.2

In [None]:
# Output files of the regressor split; both are overwritten on the first flush.
train_file = '/scratch/snx3000/smarcon/preprocessed_data/kids_reg_train.h5'
test_file = '/scratch/snx3000/smarcon/preprocessed_data/kids_reg_test.h5'

In [None]:
# Stream the dataset once and route every sample at random to the regressor
# test file (probability `test_prob`) or train file. Samples are buffered in
# memory and flushed to disk every `batch_size` iterations.
# No random seed is set, so the split differs between runs.
first = True  # the first flush overwrites both files, later flushes append
X_test, p_test = [], []
X_train, p_train = [], []
idx = 0
for idx, b in enumerate(dataset, start=1):
    # Exactly one uniform draw per sample decides its destination.
    goes_to_test = np.random.rand() < test_prob
    X_buffer, p_buffer = (X_test, p_test) if goes_to_test else (X_train, p_train)
    X_buffer.append(b[0, 0])
    p_buffer.append(b[0, 1])
    if idx % batch_size != 0:
        continue
    # Periodic flush: both buffers are written, then emptied.
    utils.append_h5(test_file, np.array(X_test), np.array(p_test), overwrite=first)
    utils.append_h5(train_file, np.array(X_train), np.array(p_train), overwrite=first)
    first = False
    X_test, p_test = [], []
    X_train, p_train = [], []
# Final flush of whatever is left after the last full batch.
if X_test:
    utils.append_h5(test_file, np.array(X_test), np.array(p_test), overwrite=first)
if X_train:
    utils.append_h5(train_file, np.array(X_train), np.array(p_train), overwrite=first)