In [1]:
import os
from os.path import expanduser
home_dir = expanduser("~")
module_path = home_dir + '/modules/'
import sys
sys.path.append(module_path)
import time
import importlib
import random
import tensorflow as tf
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
import model_management
from scipy.special import comb
import datetime
import codecs, json
%load_ext autoreload
%autoreload 1
%aimport environmental_density
from environmental_density import get_density_periodic

np.random.seed(999)
random.seed(999)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
save_figs = 1
fig_dir = 'figures/'



In [3]:
if run_on_cpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

NameError: name 'run_on_cpu' is not defined

# Load and preprocess the data

In [4]:
def load_galfile(galfile_directory='/home/magnus/code/useful_code/special_functions/test_galcat_w_log_densities_3e5.h5'):
    # '/scratch/data/galcats/P200/galaxies.Z01.h5'
    galfile = pd.read_hdf(galfile_directory)
    galaxies = galfile.as_matrix()
    gal_header = galfile.keys().tolist()

    ### Remove data points with halo mass below 10.5
    galaxies = galaxies[galaxies[:,6] > 10.5, :]
    
    data_keys = {'X_pos': 0, 'Y_pos': 1, 'Z_pos': 2, 'X_vel': 3, 'Y_vel': 4, 'Z_vel': 5, 'Halo_mass': 6, 
             'Stellar_mass': 7, 'SFR': 8, 'Intra_cluster_mass': 9, 'Halo_mass_peak': 10, 'Stellar_mass_obs': 11, 
             'SFR_obs': 12, 'Halo_radius': 13, 'Concentration': 14, 'Halo_spin': 15, 'Scale_peak_mass': 16, 
             'Scale_half_mass': 17, 'Scale_last_MajM': 18, 'Type': 19, 'Environmental_density': 20}
    unit_dict = {'X_pos': '', 'Y_pos': '', 'Z_pos': '', 'X_vel': '', 'Y_vel': '', 
             'Z_vel': '', 'Halo_mass': 'log($M_{G}/M_{S}$)', 'Stellar_mass': 'log($M_{G}/M_{S}$)', 'SFR': '', 
             'Intra_cluster_mass': '', 'Halo_mass_peak': 'log($M_{G}/M_{S}$)', 
             'Stellar_mass_obs': '', 'SFR_obs': '', 'Halo_radius': '', 
             'Concentration': '', 'Halo_spin': '', 'Scale_peak_mass': 'a', 
             'Scale_half_mass': 'a', 'Scale_last_MajM': 'a', 'Type': '', 
             'Environmental_density': 'log($M_{G}/M_{S}/Mpc^3$)'}
    
    return galaxies, data_keys, unit_dict

In [5]:
def divide_train_data(galaxies, data_keys, input_features, output_features, total_set_size, train_size, test_size):
    
    n_data_points = galaxies.shape[0]
    subset_indices = np.random.choice(n_data_points, total_set_size, replace=False)
    train_indices = subset_indices[: int(train_size)]
    val_indices = subset_indices[int(train_size) : int(train_size+val_size)]
    test_indices = subset_indices[int(train_size+val_size) :]

    x_train = np.zeros((len(train_indices), len(input_features)))
    x_val = np.zeros((len(val_indices), len(input_features)))
    x_test = np.zeros((len(test_indices), len(input_features)))
    y_train = np.zeros((len(train_indices), len(output_features)))
    y_val = np.zeros((len(val_indices), len(output_features)))
    y_test = np.zeros((len(test_indices), len(output_features)))

    for i in range(len(input_features)):
        x_train[:,i] = galaxies[train_indices, data_keys[input_features[i]]]
        x_val[:,i] = galaxies[val_indices, data_keys[input_features[i]]]
        x_test[:,i] = galaxies[test_indices, data_keys[input_features[i]]]

    for i in range(len(output_features)):
        y_train[:,i] = galaxies[train_indices, data_keys[output_features[i]]]
        y_val[:,i] = galaxies[val_indices, data_keys[output_features[i]]]
        y_test[:,i] = galaxies[test_indices, data_keys[output_features[i]]]
        
    training_data_dict = {
        'x_train': x_train,
        'x_val': x_val,
        'x_test': x_test,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test        
    }
    
    return training_data_dict

In [11]:
def normalise_data(training_data_dict, norm):
    
    x_train = training_data_dict['x_train']
    x_val = training_data_dict['x_val']
    x_test = training_data_dict['x_test']
    y_train = training_data_dict['y_train']
    y_val = training_data_dict['y_val']
    y_test = training_data_dict['y_test']
    
    if norm == 'none':
        
        training_data_dict['norm'] = norm

    elif norm == 'zero_mean_unit_std':

        for i in range(np.size(x_train, 1)):
            x_data_means = np.mean(x_train, 0)
            x_data_stds = np.std(x_train, 0)

            x_train_norm = (x_train - x_data_means) / x_data_stds
            x_val_norm = (x_val - x_data_means) / x_data_stds
            x_test_norm = (x_test - x_data_means) / x_data_stds

        for i in range(np.size(y_train, 1)):
            y_data_means = np.mean(y_train, 0)
            y_data_stds = np.std(y_train, 0)

            y_train_norm = (y_train - y_data_means) / y_data_stds
            y_val_norm = (y_val - y_data_means) / y_data_stds
            y_test_norm = (y_test - y_data_means) / y_data_stds
            
        training_data_dict['norm'] = norm
            
        training_data_dict['x_data_means'] = x_data_means
        training_data_dict['x_data_stds'] = x_data_stds
        training_data_dict['y_data_means'] = y_data_means
        training_data_dict['y_data_stds'] = y_data_stds
        
        training_data_dict['x_train_norm'] = x_train_norm
        training_data_dict['x_val_norm'] = x_val_norm
        training_data_dict['x_test_norm'] = x_test_norm
        training_data_dict['y_train_norm'] = y_train_norm
        training_data_dict['y_val_norm'] = y_val_norm
        training_data_dict['y_test_norm'] = y_test_norm
        


    elif norm == 'zero_to_one':

        for i in range(np.size(x_train, 1)):
            x_data_max = np.max(x_train, 0)
            x_data_min = np.min(x_train, 0)

            x_train_norm = (x_train - x_data_min) / (x_data_max - x_data_min)
            x_val_norm = (x_val - x_data_min) / (x_data_max - x_data_min)
            x_test_norm = (x_test - x_data_min) / (x_data_max - x_data_min)

        for i in range(np.size(y_train, 1)):
            y_data_max = np.max(y_train, 0)
            y_data_min = np.min(y_train, 0)

            y_train_norm = (y_train - y_data_min) / (y_data_max - y_data_min)
            y_val_norm = (y_val - y_data_min) / (y_data_max - y_data_min)
            y_test_norm = (y_test - y_data_min) / (y_data_max - y_data_min)
            
        training_data_dict['norm'] = norm
            
        training_data_dict['x_data_max'] = x_data_max
        training_data_dict['x_data_min'] = x_data_min
        training_data_dict['y_data_max'] = y_data_max
        training_data_dict['y_data_min'] = y_data_min
        
        training_data_dict['x_train_norm'] = x_train_norm
        training_data_dict['x_val_norm'] = x_val_norm
        training_data_dict['x_test_norm'] = x_test_norm
        training_data_dict['y_train_norm'] = y_train_norm
        training_data_dict['y_val_norm'] = y_val_norm
        training_data_dict['y_test_norm'] = y_test_norm
       
    else:
        print('Incorrect norm provided: ', norm)    
        
    
    
    return training_data_dict

# Set parameter string

In [7]:
### Set name ending with parameters for figures to be saved
param_string = 'nLayers_%d_nNeurons_%d_actFun_%s_lossFunc_%s_nTrainSamples_%d_nEpochs_%d_batchSize_%d' % (
    nLayers, neuronsPerLayer, activationFunction, loss_function, train_size, nEpochs, batchSize)
print(param_string)

NameError: name 'nLayers' is not defined

In [None]:
print(np.mean(y_test_norm, 0))
print(np.std(y_test_norm, 0))
print(np.min(x_test_norm, 0))
print(np.max(x_test_norm, 0))

In [None]:
### Get a feel for the data
for i in range(len(input_features)):
    print(input_features[i],': min: %.2e, max: %.2e.' % (np.min(x_train[:,i]), np.max(x_train[:,i])))
for i in range(len(output_features)):
    print(output_features[i],': min: %.2e, max: %.2e.' % (np.min(y_train[:,i]), np.max(y_train[:,i])))

In [8]:
from keras import backend as K
def weighted_mse_1(y_true, y_pred):
    
    return K.mean(K.log(y_true+1.5) + K.square(y_pred - y_true), axis=-1)
loss_func_dict = {
    'mse': 'mse',
    'mae': 'mae',
    'weighted_mse_1': weighted_mse_1
}

In [None]:
### Visualisation for when we have 2 input features
%matplotlib notebook
input_feat_1 = 0
input_feat_2 = 1
output_feat = 1

fig = plt.figure(1, figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_train_norm[:500,input_feat_1], x_train_norm[:500,input_feat_2], 
           y_train_norm[:500,output_feat])
ax.set_xlabel('%s log($M_{H}/M_{S}$)' % (input_features[input_feat_1]))
ax.set_ylabel('%s log($M_{H}/M_{S}$)' % (input_features[input_feat_2]))
ax.set_zlabel('%s log($M_{G}/M_{S}$)' % (output_features[output_feat]))
plt.show()

# Load an existing model

In [None]:
### Search for the model that you want
importlib.reload(model_management)
search_dict = {
    'training_method': 'backprop'
}
[model_dicts, description_dicts] = model_management.SearchModel(search_dict, get_hits=True)
print(description_dicts)
print('\n')
for key in model_dicts:
    print(key)
    print(model_dicts[key])
    print('\n')


In [None]:
importlib.reload(model_management)
model, model_dict, description = model_management.LoadModel(search_dict, 1)

# Create a new model

In [9]:
### General parameters
nr_steps = 1e4
batch_size = 4e4
total_set_size = 300000 # how many examples will be used for training+validation+testing
train_size = 280000
val_size = 10000
test_size = 10000

norm = 'zero_mean_unit_std' # 'none',   'zero_mean_unit_std',   'zero_to_one'

input_features = ['Halo_mass', 'Scale_half_mass']
output_features = ['Stellar_mass', 'SFR']

nr_epochs = nr_steps * batch_size / train_size

### Network parameters
nLayers = 10
activationFunction = 'tanh'
neuronsPerLayer = 10
loss_function = 'mse' # 'mse', 'weighted_mse_1' 'mae'

In [12]:
# load the selected galaxyfile
galaxies, data_keys, unit_dict = load_galfile()
    
# prepare the training data
training_data_dict = divide_train_data(galaxies, data_keys, input_features, output_features, 
                                       total_set_size, train_size, test_size)
training_data_dict = normalise_data(training_data_dict, norm)

In [16]:
# create model
model = Sequential()
model.add(Dense(neuronsPerLayer, input_dim = len(input_features), activation = activationFunction))

for i in range(0, nLayers-1): # -1 because one layer is added automatically with the input layer
    model.add(Dense(neuronsPerLayer, activation = activationFunction))

model.add(Dense(len(output_features), activation = 'tanh'))

# Compile model
model.compile(loss=loss_func_dict[loss_function], metrics=['mse'], optimizer='adam')#, metrics=[loss_function])

# Fit the model
if training_data_dict['norm'] == 'none':
    history = model.fit(training_data_dict['x_train'], training_data_dict['y_train'], 
                        validation_data=(training_data_dict['x_val'], training_data_dict['y_val']), 
                        epochs=int(nr_epochs), batch_size=int(batch_size))
else:
    history = model.fit(training_data_dict['x_train_norm'] , training_data_dict['y_train_norm'], 
                        validation_data=(training_data_dict['x_val_norm'], training_data_dict['y_val_norm']), 
                        epochs=int(nr_epochs), batch_size=int(batch_size))

Train on 280000 samples, validate on 10000 samples
Epoch 1/1428
Epoch 2/1428
Epoch 3/1428
Epoch 4/1428
Epoch 5/1428
Epoch 6/1428
Epoch 7/1428
Epoch 8/1428
Epoch 9/1428
Epoch 10/1428
Epoch 11/1428
Epoch 12/1428
Epoch 13/1428
Epoch 14/1428
Epoch 15/1428
Epoch 16/1428
Epoch 17/1428
Epoch 18/1428
Epoch 19/1428
Epoch 20/1428
Epoch 21/1428
Epoch 22/1428
Epoch 23/1428
Epoch 24/1428
Epoch 25/1428
Epoch 26/1428
Epoch 27/1428
Epoch 28/1428
Epoch 29/1428
Epoch 30/1428
Epoch 31/1428
Epoch 32/1428
Epoch 33/1428
Epoch 34/1428
Epoch 35/1428
Epoch 36/1428
Epoch 37/1428
Epoch 38/1428
Epoch 39/1428
Epoch 40/1428
Epoch 41/1428
Epoch 42/1428
Epoch 43/1428
Epoch 44/1428
Epoch 45/1428
Epoch 46/1428
Epoch 47/1428
Epoch 48/1428


Epoch 49/1428
Epoch 50/1428
Epoch 51/1428
Epoch 52/1428
Epoch 53/1428
Epoch 54/1428
Epoch 55/1428
Epoch 56/1428
Epoch 57/1428
Epoch 58/1428
Epoch 59/1428
Epoch 60/1428
Epoch 61/1428
Epoch 62/1428
Epoch 63/1428
Epoch 64/1428
Epoch 65/1428
Epoch 66/1428
Epoch 67/1428
Epoch 68/1428
Epoch 69/1428
Epoch 70/1428
Epoch 71/1428
Epoch 72/1428
Epoch 73/1428
Epoch 74/1428
Epoch 75/1428
Epoch 76/1428
Epoch 77/1428
Epoch 78/1428
Epoch 79/1428
Epoch 80/1428
Epoch 81/1428
Epoch 82/1428
Epoch 83/1428
Epoch 84/1428
Epoch 85/1428
Epoch 86/1428
Epoch 87/1428
Epoch 88/1428
Epoch 89/1428
Epoch 90/1428
Epoch 91/1428
Epoch 92/1428
Epoch 93/1428
Epoch 94/1428
Epoch 95/1428


Epoch 96/1428
Epoch 97/1428
Epoch 98/1428
Epoch 99/1428
Epoch 100/1428
Epoch 101/1428
Epoch 102/1428
Epoch 103/1428
Epoch 104/1428
Epoch 105/1428
Epoch 106/1428
Epoch 107/1428
Epoch 108/1428
Epoch 109/1428
Epoch 110/1428
Epoch 111/1428
Epoch 112/1428
Epoch 113/1428
Epoch 114/1428
Epoch 115/1428
Epoch 116/1428
Epoch 117/1428
Epoch 118/1428
Epoch 119/1428
Epoch 120/1428
Epoch 121/1428
Epoch 122/1428
Epoch 123/1428
Epoch 124/1428
Epoch 125/1428
Epoch 126/1428
Epoch 127/1428
Epoch 128/1428
Epoch 129/1428
Epoch 130/1428
Epoch 131/1428
Epoch 132/1428
Epoch 133/1428
Epoch 134/1428
Epoch 135/1428
Epoch 136/1428
Epoch 137/1428
Epoch 138/1428
Epoch 139/1428
Epoch 140/1428
Epoch 141/1428
Epoch 142/1428

KeyboardInterrupt: 

# Do a batch run to see which input parameters gives the best score

In [None]:
core_input_features = ['Halo_mass']
tested_input_features = ['Halo_mass_peak', 'Concentration', 'Type', 'Scale_peak_mass', 'Scale_half_mass', 
                 'Scale_last_MajM', 'Environmental_density']
output_features = ['Stellar_mass', 'SFR']
nr_extra_params_list = [1, 2, 3]
nr_runs_per_comb = 10

nr_steps = 1e4
batch_size = 4e4

nr_epochs = nr_steps * batch_size / train_size
parameter_dictionary = {
    'fixed_input_features': core_input_features,
    'tested_input_features': tested_input_features,
    'output_features': output_features,
    'nr_extra_parameter_combinations': nr_extra_params_list,
    'nr_steps': [nr_steps],
    'batch_size': [batch_size],
    'nr_epochs': [nr_epochs],
    'nr_training_samples': [train_size],
    'nr_validation_samples': [val_size],
    'nr_test_samples': [test_size],
    'data_normalization': norm,
    'activation_function': activationFunction,
    'neurons_per_layer': [neuronsPerLayer],
    'nr_hidden_layers': [nLayers],
    'description': 'Each parameter setting is represented by one list containing three objects. The first one is ' + \
    'the input parameters. The second one is the mse test scores obtained for the different runs evaluated on the ' +\
    'original units of the data set. The third one is the loss histories for the different runs [training_loss, ' +\
    'validation_loss].'
}
results_list = [parameter_dictionary]
nr_combs_total = 0
for nr_extra_params in nr_extra_params_list:
    nr_combs_total += comb(len(tested_input_features), nr_extra_params)

with open('model_comparisons/progress.txt', 'w+') as f:
    
    date_string_proper = datetime.datetime.now().strftime("%H:%M, %Y-%m-%d")
    f.write('Benchmark done on input parameters at ' + date_string_proper + '\n\n')
    f.flush()
    
    # load the selected galaxyfile
    galaxies, data_keys, unit_dict = load_galfile()
    
    for i_nr_extra_params, nr_extra_params in enumerate(nr_extra_params_list):
        
        extra_param_combs = list(combinations(tested_input_features, nr_extra_params))
        
        date_string_proper = datetime.datetime.now().strftime("%H:%M, %Y-%m-%d")
        f.write(date_string_proper + '    Testing %d extra parameters. %d/%d extra parameter count tested. \n\n' %
                (nr_extra_params, i_nr_extra_params+1, len(nr_extra_params_list)))
        f.flush()
    
        for i_comb, param_comb in enumerate(extra_param_combs):
            input_features = core_input_features.copy()
            input_features.extend(param_comb)
            
            # prepare the training data
            training_data_dict = divide_train_data(galaxies, data_keys, input_features, output_features, 
                                                   total_set_size, train_size, test_size)
            training_data_dict = normalise_data(training_data_dict, norm)
            
            date_string_proper = datetime.datetime.now().strftime("%H:%M, %Y-%m-%d")
            f.write(date_string_proper + '        Testing combination %d/%d. \n\n' % (i_comb+1, nr_combs_total))
            f.flush()
            
            scores = []
            histories = []

            for i_run in range(nr_runs_per_comb):

                # create model
                model = Sequential()
                model.add(Dense(neuronsPerLayer, input_dim = len(input_features), activation = activationFunction))

                for i in range(0, nLayers-1): # -1 because one layer is added automatically with the input layer
                    model.add(Dense(neuronsPerLayer, activation = activationFunction))

                model.add(Dense(len(output_features), activation = activationFunction))

                # Compile model
                model.compile(loss=loss_func_dict[loss_function], metrics=['mse'], optimizer='adam')

                # Fit the model
                if norm == 'none':
                    history = model.fit(x_train_norm , y_train_norm, validation_data=(x_val_norm, y_val_norm), 
                                        epochs=int(nr_epochs), batch_size=int(batch_size), verbose=0)
                else:
                    history = model.fit(x_train , y_train, validation_data=(x_val, y_val), 
                                        epochs=int(nr_epochs), batch_size=int(batch_size), verbose=0)

                # Evaluate the model on test data
                if norm == 'zero_mean_unit_std_norm':
                    predicted_norm_points = model.predict(x_test_norm)
                    predicted_points = predicted_norm_points * y_data_stds + y_data_means

                elif norm == 'zero_to_one_norm':
                    predicted_norm_points = model.predict(x_test_norm)
                    predicted_points = predicted_norm_points * (y_data_max - y_data_min) + y_data_min
                    
                elif norm == 'none':
                    predicted_points = model.predict(x_test)
                    
                else:
                    print('Incorrect normalisation provided: ' + norm)
                    
                ### Get mse for the real predictions
                n_points, n_outputs = np.shape(predicted_points)
                x_minus_y = predicted_points - y_test

                feature_scores = np.sum(np.power(x_minus_y, 2), 0) / n_points
                total_score = np.sum(feature_scores) / n_outputs
                
                histories.append([history.history['loss'], history.history['val_loss']])
                scores.append(total_score)
                
            results_list.append([input_features, scores, histories])
            
    date_string_proper = datetime.datetime.now().strftime("%H:%M, %Y-%m-%d")
    f.write('Benchmark completed at ' + date_string_proper + '\n')
    f.flush()
        

In [None]:
print(results_list[1][2][0])

In [None]:
comb_nr = 5
title = results_list[comb_nr][0]
train_loss = results_list[comb_nr][2][0][0]
val_loss = results_list[comb_nr][2][0][1]
test_loss = results_list[comb_nr][1]
#print(train_loss)
print('Lowest train/val/test loss: %.2f, %.2f, %.2f' % (np.amin(train_loss), np.amin(val_loss), np.amin(test_loss)))

In [None]:
### Save the result
date_string = datetime.datetime.now().strftime('%Y-%m-%d--%H-%M-%S')
custom_string = 'aborted_long_run_may_10'
with open('model_comparisons/' + custom_string + '.json', 'w+') as f:
    json.dump(results_list, f)
f.close()


In [None]:
### Load a result
loaded_list_string = '2018-05-09--17-15-18'
with open('model_comparisons/' + loaded_list_string + '.json', 'r') as f:
    loaded_list = json.load(f)
f.close()
print(loaded_list)

# Plot loss history

In [None]:
# list all data in history
print(history.history.keys())

In [None]:
# summarize history for loss
%matplotlib inline
fig = plt.figure(5, figsize=(8,8))
plt.plot(train_loss, 'b')
plt.plot(val_loss, 'r')
plt.yscale('log')
plt.title(title)
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper right')
plt.show()

# Evaluate the model

In [None]:
# On preprocessed data
test_loss, test_mse = model.evaluate(x_test_norm, y_test_norm, verbose=0)
print('MSE for the processed data: %.4f' % (test_mse))

In [None]:
### Predict real value of points
if norm == 'zero_mean_unit_std_norm':
    predicted_norm_points = model.predict(x_test_norm)
    predicted_points = predicted_norm_points * y_data_stds + y_data_means
    
if norm == 'zero_to_one_norm':
    predicted_norm_points = model.predict(x_test_norm)
    predicted_points = predicted_norm_points * (y_data_max - y_data_min) + y_data_min


In [None]:
### Get mse for the real predictions
n_points = np.shape(predicted_points)[0]
x_minus_y = predicted_points - y_test

feature_scores = np.sum(np.power(x_minus_y, 2), 0) / n_points
total_score = np.sum(feature_scores) / 2

print('MSE for the unprocessed data: %.4f' % (total_score))

# Save a model

In [None]:
### Save the model if it is useful
importlib.reload(model_management)
model_dictionary = {
    'training_method': 'backprop',
    'input_features': input_features,
    'output_features': output_features,
    'number_of_epochs': nEpochs,
    'batch_size': batchSize,
    'number_of_layers': nLayers,
    'neurons_per_layer': neuronsPerLayer,
    'activation_function': activationFunction,
    'train_set_size': train_size,
    'loss_function': loss_function,
    'test_loss': test_loss,
    'test_mse': test_mse,
    'preprocess_data': preprocess_data
}
description = 'First network trained on preprocessed data.'
model_management.SaveModel(model, model_dictionary, description)

In [None]:
%matplotlib notebook
#x1 = np.linspace(np.min(x_test[:,0]), np.max(x_test[:,0]), 30)
#x2 = np.linspace(np.min(x_test[:,1]), np.max(x_test[:,1]), 30)
#X1, X2 = np.meshgrid(x1, x2)
#Z = np.zeros(X1.shape)
#for i in range(30):
#    for j in range(30):
#        Z[i, j] = model.predict(np.array([X1[i,j], X2[i,j]])) TODO varför funkar inte det här??
        
#fig = plt.figure(4)
#ax = plt.axes(projection='3d')
#ax.contour3D(X, Y, Z, 50, cmap='binary')
#ax.set_xlabel('x')
#ax.set_ylabel('y')
#ax.set_zlabel('z')
        
### Old visualisation way
### Visualisation of prediction strength for when we have 2 input features
if plot_threeD and len(input_features) == 2:
    predictedY = model.predict(x_test_norm)
    predictedY = predictedY * y_data_stds + y_data_means
    fig = plt.figure(2, figsize=(8,8))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(x_test[:,0], x_test[:,1], 
               y_test[:,0], s=3)
    ax.scatter(x_test[:,0], x_test[:,1], 
               predictedY, s=3)
    ax.set_xlabel('%s log($M_{H}/M_{S}$)' % (input_features[0]))
    ax.set_ylabel('%s log($M_{H}/M_{S}$)' % (input_features[1]))
    ax.set_zlabel('%s log($M_{G}/M_{S}$)' % (output_features[0]))

# Scatterplots and boxplots

In [None]:
%matplotlib inline
#from pylab import plot, show, savefig, xlim, figure, \
#                hold, ylim, legend, boxplot, setp, axes
nBins = 8
bin_edges = np.linspace(halo_min_mass, halo_max_mass, nBins+1)

predictedY = predicted_points

for i, feat in enumerate(output_features):
    
    
    ### Plot 1
    fig = plt.figure(figsize=(8,8))
    
    plt.plot(y_test[:,i], y_test[:,i], 'k.')
    plt.plot(predictedY[:,i], y_test[:,i], 'g.')
    plt.ylabel('True %s %s' % (feat, unit_dict[feat]), fontsize=15)
    plt.xlabel('Predicted %s %s' % (feat, unit_dict[feat]), fontsize=15)
    plt.legend(['Ideal result', 'predicted ' + feat], loc='upper center')
    plt.title('nEpochs: %d, batch size: %d, training set size: %d, test mse score: %.2e\n' % (nEpochs, 
        batchSize, train_size, test_mse) + 
        '%d input feature(s): [%s]\n%d output feature(s): [%s]\n%d test data points (test) shown' % (
        len(input_features), ', '.join(input_features), len(output_features), ', '.join(output_features),
        test_size), y=1.03, fontsize=20)
    plt.show
        
    if save_figs:
        fig.savefig(fig_dir+'bp_output_scatter_%d_plot_from_' % (i+1)+'_and_'.join(input_features)+'_to_'+
            '_and_'.join(output_features)+'_with_'+param_string+'.png', bbox_inches = 'tight')
    
    ### Plot 2 - boxplot
    
    # bin_means contain (0: mean of the binned values, 1: bin edges, 2: numbers pointing each example to a bin)
    bin_means_true = stats.binned_statistic(x_test[:,i], y_test[:,i], bins=bin_edges)
    bin_means_pred = stats.binned_statistic(x_test[:,i], predictedY[:,i].flatten(), bins=bin_edges)
    bin_centers = []
    for iBin in range(nBins):
        bin_center = (bin_means_true[1][iBin] + bin_means_true[1][iBin+1]) / 2
        bin_centers.append('%.2f' % (bin_center))
    sorted_true_y_data = []
    sorted_pred_y_data = []
    for iBin in range(1,nBins+1):
        sorted_true_y_data.append(y_test[bin_means_true[2] == iBin, i])
        sorted_pred_y_data.append(predictedY[bin_means_pred[2] == iBin,i])
        
    fig = plt.figure(figsize=(16,8))
    ax = plt.subplot(111)

    bin_pos = np.array([-2,-1]) # (because this makes it work)
    x_label_centers = []
    for iBin in range(nBins):
        # Every boxplot adds 2 boxes, one from the true data and one from the predicted data
        bin_pos += 3 
        plt.boxplot([sorted_true_y_data[iBin], sorted_pred_y_data[iBin]] , positions = bin_pos, widths = 0.9)
        x_label_centers.append(np.mean(bin_pos))
    
    plt.ylabel('%s %s' % (feat, unit_dict[feat]), fontsize=15)
    plt.xlabel('True Halo mass log($M_{G}/M_{S}$)', fontsize=15)
    ax.set_xlim(left=x_label_centers[0]-2, right=x_label_centers[-1]+2)
    #xlim(0,bin_pos[1] + 1)
    plt.xticks(x_label_centers, bin_centers)
    plt.text(12,7,'Left: true data. Right: predicted data.', fontsize=20)
    
    if feat == 'SFR':
        ax.axhline(y=0, linestyle='--')
    
    plt.title('nEpochs: %d, batch size: %d, training set size: %d, test mse score: %.2e\n' % (nEpochs, 
        batchSize, train_size, test_mse) + 
        '%d input feature(s): [%s]\n%d output feature(s): [%s]\n%d test data points (test) shown' % (
        len(input_features), ', '.join(input_features), len(output_features), ', '.join(output_features),
        test_size), y=1.03, fontsize=20)
    
    plt.show()
    
    if save_figs:
        fig.savefig(fig_dir+'bp_output_boxplot_%d_from_' % (i+1)+'_and_'.join(input_features)+'_to_'+
            '_and_'.join(output_features)+'_with_'+param_string+'.png', bbox_inches = 'tight')

# Scatterplots and plots with errorbars

In [None]:
%matplotlib inline
#from pylab import plot, show, savefig, xlim, figure, \
#                hold, ylim, legend, boxplot, setp, axes
nBins = 8
bin_edges = np.linspace(halo_min_mass, halo_max_mass, nBins+1)

predictedY = predicted_points

for i, feat in enumerate(output_features):
    
    
    ### Plot 1
    fig = plt.figure(figsize=(8,8))
    
    plt.plot(y_test[:,i], y_test[:,i], 'k.')
    plt.plot(predictedY[:,i], y_test[:,i], 'g.')
    plt.ylabel('True %s %s' % (feat, unit_dict[feat]), fontsize=15)
    plt.xlabel('Predicted %s %s' % (feat, unit_dict[feat]), fontsize=15)
    plt.legend(['Ideal result', 'predicted ' + feat], loc='upper center')
    plt.title('nEpochs: %d, batch size: %d, training set size: %d, test mse score: %.2e\n' % (nEpochs, 
        batchSize, train_size, test_mse) + 'loss function: %s\n' % (loss_function) +
        '%d input feature(s): [%s]\n%d output feature(s): [%s]\n%d data points (test) shown' % (
        len(input_features), ', '.join(input_features), len(output_features), ', '.join(output_features),
        test_size), y=1.03, fontsize=20)
    plt.show
        
    if save_figs:
        fig.savefig(fig_dir+'bp_output_scatter_%d_plot_from_' % (i+1)+'_and_'.join(input_features)+'_to_'+
            '_and_'.join(output_features)+'_with_'+param_string+'.png', bbox_inches = 'tight')
    
    ### Plot 2 - boxplot
    
    # bin_means contain (0: mean of the binned values, 1: bin edges, 2: numbers pointing each example to a bin)
    bin_means_true = stats.binned_statistic(x_test[:,i], y_test[:,i], bins=bin_edges)
    bin_means_pred = stats.binned_statistic(x_test[:,i], predictedY[:,i].flatten(), bins=bin_edges)
    bin_centers = []
    for iBin in range(nBins):
        bin_center = (bin_means_true[1][iBin] + bin_means_true[1][iBin+1]) / 2
        bin_centers.append('%.2f' % (bin_center))
    sorted_true_y_data = []
    sorted_pred_y_data = []
    for iBin in range(1,nBins+1):
        sorted_true_y_data.append(y_test[bin_means_true[2] == iBin, i])
        sorted_pred_y_data.append(predictedY[bin_means_pred[2] == iBin,i])
    
    # get standard deviations of the binned values
    stds_true = np.zeros((nBins))
    stds_pred = np.zeros((nBins))
    for iBin in range(nBins):
        stds_true[iBin] = np.std(sorted_true_y_data[iBin])
        stds_pred[iBin] = np.std(sorted_pred_y_data[iBin])
        
    fig = plt.figure(figsize=(16,8))
    ax = plt.subplot(111)

    bin_pos = np.array([-2,-1]) # (because this makes it work)
    x_label_centers = []
    for iBin in range(nBins):
        # Every plot adds 2 distributions, one from the true data and one from the predicted data
        bin_pos += 3 
        plt.errorbar(bin_pos[0], bin_means_true[0][iBin], yerr=stds_true[iBin], fmt = 'bo', capsize=5)
        plt.errorbar(bin_pos[1], bin_means_pred[0][iBin], yerr=stds_pred[iBin], fmt = 'ro', capsize=5)
        x_label_centers.append(np.mean(bin_pos))
    
    plt.ylabel('%s %s' % (feat, unit_dict[feat]), fontsize=15)
    plt.xlabel('True Halo mass log($M_{G}/M_{S}$)', fontsize=15)
    plt.legend(['True data $\pm 1 \sigma$', 'Predicted data $\pm 1 \sigma$'], loc='upper left', fontsize='xx-large')
    ax.set_xlim(left=x_label_centers[0]-2, right=x_label_centers[-1]+2)
    #xlim(0,bin_pos[1] + 1)
    plt.xticks(x_label_centers, bin_centers)
    
    plt.title('nEpochs: %d, batch size: %d, training set size: %d, test mse score: %.2e\n' % (nEpochs, 
        batchSize, train_size, test_mse) + 'loss function: %s\n' % (loss_function) +
        '%d input feature(s): [%s]\n%d output feature(s): [%s]\n%d data points (test) shown' % (
        len(input_features), ', '.join(input_features), len(output_features), ', '.join(output_features),
        test_size), y=1.03, fontsize=20)
    
    plt.show()
    
    if save_figs:
        fig.savefig(fig_dir+'bp_output_boxplot_%d_from_' % (i+1)+'_and_'.join(input_features)+'_to_'+
            '_and_'.join(output_features)+'_with_'+param_string+'.png', bbox_inches = 'tight')

In [None]:
#print(predictedY.flatten())
#bin_means = stats.binned_statistic(predictedY.flatten(), y_test[:,0], bins=10)
#bin_stds = stats.binned_statistic(predictedY.flatten(), y_test[:,0], bins=10, statistic=GetSTD)
#print(bin_means[0])
#print(bin_stds[0])

# Plot SFR vs Stellar mass

In [None]:
predictedY = model.predict(x_test_norm)
predictedY = predictedY * y_data_stds + y_data_means

fig = plt.figure(figsize=(12,8))
ax = plt.subplot(121)

plt.plot(predictedY[:,0], predictedY[:,1], 'b.', markersize=2)
plt.ylabel('Predicted SFR %s' % (unit_dict['SFR']), fontsize=15)
plt.xlabel('Predicted Stellar Mass %s' % (unit_dict['Stellar_mass']), fontsize=15)
ymin, ymax = ax.get_ylim()
ax.set_ylim(bottom=ymin, top=ymax)

ax = plt.subplot(122)
plt.plot(y_test[:,0], y_test[:,1], 'k.', markersize=2)
plt.ylabel('True SFR %s' % (unit_dict['SFR']), fontsize=15)
plt.xlabel('True Stellar Mass %s' % (unit_dict['Stellar_mass']), fontsize=15)
ax.set_ylim(bottom=ymin, top=ymax)
plt.suptitle('nEpochs: %d, batch size: %d, training set size: %d, test mse score: %.2e\n' % (nEpochs, 
    batchSize, train_size, total_score) + 'loss function: %s\n' % (loss_function) +
    '%d input feature(s): [%s]\n%d output feature(s): [%s]\n%d data points (test) shown' % (
    len(input_features), ', '.join(input_features), len(output_features), ', '.join(output_features),
    test_size), y=1.17, fontsize=20)
plt.tight_layout()
plt.show

In [None]:
### Save the figure
fig.savefig(fig_dir+'bp_sfr_to_stellar_mass_inputs_' + '_and_'.join(input_features)+
            '_with_'+param_string+'.png', bbox_inches = 'tight')

# Plot loss history

In [None]:
# list all data in history
print(history.history.keys())

In [None]:
# summarize history for loss
%matplotlib inline
fig = plt.figure(5, figsize=(8,8))
plt.plot(history.history['loss'], 'b')
plt.plot(history.history['val_loss'], 'r')
plt.yscale('log')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper right')
plt.show()

# Check to see how the MSE is calculated 

In [None]:
predicted_points = model.predict(x_test)
print(np.shape(predicted_points))
n_points = np.shape(predicted_points)[0]
x_minus_y = predicted_points - y_test

feature_scores = np.sum(np.power(x_minus_y, 2), 0) / n_points
total_score = np.sum(feature_scores) / 2

print(total_score)

keras_scores = model.evaluate(x_test, y_test, verbose=0)
print(keras_scores)

In [None]:
# summarize history for loss
%matplotlib inline
fig = plt.figure(5, figsize=(8,8))
plt.plot(history.history['loss'], 'b')
plt.plot(history.history['val_loss'], 'r')
plt.yscale('log')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper right')
plt.show()

# Plot loss history

In [None]:
# list all data in history
print(history.history.keys())

In [None]:
# summarize history for loss
%matplotlib inline
fig = plt.figure(5, figsize=(8,8))
plt.plot(history.history['loss'], 'b')
plt.plot(history.history['val_loss'], 'r')
plt.yscale('log')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper right')
plt.show()

# Plot loss history

In [None]:
# list all data in history
print(history.history.keys())

# Testing functions

In [None]:
### TESTING
coordinates = galaxies[:1000, :3]
halo_masses = np.power(10, galaxies[:1000, 6])
nr_points = np.shape(coordinates)[0]

nr_neighbours_wanted = 30
box_sides = np.array([200, 200, 200])

neigh_densities = get_density_periodic(coordinates, halo_masses, nr_neighbours_wanted, 
                                                                 box_sides, nr_points, verbatim=True)
print(neigh_densities)