In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
from os.path import expanduser
home_dir = expanduser("~")
module_path = home_dir + '/code/modules/'
models_path = home_dir + '/models/'
import sys
sys.path.append(module_path)
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
from scipy import stats
import json
import pandas as pd
%load_ext autoreload
%autoreload 1
%aimport data_processing
from data_processing import *

## Construct mock SMF

In [None]:
original_directory = '/home/magnus/data/galcats_nonzero_sfr_no_density_with_growth_rate_no_lastMajM/'
destination_directory = '/home/magnus/data/mock_data/stellar_mass_functions/'

redshifts = [0, .1, .2, .5, 1, 2, 3, 4, 6, 8]
bin_width = .1
error = .3

for redshift in redshifts:

    file_name = 'galaxies.Z{:02.0f}'.format(redshift*10)

    galfile = pd.read_hdf(original_directory + file_name + '.h5')
    galaxies = galfile.values
    gal_header = galfile.keys().tolist()

    data_keys = {}
    for col_nr, key in enumerate(gal_header):
        data_keys[key] = col_nr

    # Remove the smaller haloes
    galaxies = galaxies[galaxies[:,data_keys['Halo_mass']] > 10.5, :]

  #  print(np.amax(galaxies[:, data_keys['Halo_mass']]))
  #  print(np.amin(galaxies[:, data_keys['Halo_mass']]))

    max_stellar_mass = np.amax(galaxies[:, data_keys['Stellar_mass']])
    min_stellar_mass = np.amin(galaxies[:, data_keys['Stellar_mass']])
    lower_bin_edge = np.floor(min_stellar_mass * 10)/10
    upper_bin_edge = np.ceil(max_stellar_mass* 10)/10
   # print(min_stellar_mass, max_stellar_mass)
   # print(lower_bin_edge, upper_bin_edge)

    bin_edges = np.arange(lower_bin_edge, upper_bin_edge + bin_width, bin_width)
    n_bins = len(bin_edges)-1
    bin_stats = stats.binned_statistic(galaxies[:, data_keys['Stellar_mass']], galaxies[:, data_keys['Stellar_mass']], 
                                       bins=bin_edges)

    bin_counts = [np.sum(bin_stats[2] == i) for i in range(1, n_bins+1)]

    bin_centers = [(bin_edges[i] + bin_edges[i+1])/2 for i in range(len(bin_edges)-1)]
  #  print(bin_centers)

    #plt.hist(galaxies[:, data_keys['Stellar_mass']], bins = bin_edges, density = False, label = 'Stellar_mass')
  #  plt.scatter(bin_centers, bin_counts)

  #  print(bin_counts)
    bin_counts = [float('nan') if count == 0 else count for count in bin_counts]
    bin_counts_arr = np.array(bin_counts, dtype=np.float)
    bin_counts_per_mpc3 = bin_counts_arr / 200**3 / bin_width
    bin_counts_per_mpc3[bin_counts_per_mpc3 > 0] = np.log10(bin_counts_per_mpc3[bin_counts_per_mpc3 > 0])
  #  print(bin_counts_per_mpc3)

    full_data_list = [[stell_mass, phi, error] for (stell_mass, phi) in zip(bin_centers, bin_counts_per_mpc3) 
                      if (not np.isnan(phi)) ]
    full_data_list.insert(0, {'bin_widths': bin_width, 'bin_edges': bin_edges.tolist()})
  #  print(full_data_list)

    with open(destination_directory + file_name + '.json', 'w+') as f:
        json.dump(full_data_list, f)
    f.close()

In [None]:
### Try loading a data file
file_name = 'galaxies.Z00'
with open(destination_directory + file_name + '.json', 'r') as f:
    test = json.load(f)
f.close()
print(test)

## Construct mock SSFR

In [None]:
original_directory = '/home/magnus/data/galcats_nonzero_sfr_no_density_with_growth_rate_no_lastMajM/'
destination_directory = '/home/magnus/data/mock_data/ssfr/'
redshifts = [0, .1, .2, .5, 1, 2, 3, 4, 6, 8]

for redshift in redshifts:

    file_name = 'galaxies.Z{:02.0f}'.format(redshift*10)
    
    bin_width = .2
    error = .3

    galfile = pd.read_hdf(original_directory + file_name + '.h5')
    galaxies = galfile.values
    gal_header = galfile.keys().tolist()

    data_keys = {}
    for col_nr, key in enumerate(gal_header):
        data_keys[key] = col_nr

    # Remove the smaller haloes
    galaxies = galaxies[galaxies[:,data_keys['Halo_mass']] > 10.5, :]

    # print(np.amax(galaxies[:, data_keys['Halo_mass']]))
    # print(np.amin(galaxies[:, data_keys['Halo_mass']]))

    sfr_log = galaxies[:, data_keys['SFR']]
    sfr = np.power(10, sfr_log)
    # print(np.amax(sfr))
    # print(np.amin(sfr))

    stellar_mass_log = galaxies[:, data_keys['Stellar_mass']]
    stellar_mass = np.power(10, stellar_mass_log)
    # print('{:.3e}'.format(np.amax(stellar_mass)))
    # print('{:.3e}'.format(np.amin(stellar_mass)))

    ssfr = sfr / stellar_mass
    ssfr_log = np.log10(ssfr)

    max_stellar_mass = np.amax(stellar_mass_log)
    min_stellar_mass = np.amin(stellar_mass_log)
    lower_bin_edge = np.floor(min_stellar_mass * 10)/10
    upper_bin_edge = np.ceil(max_stellar_mass* 10)/10
    # print(min_stellar_mass, max_stellar_mass)
    # print(lower_bin_edge, upper_bin_edge)
    if (upper_bin_edge - lower_bin_edge) % bin_width > 1e-5 and (upper_bin_edge - lower_bin_edge) % bin_width < bin_width - 1e-5:
    #     print((upper_bin_edge - lower_bin_edge) % bin_width)
    #     print('hej')
        upper_bin_edge += (upper_bin_edge - lower_bin_edge) % bin_width
    # print(lower_bin_edge, upper_bin_edge)

    bin_edges = np.arange(lower_bin_edge, upper_bin_edge + bin_width, bin_width)
    n_bins = len(bin_edges)-1
    bin_stats_means = stats.binned_statistic(stellar_mass_log, ssfr_log, bins=bin_edges, statistic='mean')
    bin_stats_stds = stats.binned_statistic(stellar_mass_log, ssfr_log, bins=bin_edges, statistic=np.std)
    bin_means = bin_stats_means[0]
    bin_stds = bin_stats_stds[0]
    # print(bin_means)
    # print(bin_stds)

    bin_centers = [(bin_edges[i] + bin_edges[i+1])/2 for i in range(len(bin_edges)-1)]
    # print(bin_edges)
    # print(bin_centers)

    # plt.errorbar(bin_centers, bin_means, yerr=bin_stds, fmt = 'bo')

    full_data_list = [[stell_mass, mean_ssfr_log, error] for (stell_mass, mean_ssfr_log) in zip(bin_centers, bin_means)]
    full_data_list.insert(0, {'bin_widths': bin_width, 'bin_edges': bin_edges.tolist()})
    # print(full_data_list)

    with open(destination_directory + file_name + '.json', 'w+') as f:
        json.dump(full_data_list, f)
    f.close()

## Construct mock fraction of quenched galaxies

In [12]:
original_directory = '/home/magnus/data/galcats_nonzero_sfr_no_density_with_growth_rate_no_lastMajM/'
destination_directory = '/home/magnus/data/mock_data/fq/'
redshifts = [0, .1, .2, .5, 1, 2, 3, 4, 6, 8]

np.seterr(invalid='raise')

for i_red, redshift in enumerate(redshifts):
    scale_factor = 1 / (1 + redshift)

    h_0 = 67.81 / (3.09e19) # 1/s
    h_0 = h_0 * 60 * 60 * 24 * 365 # 1/yr
    h_r = h_0 * np.sqrt(1e-3*scale_factor**(-4) + 0.308*scale_factor**(-3) + 0*scale_factor**(-2) + 0.692)
    ssfr_cutoff = 0.3*h_r
    log_ssfr_cutoff = np.log10(ssfr_cutoff)

    file_name = 'galaxies.Z{:02.0f}'.format(redshift*10)

    bin_width = .25
    error = .3

    galfile = pd.read_hdf(original_directory + file_name + '.h5')
    galaxies = galfile.values
    gal_header = galfile.keys().tolist()

    data_keys = {}
    for col_nr, key in enumerate(gal_header):
        data_keys[key] = col_nr

    # Remove the smaller haloes
    galaxies = galaxies[galaxies[:,data_keys['Halo_mass']] > 10.5, :]

#     print(np.amax(galaxies[:, data_keys['Halo_mass']]))
#     print(np.amin(galaxies[:, data_keys['Halo_mass']]))

    sfr_log = galaxies[:, data_keys['SFR']]
    sfr = np.power(10, sfr_log)
    # print(np.amax(sfr))
    # print(np.amin(sfr))

    stellar_mass_log = galaxies[:, data_keys['Stellar_mass']]
    stellar_mass = np.power(10, stellar_mass_log)
    # print('{:.3e}'.format(np.amax(stellar_mass)))
    # print('{:.3e}'.format(np.amin(stellar_mass)))

    ssfr = sfr / stellar_mass

    # print(np.min(ssfr), np.max(ssfr))
    # print(h_r)

    ssfr_log = np.log10(ssfr)

    max_stellar_mass = np.amax(stellar_mass_log)
    min_stellar_mass = np.amin(stellar_mass_log)
    lower_bin_edge = np.floor(min_stellar_mass * 1/bin_width)*bin_width
    upper_bin_edge = np.ceil(max_stellar_mass * 1/bin_width)*bin_width
#     print(min_stellar_mass, max_stellar_mass)
#     print(lower_bin_edge, upper_bin_edge)
    if (upper_bin_edge - lower_bin_edge) % bin_width > 1e-5 and (upper_bin_edge - lower_bin_edge) % bin_width < bin_width - 1e-5:
    #     print((upper_bin_edge - lower_bin_edge) % bin_width)
        upper_bin_edge += (upper_bin_edge - lower_bin_edge) % bin_width
#     print(lower_bin_edge, upper_bin_edge)

    bin_edges = np.arange(lower_bin_edge, upper_bin_edge + bin_width, bin_width)
    n_bins = len(bin_edges)-1
    bin_means, bin_edges, bin_numbers = stats.binned_statistic(stellar_mass_log, ssfr_log, bins=bin_edges, statistic='mean')

    bin_fqs = []
    for bin_num in range(1, n_bins+1):
        try:
            fq = np.sum(ssfr_log[bin_numbers == bin_num] < log_ssfr_cutoff) / len(ssfr_log[bin_numbers == bin_num])
        except:
            if len(ssfr_log[bin_numbers == bin_num]) == 0:
                print('bin number {:d} contains 0 points'.format(bin_num))
                fq = 0
            else:
                print('some other error')
        bin_fqs.append(fq)

    bin_centers = [(bin_edges[i] + bin_edges[i+1])/2 for i in range(len(bin_edges)-1)]
#     fig = plt.figure(i_red, figsize=(10,7))
#     ax = plt.gca()
#     plt.plot(bin_centers, bin_fqs)
#     ax.set_ylim(bottom=0)
#     ax.set_ylim(top=1)
# #     ax.set_xlim(left=8)
# #     ax.set_xlim(right=12)
#     plt.title('Fraction of quenched galaxies at redshift {:.1f}'.format(redshift))
# #     fig.savefig('fq_Z{:02.0f}.png'.format(redshift*10), bbox_inches = 'tight')
#     plt.show()

# print(bin_edges)
# print(bin_centers)

# plt.errorbar(bin_centers, bin_means, yerr=bin_stds, fmt = 'bo')

    full_data_list = [[stell_mass, fq, error] for (stell_mass, fq) in zip(bin_centers, bin_fqs)]
    full_data_list.insert(0, {'bin_widths': bin_width, 'bin_edges': bin_edges.tolist()})

    with open(destination_directory + file_name + '.json', 'w+') as f:
        json.dump(full_data_list, f)
    f.close()

### Mock stellar mass halo mass relationship

In [None]:
original_directory = '/home/magnus/data/galcats_nonzero_sfr_no_density_with_growth_rate_no_lastMajM/'
destination_directory = '/home/magnus/data/mock_data/stellar_halo_mass_relations/'
redshifts = [0, .1, .2, .5, 1, 2, 3, 4, 6, 8]

np.seterr(invalid='raise')

for i_red, redshift in enumerate(redshifts):

    file_name = 'galaxies.Z{:02.0f}'.format(redshift*10)

    bin_width = .2
    error = .3

    galfile = pd.read_hdf(original_directory + file_name + '.h5')
    galaxies = galfile.values
    gal_header = galfile.keys().tolist()

    data_keys = {}
    for col_nr, key in enumerate(gal_header):
        data_keys[key] = col_nr

    # Remove the smaller haloes
    galaxies = galaxies[galaxies[:,data_keys['Halo_mass']] > 10.5, :]

    halo_mass = galaxies[:, data_keys['Halo_mass']]
    stellar_mass = galaxies[:, data_keys['Stellar_mass']]

    min_halo_mass = np.amin(halo_mass)
    max_halo_mass = np.amax(halo_mass)
    
    lower_bin_edge = np.floor(min_halo_mass * 1/bin_width)*bin_width
    upper_bin_edge = np.ceil(max_halo_mass * 1/bin_width)*bin_width
    print(min_halo_mass, max_halo_mass)
    print(lower_bin_edge, upper_bin_edge)

    if (upper_bin_edge - lower_bin_edge) % bin_width > 1e-5 and (upper_bin_edge - lower_bin_edge) % bin_width < bin_width - 1e-5:
        print((upper_bin_edge - lower_bin_edge) % bin_width)
        upper_bin_edge += (upper_bin_edge - lower_bin_edge) % bin_width
#     print(lower_bin_edge, upper_bin_edge)

    bin_edges = np.arange(lower_bin_edge, upper_bin_edge + bin_width, bin_width)
    n_bins = len(bin_edges)-1
    bin_means, bin_edges, bin_numbers = stats.binned_statistic(halo_mass, stellar_mass, bins=bin_edges, statistic='mean')

    bin_centers = [(bin_edges[i] + bin_edges[i+1])/2 for i in range(len(bin_edges)-1)]
    fig = plt.figure(i_red, figsize=(15,5))
    ax = plt.subplot(121)
    ax.errorbar(bin_centers, bin_means, yerr=error*np.ones(len(bin_centers)), fmt = 'bo')
    ax = plt.subplot(122)
    ax.plot(halo_mass[:10000], stellar_mass[:10000], 'b.', markersize=1)
    plt.suptitle('Stellar masses at redshift {:.1f}'.format(redshift))
    plt.show()

    

    full_data_list = [[halo_mass, stellar_mass, error] for (halo_mass, stellar_mass) in zip(bin_centers, bin_means)]
    full_data_list.insert(0, {'bin_widths': bin_width, 'bin_edges': bin_edges.tolist()})

    with open(destination_directory + file_name + '.json', 'w+') as f:
        json.dump(full_data_list, f)
    f.close()

### Real data

In [6]:
filename = '/home/magnus/data/observational_data/all_data.h5'
file = h5py.File(filename, 'r')
#file?
# List all groups
print("Keys: ", list(file.keys()))

Keys:  ['Universe_0']


In [7]:
# Get the universe
universe_0 = file['Universe_0']
#universe_0?
# List all objects in the universe
print("Keys: ", list(universe_0.keys()))

Keys:  ['CSFRD', 'Clustering', 'FQ', 'Model_Parameters', 'SMF', 'SSFR']


In [8]:
# Get the SMF objects from the universe
smf = universe_0['SMF']
#smf?
# List all objects
print("Keys: ", list(smf.keys()))
# Get the dataset 'Data'
data = smf['Data']
print("Keys of Data: ", list(data.keys()))
data_keys = list(data.keys())
model = smf['Model']
sets = smf['Sets']

Keys:  ['Data', 'Model', 'Sets']
Keys of Data:  ['000 Li & White 2009 (z = 0.00 - 0.20)', '001 Baldry 2012 (z = 0.00 - 0.06)', '002 Bernardi 2013 (z = 0.00 - 0.20)', '003 Perez-Gonzalez 2008 (z = 0.00 - 0.20)', '004 Perez-Gonzalez 2008 (z = 0.20 - 0.40)', '005 Perez-Gonzalez 2008 (z = 0.40 - 0.60)', '006 Perez-Gonzalez 2008 (z = 0.60 - 0.80)', '007 Perez-Gonzalez 2008 (z = 0.80 - 1.00)', '008 Perez-Gonzalez 2008 (z = 1.00 - 1.30)', '009 Perez-Gonzalez 2008 (z = 1.30 - 1.60)', '010 Perez-Gonzalez 2008 (z = 1.60 - 2.00)', '011 Perez-Gonzalez 2008 (z = 2.00 - 2.50)', '012 Perez-Gonzalez 2008 (z = 2.50 - 3.00)', '013 Perez-Gonzalez 2008 (z = 3.00 - 3.50)', '014 Perez-Gonzalez 2008 (z = 3.50 - 4.00)', '015 Santini 2012 (z = 0.60 - 1.00)', '016 Santini 2012 (z = 1.00 - 1.40)', '017 Santini 2012 (z = 1.40 - 1.80)', '018 Santini 2012 (z = 1.80 - 2.50)', '019 Santini 2012 (z = 2.50 - 3.50)', '020 Santini 2012 (z = 3.50 - 4.50)', '021 Ilbert 2013 (z = 0.20 - 0.50)', '022 Ilbert 2013 (z = 0.50 - 

In [54]:
print(list(data.keys()))

['000 Li & White 2009 (z = 0.00 - 0.20)', '001 Baldry 2012 (z = 0.00 - 0.06)', '002 Bernardi 2013 (z = 0.00 - 0.20)', '003 Perez-Gonzalez 2008 (z = 0.00 - 0.20)', '004 Perez-Gonzalez 2008 (z = 0.20 - 0.40)', '005 Perez-Gonzalez 2008 (z = 0.40 - 0.60)', '006 Perez-Gonzalez 2008 (z = 0.60 - 0.80)', '007 Perez-Gonzalez 2008 (z = 0.80 - 1.00)', '008 Perez-Gonzalez 2008 (z = 1.00 - 1.30)', '009 Perez-Gonzalez 2008 (z = 1.30 - 1.60)', '010 Perez-Gonzalez 2008 (z = 1.60 - 2.00)', '011 Perez-Gonzalez 2008 (z = 2.00 - 2.50)', '012 Perez-Gonzalez 2008 (z = 2.50 - 3.00)', '013 Perez-Gonzalez 2008 (z = 3.00 - 3.50)', '014 Perez-Gonzalez 2008 (z = 3.50 - 4.00)', '015 Santini 2012 (z = 0.60 - 1.00)', '016 Santini 2012 (z = 1.00 - 1.40)', '017 Santini 2012 (z = 1.40 - 1.80)', '018 Santini 2012 (z = 1.80 - 2.50)', '019 Santini 2012 (z = 2.50 - 3.50)', '020 Santini 2012 (z = 3.50 - 4.50)', '021 Ilbert 2013 (z = 0.20 - 0.50)', '022 Ilbert 2013 (z = 0.50 - 0.80)', '023 Ilbert 2013 (z = 0.80 - 1.10)', '02

In [53]:
for i in range(len(list(data.keys()))):
    first_data_key = data[list(data.keys())[i]]
    bin_width = first_data_key.value[1][0] - first_data_key.value[0][0]
    print('i: {:d}, bin_width: {:.4f}'.format(i, bin_width))
    
    
    

i: 0, bin_width: 0.1000
i: 1, bin_width: 0.2000
i: 2, bin_width: 0.1000
i: 3, bin_width: 0.2000
i: 4, bin_width: 0.2000
i: 5, bin_width: 0.2000
i: 6, bin_width: 0.2000
i: 7, bin_width: 0.2000
i: 8, bin_width: 0.2000
i: 9, bin_width: 0.2000
i: 10, bin_width: 0.2000
i: 11, bin_width: 0.2000
i: 12, bin_width: 0.2000
i: 13, bin_width: 0.2000
i: 14, bin_width: 0.2000
i: 15, bin_width: 0.2000
i: 16, bin_width: 0.2000
i: 17, bin_width: 0.2000
i: 18, bin_width: 0.2000
i: 19, bin_width: 0.2000
i: 20, bin_width: 0.2000
i: 21, bin_width: 0.2850
i: 22, bin_width: 0.2500
i: 23, bin_width: 0.2500
i: 24, bin_width: 0.2500
i: 25, bin_width: 0.2500
i: 26, bin_width: 0.2000
i: 27, bin_width: 0.1600
i: 28, bin_width: 0.1600
i: 29, bin_width: 0.4300
i: 30, bin_width: 0.3000
i: 31, bin_width: 0.2850
i: 32, bin_width: 0.2400
i: 33, bin_width: 0.2200
i: 34, bin_width: 0.1800
i: 35, bin_width: 0.1500
i: 36, bin_width: 0.0600
i: 37, bin_width: 0.1000
i: 38, bin_width: 0.1000
i: 39, bin_width: 0.1000
i: 40, bin

In [29]:
print(list(sets))
#sets?

[(39, 0, 0., 0.2, b'Li & White 2009'), (21, 39, 0., 0.06, b'Baldry 2012'), (32, 60, 0., 0.2, b'Bernardi 2013'), (20, 92, 0., 0.2, b'Perez-Gonzalez 2008'), (20, 112, 0.2, 0.4, b'Perez-Gonzalez 2008'), (17, 132, 0.4, 0.6, b'Perez-Gonzalez 2008'), (14, 149, 0.6, 0.8, b'Perez-Gonzalez 2008'), (13, 163, 0.8, 1., b'Perez-Gonzalez 2008'), (13, 176, 1., 1.3, b'Perez-Gonzalez 2008'), (11, 189, 1.3, 1.6, b'Perez-Gonzalez 2008'), (10, 200, 1.6, 2., b'Perez-Gonzalez 2008'), (9, 210, 2., 2.5, b'Perez-Gonzalez 2008'), (7, 219, 2.5, 3., b'Perez-Gonzalez 2008'), (6, 226, 3., 3.5, b'Perez-Gonzalez 2008'), (6, 232, 3.5, 4., b'Perez-Gonzalez 2008'), (19, 238, 0.6, 1., b'Santini 2012'), (15, 257, 1., 1.4, b'Santini 2012'), (13, 272, 1.4, 1.8, b'Santini 2012'), (11, 285, 1.8, 2.5, b'Santini 2012'), (9, 296, 2.5, 3.5, b'Santini 2012'), (8, 305, 3.5, 4.5, b'Santini 2012'), (15, 313, 0.2, 0.5, b'Ilbert 2013'), (13, 328, 0.5, 0.8, b'Ilbert 2013'), (12, 341, 0.8, 1.1, b'Ilbert 2013'), (10, 353, 1.1, 1.5, b'Ilbe

In [None]:
print(list(model))
#model?

In [None]:
#print(list(data['000 Li & White 2009 (z = 0.00 - 0.20)']))
print(list(data))
#data?

In [None]:
(1 + 1/(1+.2))/2

In [None]:
data_obj_0 = data[data_keys[0]]
print(list(data_obj_0))
#data_obj_0?

### Using pandas

In [10]:
file_name = '/home/magnus/data/observational_data/all_data.h5'

file = pd.read_hdf(file_name, key='Universe_0')
file_header = file.keys().tolist()
print(file_header)

TypeError: cannot create a storer if the object is not existing nor a value are passed

In [69]:
predicted_redshift = 8
data_to_load = 'smf'

data_dir = '/home/magnus/data/observational_data/all_data.h5'
file = h5py.File(data_dir, 'r')
universe_0 = file['Universe_0']

if data_to_load == 'smf':
    smf = universe_0['SMF']
    data = smf['Data']
    sets = smf['Sets']

    for i_set, dataset in enumerate(list(sets)):
        
        if predicted_redshift >= dataset[2] and predicted_redshift <= dataset[3]:
            print('Redshift {:.2f} - {:.2f}. Survey {}'.format(dataset[2], dataset[3], dataset[-1]))
        
        

Redshift 7.50 - 8.50. Survey b'Song 2009'
