In [None]:
#Demonstrates the Base workflow with the Processor

In [1]:
import flowcode
import processing
import res_flow_vis as visual
import device_use
import externalize as ext

import torch
import numpy as np

In [2]:
#Identifier of this specific run; it is reused below as the stem of the saved
#model/loss files in the saves folder and as the label of the plots.
#Alternative name kept for reference: "leavout_MttZ_MWs_CL2_24_10_512_8_lo5"
filename = "leavout_MttZ_all_CL2_24_10_512_8_lo[66, 20, 88, 48, 5]"

In [3]:
#Instantiate the data processor (Processor_cond); all later data handling in this
#notebook (loading, cleaning, subset selection, flow preparation, sampling) goes through mpc
mpc = processing.Processor_cond()

In [4]:
#Load the raw data via the processor
#NOTE(review): "all_sims" is presumably the name of the data collection/folder to read - confirm in processing.py
Galaxies_raw = mpc.get_data("all_sims")

In [5]:
#Clean the raw data; the result is kept under a new name so the raw data stays available
#NOTE(review): N_min=500 presumably is the minimum number of stars a galaxy must have to be kept - confirm in processing.py
Galaxies_cleaned = mpc.constraindata(Galaxies_raw, N_min=500)

Cut out 5 of 95 galaxies, 2072015 of 34878379 stars (~6%).


In [6]:
#Choose subsets of the cleaned data
#Conditions to use:
#given by a function that contains the instructions to compute the conditions from the galaxy data, and their names
cond_fn = ext.cond_M_stars_2age_avZ

#Galaxies left out of the training set (e.g. leave one out); an empty list leaves out nothing
leavout_idices = [66, 20, 88, 48, 5]

#Selectors: one for the full set that is viewed for comparison, one for the training set
use_fn_view = ext.construct_all_galaxies_leavout("id", [])
use_fn_train = ext.construct_all_galaxies_leavout("id", leavout_idices)

#Subset to view for comparison (all galaxies)
Galaxies = mpc.choose_subset(
    Galaxies_cleaned,
    use_fn=use_fn_view,
    cond_fn=cond_fn,
)

#Subset to train on
Galaxies_train = mpc.choose_subset(
    Galaxies_cleaned,
    use_fn=use_fn_train,
    cond_fn=cond_fn,
)

Chose 90 of 90 galaxies.
Chose 85 of 90 galaxies.


In [7]:
#Choose the device: the model is moved to it after construction and saved
#weights are mapped to it when they are loaded back
device = "cpu"

In [8]:
#Hyperparameters of the flow
#Layer type and number of layers
LAYER_TYPE = flowcode.NSF_CL2
N_LAYERS = 24

#Dimensions: conditions are the galaxy-level condition names known to the processor
COND_NAMES = mpc.cond_names["galaxy"]
DIM_COND = len(COND_NAMES)
#NOTE(review): len() of the "stars" entry is the row count if that entry is a pandas DataFrame;
#confirm that this expression really yields the number of star features
DIM_NOTCOND = len(Galaxies_train[0]["stars"]) - DIM_COND

#Layer parameters
#NOTE(review): K and B are presumably spline settings of the NSF layers - confirm in flowcode
SPLIT = 0.5
K = 10
B = 3

#Network used inside the layers
BASE_NETWORK = flowcode.MLP
BASE_NETWORK_N_LAYERS = 8
BASE_NETWORK_N_HIDDEN = 512
BASE_NETWORK_LEAKY_RELU_SLOPE = 0.2

#Turn SPLIT into keyword arguments for the model: only the NSF_CL layer type receives a "split" value
if LAYER_TYPE == flowcode.NSF_CL:
    SPLIT = {"split": SPLIT}
else:
    SPLIT = {}

In [9]:
#Build the flow from the hyperparameters above and move it to the chosen device
model = flowcode.NSFlow(
    N_LAYERS,
    DIM_NOTCOND,
    DIM_COND,
    LAYER_TYPE,
    **SPLIT,
    K=K,
    B=B,
    network=BASE_NETWORK,
    network_args=(
        BASE_NETWORK_N_HIDDEN,
        BASE_NETWORK_N_LAYERS,
        BASE_NETWORK_LEAKY_RELU_SLOPE,
    ),
).to(device)
#Optional: start from a pre-trained model instead
#model.load_state_dict(torch.load("saves/leavout_M_star_MWs_CL2_24_10_512_8_lo1.pth"))

In [10]:
#Training hyperparameters
N_EPOCHS = 12
INIT_LR = 9e-5
GAMMA = 0.998
BATCH_SIZE = 1024

#Names of the components that are learned in log space
LOG_LEARN = (["M_stars"],)

#How to scale the data so the flow learns in a different space:
#one entry per transformation, with the names it applies to and its inverse at the same position
transformations = (np.log10,)
trf_names = (LOG_LEARN,)
transformations_inv = (lambda x: 10**x,)

#For demonstration purposes: log-determinant belonging to the transformation
logdets = (ext.logdet_log10,)

In [11]:
#Stack the training galaxies and bring them into the form the flow is trained on,
#applying the transformations defined above
Data_flow = mpc.Data_to_flow(
    mpc.diststack(Galaxies_train),
    transformations,
    trf_names,
    transformations_inv,
    transformation_logdets=logdets,
)

In [None]:
#Training takes long, so usually it should not run in a notebook but in a nohup background process.
#E.g. the cond_trainer.py script can be used to train the model;
#this cell exports model, data, parameters and run name to the cond_trainer folder for it.
torch.save(Data_flow, "cond_trainer/data_cond_trainer.pth")
torch.save(model, "cond_trainer/model_cond_trainer.pth")
#The parameter file is one flat array: the condition names followed by the training hyperparameters
np.save(
    "cond_trainer/params_cond_trainer.npy",
    np.append(
        COND_NAMES,
        np.array([N_EPOCHS, INIT_LR, BATCH_SIZE, GAMMA]),
    ),
)
np.save("cond_trainer/filename_cond_trainer.npy", filename)

In [None]:
#Start the background training from a shell:
#    nohup python cond_trainer.py <name_suffix> <optional: GPU id> &
#The script saves the model in the saves folder together with the loss history; the training time is the last entry of the loss history.
#Training information is flushed to a text file whose name contains the name suffix.

In [None]:
#...OR train directly in the notebook/current process
import time
train_loss_saver = []
start = time.perf_counter()
flowcode.train_flow(
    model,
    Data_flow,
    COND_NAMES,
    N_EPOCHS,
    lr=INIT_LR,
    batch_size=BATCH_SIZE,
    loss_saver=train_loss_saver,
    gamma=GAMMA,
)
end = time.perf_counter()
#Save the trained weights, then the loss history with the elapsed training time (seconds) appended as last entry
torch.save(model.state_dict(), f"saves/{filename}.pth")
np.save(f"saves/loss_{filename}.npy", np.array([*train_loss_saver, end - start]))

In [12]:
#Load the training results of this run: weights into the model, loss history from file
model.load_state_dict(
    torch.load(f"saves/{filename}.pth", map_location=device)
)
loss_results = np.load(f"saves/loss_{filename}.npy")
#The last entry of the stored array is the training time, everything before it is the loss history;
#the time is divided by 60 (seconds to minutes when stored as by the in-notebook training cell)
tot_time = loss_results[-1] / 60
loss_results = loss_results[:-1]

In [13]:
#Get a sample from the flow and regroup it into per-galaxy dictionaries
use_GPUs = [9]
import time
start = time.perf_counter()

#Set a condition for the sample
#Here, sample back all galaxies
condition = mpc.diststack(Galaxies)[COND_NAMES]

#Get sample
flow_sample = mpc.sample_to_Data(
    mpc.sample_Conditional(model, condition, split_size=int(6e5), GPUs=use_GPUs)
)

#To revert to a galaxy interpretation we need to specify the number of stars in each galaxy
#Again, we use the same as in the data
#(mpc.get_array(Galaxies, "galaxy", "N_stars") was shown as an alternative here; its result was
#immediately overwritten by the line below, so only the version that is actually used is evaluated)
N_stars_galaxies = np.array([len(galaxy["stars"]) for galaxy in Galaxies])

#Convert sample to galaxy interpretation
flow_sample = mpc.galaxysplit(flow_sample, N_stars_galaxies)

#However this is now a list of pandas dataframes, not a list of dictionaries as the original data
#we can convert it back to a list of dictionaries
flow_sample = [{"stars": galaxy_stars} for galaxy_stars in flow_sample]

#In our special case, we can also reinsert the galaxy information and the conditions
#that we know from the original data. This is of course not possible in general
for galaxy_flow, galaxy in zip(flow_sample, Galaxies):
    galaxy_flow["galaxy"] = galaxy["galaxy"]
    galaxy_flow["parameters"] = galaxy["parameters"]

#Format in minutes and seconds
#The clock is read once, so minutes and seconds always belong to the same measurement
#(two separate reads could straddle a minute boundary and report an inconsistent time)
elapsed = time.perf_counter() - start
minutes, seconds = divmod(int(elapsed), 60)
print(f"Time to sample: {minutes} minutes and {seconds} seconds")

Time to sample: 26 minutes and 41 seconds


In [None]:
### Visualize data

In [1]:
#Plot multiple galaxies: original data next to the flow sample
visual.plot_conditional_2(
    Galaxies,
    flow_sample,
    type="N",
    label=filename,
    N_unit="massperkpc",
    color_pass="first",
    global_grid=True,
)

NameError: name 'visual' is not defined

In [None]:
#Comparison plots for a single galaxy: entry 5 of the data and the matching entry of the flow sample
visual.get_result_plots(
    Galaxies[5],
    flow_sample[5],
    label=filename,
    format_="pdf",
)

In [None]:
#Histograms of the flow sample, with the log option switched on
visual.plot_conditional_histograms(
    flow_sample,
    label=filename,
    log=True,
)

In [None]:
#Plot the loaded loss history together with the total training time; the run name is passed as savefig
visual.loss_plot(
    loss_results,
    tot_time=tot_time,
    savefig=filename,
)