In [None]:
## Data retrieval

# Run this cell only if the data has not been downloaded from Zenodo yet (~10 GB).
# Expect the download to take some time!

import subprocess

# Plain wget alternative:
#subprocess.call('wget https://zenodo.org/record/7335961/files/active_learning_sims.tar.gz', shell=True)

# A downloader such as aria2 is strongly suggested for speed-up.
# Drop the "-q" flag from the options below for verbose output.

archive_url = 'https://zenodo.org/record/7335961/files/active_learning_sims.tar.gz'
aria2_options = '--file-allocation=none -c -x 10 -s 10 -q'

# The pieces are joined with single spaces into one shell command line.
subprocess.call(' '.join(['aria2c', aria2_options, archive_url]), shell=True)

# An output of "0" indicates success!

In [None]:
## Data setup

# Imported in this cell as well, so that it also runs on a fresh kernel when
# the download cell is skipped because the archive is already on disk.
import subprocess

# Untar the data

subprocess.call('tar -zxf active_learning_sims.tar.gz', shell=True)

# An output of "0" indicates success!

In [1]:
## Data setup

# Optional: once the data is downloaded, convert the spectra to hdf5 format.
# The conversion takes about an hour, but it only has to be done once and
# will *significantly* speed up IO during training.
# The training script needs reformatting if hdf5 spectra are not used.

import convert_dat_to_h5 as convh5

sims_dir = 'knsc1_active_learning'   # directory holding the simulation .dat files
h5_out = 'TP_wind2_spectra.h5'       # hdf5 file written by the conversion
time_cutoff = 21                     # going out to about 3 weeks

convh5.convert_dat_to_h5(path_to_sims_dir=sims_dir,
                         path_to_h5_out=h5_out,
                         cutoff=time_cutoff)

Converting spectra .dat files from knsc1_active_learning
Outputting spectra .hdf5 file to TP_wind2_spectra.h5
Using 60 time iterations with a maximum time of 21
Concatenated 1 out of 412 spectra
Concatenated 2 out of 412 spectra
Concatenated 3 out of 412 spectra
Concatenated 4 out of 412 spectra
Concatenated 5 out of 412 spectra
Concatenated 6 out of 412 spectra
Concatenated 7 out of 412 spectra
Concatenated 8 out of 412 spectra
Concatenated 9 out of 412 spectra
Concatenated 10 out of 412 spectra
Concatenated 11 out of 412 spectra
Concatenated 12 out of 412 spectra
Concatenated 13 out of 412 spectra
Concatenated 14 out of 412 spectra
Concatenated 15 out of 412 spectra
Concatenated 16 out of 412 spectra
Concatenated 17 out of 412 spectra
Concatenated 18 out of 412 spectra
Concatenated 19 out of 412 spectra
Concatenated 20 out of 412 spectra
Concatenated 21 out of 412 spectra
Concatenated 22 out of 412 spectra
Concatenated 23 out of 412 spectra
Concatenated 24 out of 412 spectra
Concaten

Concatenated 231 out of 412 spectra
Concatenated 232 out of 412 spectra
Concatenated 233 out of 412 spectra
Concatenated 234 out of 412 spectra
Concatenated 235 out of 412 spectra
Concatenated 236 out of 412 spectra
Concatenated 237 out of 412 spectra
Concatenated 238 out of 412 spectra
Concatenated 240 out of 412 spectra
Concatenated 241 out of 412 spectra
Concatenated 242 out of 412 spectra
Concatenated 243 out of 412 spectra
Concatenated 244 out of 412 spectra
Concatenated 245 out of 412 spectra
Concatenated 246 out of 412 spectra
Concatenated 247 out of 412 spectra
Concatenated 248 out of 412 spectra
Concatenated 249 out of 412 spectra
Concatenated 250 out of 412 spectra
Concatenated 251 out of 412 spectra
Concatenated 252 out of 412 spectra
Concatenated 253 out of 412 spectra
Concatenated 254 out of 412 spectra
Concatenated 255 out of 412 spectra
Concatenated 256 out of 412 spectra
Concatenated 257 out of 412 spectra
Concatenated 258 out of 412 spectra
Concatenated 259 out of 412 

In [3]:
## Random forest training

import spectra_interpolator as si

# Select the random forest interpolator
intp = si.intp(rf=True)

# Load the simulation parameters and the spectra
# t_max=None implies time is used during interpolation
# theta is defined in degrees (30 is requested here)

sim_files_glob = 'knsc1_active_learning/*spec*'
spectra_h5 = 'TP_wind2_spectra.h5'

intp.load_data(sim_files_glob, spectra_h5,
               t_max=None, theta=30, trim_dataset=True)

# Set aside a verification test set

intp.create_test_set(size=5)

# Add time as the 5th input parameter
# Nominally, spectra have shape [N_sims, times, wavs, thetas]
#   free time + fixed angle: append_input_parameter(intp.times, 1)
#   free angle + fixed time: append_input_parameter(intp.angles, 2)

intp.append_input_parameter(intp.times, 1)

# Preprocess the data for easier training

intp.preprocess()

# Train!

intp.train()

# Store the trained model on disk

intp.save('rf_spec_intp.joblib')

# With no arguments, evaluate() runs on the test set input parameters
# If arguments are provided, the results are stored under intp.prediction
# Pass ret_out=True to have the predictions returned

intp.evaluate()

# Plot the test set to visually identify off-sample fitting
# The test set plots are stored in the intp_figures directory

intp.make_plots()

Using rf interpolator
Parameter array shape:  (412, 4)
Spectra times:  [ 1.41419956  1.54219501  1.68177499  1.83398798  1.99997737  2.18099002
  2.37838564  2.59364701  2.82839112  3.0843813   3.36354047  3.66796559
  3.99994342  4.36196769  4.75675782  5.18727935  5.65676624  6.16874514
  6.7270619   7.33591043  7.99986422  8.72391071  9.51348872 10.37452935]
Angle index of 2 corresponds to 31.586338 degrees
Loading h5 file, this may take a while on the first execution
(412, 60, 1024, 54)
Data loaded and reshaped
Test set parameters should be:  [[0.00101  0.187834 0.034515 0.298814]
 [0.001392 0.291114 0.047775 0.086227]
 [0.00149  0.222715 0.001105 0.09309 ]
 [0.002314 0.262187 0.099887 0.162864]
 [0.056379 0.297409 0.095523 0.05931 ]]
Test set parameters are:  [[1.01000000e-03 1.87834000e-01 3.45150000e-02 2.98814000e-01
  1.41419956e+00]
 [1.39200000e-03 2.91114000e-01 4.77750000e-02 8.62270000e-02
  1.83398798e+00]
 [1.49000000e-03 2.22715000e-01 1.10500000e-03 9.30900000e-02
  1