In [None]:
import numpy as np
import os
import numpy
import pandas as pd
import ast

#This notebook updates the data for the black-box functions after each submission / measurement

Usage:

This notebook assumes that we have initial data in directory ./measurements/wk00, that we receive updates which we drop into the directory ./measurements/wk01 as ./measurements/wk01/new_inputs.txt and ./measurements/wk01/new_outputs.txt, and that the updated data is stored as numpy arrays in
    
    ./measurements/wk01/function_1/inputs.npy and outputs.npy
    ./measurements/wk01/function_2/inputs.npy and outputs.npy
    ...

and that this latest data is copied into ./measurements/latest

Weekly update procedure is as follows:

    (1) Add directory for the updated data, for example ./measurements/wk01/
    (2) Drop inside ./measurements/wk01/new_inputs.txt and ./measurements/wk01/new_outputs.txt
    (3) Update the variable Previous_week_number
    (4) Run the notebook, which will merge the new data and write the data directories
    (5) Commit and move on to the analysis notebook, which assumes data in ./measurements/latest


In [3]:
## Helper function

''' Converts black box functions from np.arrays to pandas.DataFrames for easier interpretation of tabular data
Inputs:
    npa_x       inputs x1,x2,... as np array
    npa_y       output y as np array

Outputs:
    df          Dataframe with columns x1, x2, ... and y
'''

def fcn_as_df(npa_x,npa_y):
    """Combine the inputs and outputs of one black-box function into a DataFrame.

    Parameters
    ----------
    npa_x : array-like
        Inputs. Either 2-D with one row per observation, or 1-D holding a
        single observation (x1, x2, ...), which is treated as one row.
    npa_y : array-like or scalar
        Outputs, one value per observation. Any shape is accepted; it is
        flattened to 1-D. A bare scalar is allowed for a single observation.

    Returns
    -------
    pandas.DataFrame
        Columns ``x1, x2, ...`` (one per input dimension) followed by ``y``.

    Raises
    ------
    ValueError
        If the number of outputs differs from the number of input rows.
    """
    # np.asarray lets lists and plain Python numbers through; the previous
    # implementation called .ndim / .reshape directly and therefore required
    # numpy objects on both arguments.
    x = np.asarray(npa_x)
    if x.ndim == 1:
        # A single observation: promote to one row so column naming works.
        x = x.reshape(1, -1)

    y = np.ravel(np.asarray(npa_y))
    if y.shape[0] != x.shape[0]:
        raise ValueError(
            f"Got {y.shape[0]} output value(s) for {x.shape[0]} input row(s)"
        )

    col_names = [f"x{i+1}" for i in range(x.shape[1])]
    df = pd.DataFrame(x, columns=col_names)
    df["y"] = y
    return df


In [4]:
''' Helper - Reads numpy arrays for each of the eight functions from disk and returns a list of dataframes containing the input and output info

Inputs: 
    path_to_fcns    Directory path - must contain folders "function_1","function_2"... each containing inputs.npy and outputs.npy 

Outputs: 
    fcn_dict - Dictionary of dataframes for each function, each dataframe containing columns x1, x2,..., y
'''


def read_fcns_from_disk(path_to_fcns, num_functions=8):
    """Load the stored samples of every black-box function from one folder.

    Parameters
    ----------
    path_to_fcns : str
        Directory containing sub-folders ``function_1``, ``function_2``, ...,
        each holding ``inputs.npy`` and ``outputs.npy``.
    num_functions : int, optional
        How many ``function_<i>`` folders to read, starting at 1. Defaults to
        8, the count this notebook was originally hard-coded to.

    Returns
    -------
    dict
        ``{"f_1": DataFrame, "f_2": DataFrame, ...}`` where each value is what
        ``fcn_as_df`` builds from that function's inputs and outputs.

    Raises
    ------
    FileNotFoundError
        Propagated from ``np.load`` when an expected ``.npy`` file is missing.
    """
    fcn_dict = {}
    for idx in range(1, num_functions + 1):
        dir_path = os.path.join(path_to_fcns, f"function_{idx}")
        inputs = np.load(os.path.join(dir_path, "inputs.npy"))
        outputs = np.load(os.path.join(dir_path, "outputs.npy"))
        # Folder function_<idx> maps to dictionary key f_<idx>.
        fcn_dict[f"f_{idx}"] = fcn_as_df(inputs, outputs)
    return fcn_dict
 



In [None]:
''' Helper - Reads updated measurement points from queries and converts to a dictionary that can be merged with the prior data

Inputs:
    path_to_fresh           path to a folder that must contain two txt files: new_inputs.txt and new_outputs.txt

Outputs:
    new_data_point_dict     dictionary of dataframes, each containing a single row and columns x1, x2, ..., y, ready for merge with existing data
'''

class _NumpyReprStripper(ast.NodeTransformer):
    """Replace numpy repr wrappers (``array([...])``, ``np.float64(x)``) by their first argument."""

    _WRAPPERS = frozenset({"array", "float64", "float32", "float16", "int64", "int32"})
    _NAMESPACES = frozenset({"np", "numpy"})
    _SPECIAL_FLOATS = frozenset({"nan", "inf"})

    def visit_Call(self, node):
        func = node.func
        if isinstance(func, ast.Name):
            name = func.id
        elif (
            isinstance(func, ast.Attribute)
            and isinstance(func.value, ast.Name)
            and func.value.id in self._NAMESPACES
        ):
            name = func.attr
        else:
            name = None

        if name in self._WRAPPERS and node.args:
            # Keep only the wrapped literal; keyword arguments such as dtype= are dropped.
            return self.visit(node.args[0])
        # Any other call is left in place so that ast.literal_eval rejects it.
        return node

    def visit_Name(self, node):
        # numpy prints non-finite values as bare nan / inf inside array reprs.
        if node.id in self._SPECIAL_FLOATS:
            return ast.copy_location(ast.Constant(value=float(node.id)), node)
        return node


def _parse_numpy_repr(text):
    """Parse the printed form of a list of numbers / numpy arrays without executing it."""
    tree = ast.parse(text.strip(), mode="eval")
    return ast.literal_eval(_NumpyReprStripper().visit(tree))


def read_fresh_measurements_from_disk(path_to_fresh):
    """Read this week's query points and results and return them as one-row DataFrames.

    Parameters
    ----------
    path_to_fresh : str
        Folder containing ``new_inputs.txt`` and ``new_outputs.txt``. Each file
        holds the printed form of a Python list with one entry per function:
        inputs look like ``[array([0.997304, 1.]), array([...]), ...]`` and
        outputs are a list of numbers (optionally wrapped as ``np.float64(...)``).

    Returns
    -------
    dict
        ``{"f_1": DataFrame, "f_2": DataFrame, ...}`` with one entry per list
        element, each built by ``fcn_as_df`` from a single input vector and
        its output value.

    Raises
    ------
    ValueError
        If a file contains anything other than literals and the recognised
        numpy wrappers, or if the two files hold different numbers of entries.
    SyntaxError
        If a file is not a syntactically valid Python expression.

    Notes
    -----
    SECURITY: earlier versions ran ``eval`` on the raw file text, which executes
    arbitrary code found in the files. The files are now parsed with
    ``ast.literal_eval`` after stripping the numpy wrappers, so nothing in
    them is executed.
    """
    new_input_file = os.path.join(".", path_to_fresh, "new_inputs.txt")
    new_output_file = os.path.join(".", path_to_fresh, "new_outputs.txt")

    with open(new_output_file, "r") as f:
        output_list = [np.float64(y) for y in _parse_numpy_repr(f.read())]

    with open(new_input_file, "r") as f:
        input_list = [np.array(x) for x in _parse_numpy_repr(f.read())]

    if len(input_list) != len(output_list):
        raise ValueError(
            f"{new_input_file} has {len(input_list)} entries but "
            f"{new_output_file} has {len(output_list)}"
        )

    # Entry i (1-based) of both lists belongs to function i.
    return {
        f"f_{i}": fcn_as_df(x, y)
        for i, (x, y) in enumerate(zip(input_list, output_list), start=1)
    }


f_1
---
         x1   x2              y
0  0.997304  1.0  3.215893e-191
f_2
---
         x1   x2         y
0  0.758325  1.0  0.240588
f_3
---
         x1   x2        x3         y
0  0.805407  1.0  0.556213 -0.064954
f_4
---
    x1        x2        x3        x4          y
0  1.0  0.742121  0.737022  0.430983 -26.331648
f_5
---
        x1        x2   x3        x4            y
0  0.25491  0.962474  1.0  0.998899  3964.328632
f_6
---
         x1       x2   x3        x4        x5         y
0  0.994041  0.21117  1.0  0.947369  0.076993 -1.244888
f_7
---
         x1       x2        x3        x4        x5   x6         y
0  0.079204  0.67263  0.338485  0.298396  0.575165  1.0  0.160449
f_8
---
         x1        x2        x3       x4        x5        x6        x7   x8  \
0  0.063205  0.073851  0.025674  0.04343  0.452292  0.896953  0.546764  1.0   

          y  
0  9.472972  
{'f_1':          x1   x2              y
0  0.997304  1.0  3.215893e-191, 'f_2':          x1   x2         y
0  0.758325 

In [21]:
''' OVERALL WORKFLOW

 (1) Read previous week's data
 (2) Read this week's updated inputs and outputs 
 (3) Add the new inputs and outputs to the data and display it
 (4) Export as numpy arrays
 (5) Save in this week's folder
 (6) copy this week's folder to "latest"
 '''


# --- Configuration ---------------------------------------------------------
# Week whose data is already on disk; the fresh measurements are appended to it.
Previous_week_number = 0
meas_path = "./measurements"

# Zero-padded folder names for the previous and the current week.
prev_wk_str = "wk{:02d}".format(Previous_week_number)
curr_wk_str = "wk{:02d}".format(Previous_week_number + 1)

path_to_previous_functions = os.path.join(meas_path, prev_wk_str)
path_to_new_measurements = os.path.join(meas_path, curr_wk_str)

# --- Load ------------------------------------------------------------------
f_previous = read_fcns_from_disk(path_to_previous_functions)
f_new = read_fresh_measurements_from_disk(path_to_new_measurements)

# --- Merge and report --------------------------------------------------------
f_merged = {}
for k, fresh_point in f_new.items():
    # By design, f_previous and f_new share the same keys.
    f_merged[k] = pd.concat([f_previous[k], fresh_point], ignore_index=True)

    report = (
        ("Previous week data - " + prev_wk_str, f_previous[k]),
        ("Fresh data point - " + curr_wk_str, fresh_point),
        ("Merged data - " + curr_wk_str, f_merged[k]),
    )
    print(k)
    print("---")
    for heading, frame in report:
        print(heading)
        print(frame)




# write the merged data to the current week's directory as np arrays, ready for reading next week





f_1
---
Previous week data - wk00
         x1        x2              y
0  0.319404  0.762959   1.322677e-79
1  0.574329  0.879898   1.033078e-46
2  0.731024  0.733000   7.710875e-16
3  0.840353  0.264732  3.341771e-124
4  0.650114  0.681526  -3.606063e-03
5  0.410437  0.147554  -2.159249e-54
6  0.312691  0.078723  -2.089093e-91
7  0.683418  0.861057   2.535001e-40
8  0.082507  0.403488   3.606771e-81
9  0.883890  0.582254   6.229856e-48
Fresh data point - wk01
         x1   x2              y
0  0.997304  1.0  3.215893e-191
Merged data - wk01
          x1        x2              y
0   0.319404  0.762959   1.322677e-79
1   0.574329  0.879898   1.033078e-46
2   0.731024  0.733000   7.710875e-16
3   0.840353  0.264732  3.341771e-124
4   0.650114  0.681526  -3.606063e-03
5   0.410437  0.147554  -2.159249e-54
6   0.312691  0.078723  -2.089093e-91
7   0.683418  0.861057   2.535001e-40
8   0.082507  0.403488   3.606771e-81
9   0.883890  0.582254   6.229856e-48
10  0.997304  1.000000  3.215893e-

In [22]:
''' writes dictionary of functions back to disk 

inputs:
    write_path  string containing path to write to, for example, ./measurements/wk01
    d           Dictionary of DataFrames

output: the following hierarchy is written to disk

write_path
    function_1
        inputs.npy
        outputs.npy
    function_2
        inputs.npy
        outputs.npy
        ...
    function_8
        inputs.npy
        outputs.npy

'''
def write_dict_of_functions_to_disk(write_path,d):
    """Write each DataFrame in ``d`` to ``write_path`` as a pair of ``.npy`` files.

    Parameters
    ----------
    write_path : str
        Destination folder, e.g. ``./measurements/wk01``.
    d : dict
        Dictionary of DataFrames. The last column of each frame is saved as
        the output, all other columns as the inputs.

    Output
    ------
    For the n-th entry of ``d`` (1-based, in iteration order) the folder
    ``write_path/function_<n>`` is created with ``inputs.npy`` and
    ``outputs.npy`` inside. Note the folder number comes from the position in
    the dictionary, not from the key.

    Raises
    ------
    FileExistsError
        If any target ``function_<n>`` folder already exists. All targets are
        checked before anything is written, so a collision never leaves a
        partially written set of folders behind.
    """
    targets = [
        os.path.join(write_path, f"function_{pos}") for pos in range(1, len(d) + 1)
    ]

    # Validate every destination first: refusing to overwrite must not happen
    # halfway through, after some folders were already created.
    for fcn_directory in targets:
        if os.path.exists(fcn_directory):
            raise FileExistsError(f"Directory '{fcn_directory}' already exists!")

    for fcn_directory, df in zip(targets, d.values()):
        os.makedirs(fcn_directory)
        input_file_path = os.path.join(fcn_directory, "inputs.npy")
        output_file_path = os.path.join(fcn_directory, "outputs.npy")

        y = df.iloc[:, -1].to_numpy()      # output is in last column
        x = df.iloc[:, :-1].to_numpy()     # other columns are inputs

        np.save(output_file_path, y)
        print(f"saved {output_file_path}")
        np.save(input_file_path, x)
        print(f"saved {input_file_path}")
    

In [23]:
# Persist the merged tables under the current week's folder; the writer raises
# FileExistsError if a function_<n> sub-folder is already present there.
write_path = os.path.join(meas_path, curr_wk_str)
write_dict_of_functions_to_disk(write_path=write_path, d=f_merged)



saved ./measurements/wk01/function_1/outputs.npy
saved ./measurements/wk01/function_1/inputs.npy
saved ./measurements/wk01/function_2/outputs.npy
saved ./measurements/wk01/function_2/inputs.npy
saved ./measurements/wk01/function_3/outputs.npy
saved ./measurements/wk01/function_3/inputs.npy
saved ./measurements/wk01/function_4/outputs.npy
saved ./measurements/wk01/function_4/inputs.npy
saved ./measurements/wk01/function_5/outputs.npy
saved ./measurements/wk01/function_5/inputs.npy
saved ./measurements/wk01/function_6/outputs.npy
saved ./measurements/wk01/function_6/inputs.npy
saved ./measurements/wk01/function_7/outputs.npy
saved ./measurements/wk01/function_7/inputs.npy
saved ./measurements/wk01/function_8/outputs.npy
saved ./measurements/wk01/function_8/inputs.npy


In [24]:
# Re-read the functions that were just written and compare them with the
# in-memory merged data to be sure the round trip through disk is lossless.
d = read_fcns_from_disk(path_to_new_measurements)

for k in f_merged.keys():

    # subtract the data.  They should be identical, i.e. all zeros
    delta = d[k] - f_merged[k]

    print("Re-reading from disk to confirm that data was re-read correctly")
    print(k)
    print("---")
    print(delta)
    print(" ")

    # Printing alone cannot fail: enforce the check so that a mismatch stops
    # the notebook instead of hiding in a long table of numbers. Placed after
    # the printout so the offending delta is visible when this raises.
    pd.testing.assert_frame_equal(d[k], f_merged[k], check_exact=True)
 

Re-reading from disk to confirm that data was re-read correctly
f_1
---
     x1   x2    y
0   0.0  0.0  0.0
1   0.0  0.0  0.0
2   0.0  0.0  0.0
3   0.0  0.0  0.0
4   0.0  0.0  0.0
5   0.0  0.0  0.0
6   0.0  0.0  0.0
7   0.0  0.0  0.0
8   0.0  0.0  0.0
9   0.0  0.0  0.0
10  0.0  0.0  0.0
 
Re-reading from disk to confirm that data was re-read correctly
f_2
---
     x1   x2    y
0   0.0  0.0  0.0
1   0.0  0.0  0.0
2   0.0  0.0  0.0
3   0.0  0.0  0.0
4   0.0  0.0  0.0
5   0.0  0.0  0.0
6   0.0  0.0  0.0
7   0.0  0.0  0.0
8   0.0  0.0  0.0
9   0.0  0.0  0.0
10  0.0  0.0  0.0
 
Re-reading from disk to confirm that data was re-read correctly
f_3
---
     x1   x2   x3    y
0   0.0  0.0  0.0  0.0
1   0.0  0.0  0.0  0.0
2   0.0  0.0  0.0  0.0
3   0.0  0.0  0.0  0.0
4   0.0  0.0  0.0  0.0
5   0.0  0.0  0.0  0.0
6   0.0  0.0  0.0  0.0
7   0.0  0.0  0.0  0.0
8   0.0  0.0  0.0  0.0
9   0.0  0.0  0.0  0.0
10  0.0  0.0  0.0  0.0
11  0.0  0.0  0.0  0.0
12  0.0  0.0  0.0  0.0
13  0.0  0.0  0.0  0.0
14 