# grid_to_df
Notebook for converting GARSTEC output HDF5 files into neural-network-friendly dataframes, which are saved back to HDF5.

In [None]:
#misc
import pandas as pd
import numpy as np
import h5py
import os

## import garstec h5 file

In [None]:
# Location of the GARSTEC grid file on disk.
garstec_path = "/home/oxs235/datastorage/repos_data/ojscutt/pitchfork/data/Garstec_AS09_barbieMS.hdf5" ### EDIT WITH YOUR FILEPATH

# Opened read-only and deliberately left open: later cells read tracks from this handle.
garstec_h5 = h5py.File(garstec_path, mode='r')

# Show the names stored under one example track.
example_track = garstec_h5['grid/tracks/track0001']
print(list(example_track.keys()))

## Define and run the `track_df_gen` function

In [None]:
def _pad_frequencies(nu_edge, offsets, dnufit):
    """
    Extrapolate frequencies away from nu_edge in steps of dnufit.

    args:
    nu_edge -- outermost available frequency to extrapolate from
    offsets -- array of signed radial-order offsets (e.g. [1, 2] or [-2, -1])
    dnufit -- frequency spacing used for each radial-order step

    returns:
    list with one frequency per offset; if the arithmetic with dnufit fails
    (e.g. dnufit is not numeric), nu_edge is repeated instead
    """
    try:
        return (nu_edge + offsets * dnufit).tolist()
    except (TypeError, ValueError):
        # Best-effort fallback kept from the original behaviour: repeat the edge value.
        return np.full(len(offsets), nu_edge).tolist()


def _radial_mode_row(osckey, osc, dnufit, n_min, n_max):
    """
    Build the frequency row for one model, padded/cropped to orders n_min->n_max.

    args:
    osckey -- mode key for the model; entries where osckey[0] == 0 are selected and
              osckey[1] supplies their radial orders
    osc -- mode data for the model; osc[0] supplies the frequencies matching osckey
    dnufit -- spacing used to extrapolate orders missing from the model
    n_min -- lowest radial order to return
    n_max -- highest radial order to return

    returns:
    (nu_vals, padded) -- list of frequencies for orders n_min->n_max, and a bool
    that is True when any of them had to be extrapolated

    notes:
    - Assumes the selected radial orders are consecutive and ascending, since the row is
      cropped by position rather than by order value -- TODO confirm for your grid.
    - Raises IndexError if the model has no entries with osckey[0] == 0.
    """
    selected = np.where(osckey[0] == 0)[0]
    n_vals = osckey[1][selected]
    nu_vals = list(osc[0][selected])

    n_lower = n_vals[0]
    n_upper = n_vals[-1]
    padded = False

    if n_upper < n_max:
        n_diff = int(n_max - n_upper)
        steps = np.arange(1, n_diff + 1, dtype=float)
        nu_vals = nu_vals + _pad_frequencies(nu_vals[-1], steps, dnufit)
        padded = True

    if n_lower > n_min:
        n_diff = int(n_lower - n_min)
        # Offsets run -n_diff ... -1 so the padded values stay in ascending order.
        steps = -np.arange(n_diff, 0, -1, dtype=float)
        nu_vals = _pad_frequencies(nu_vals[0], steps, dnufit) + nu_vals
        n_lower = n_min
        padded = True

    # Crop by position: index 0 now corresponds to radial order n_lower (<= n_min).
    start = int(n_min - n_lower)
    stop = int(n_max - n_lower) + 1
    return nu_vals[start:stop], padded


def track_df_gen(grid, track_ids, n_min=15, n_max=25, headers=('age', 'Teff', 'LPhot')):
    """
    Convert garstec hdf5 file (grid) to NN friendly dataframe file for training.

    args:
    grid -- garstec hdf5 out file (anything indexable as grid['grid/tracks/track<id>'])
    track_ids -- list of track id strings (including leading zeros) to iterate over

    kwargs:
    n_min -- lower limit of radial order to load/pad
    n_max -- upper limit of radial order to load/pad
    headers -- relevant headers to include in final dataframe (inputs and outputs for training, typically)

    returns:
    pandas dataframe with columns:
        - ['track_id'] -- value of int(track_id) from the GARSTEC hdf5 out file, so no leading 0s
        - [headers] -- columns for each header defined by 'headers' kwarg
        - [nu_headers] -- columns 'nu_0_<n>' of mode frequency values for radial orders in range n_min->n_max
    An empty dataframe with these columns is returned when track_ids is empty.

    notes:
    - Operates track by track, and fills missing mode frequencies by padding using GARSTEC dnufit.
    - Modes loaded/filled between a min and max radial order. This must be consistent between all dataframe rows.
    - Warning is printed when padding is occurring. This is unexpected behaviour if radial orders between n_min and n_max should be present for all points in the grid.
    - The number of points in a track is taken from its 'age' dataset.
    - Tracks with a single point are supported; tracks with no points contribute no rows.
    """
    headers = list(headers)
    nu_headers = [f"nu_0_{n}" for n in range(n_min, n_max + 1)]
    columns = ['track_id'] + headers + nu_headers

    # Collect one 2D array per track and stack once at the end, rather than
    # re-copying the growing result on every iteration.
    track_arrays = []
    for track_id in track_ids:
        print(str(track_id), end="\r")
        track = grid['grid/tracks/track' + track_id]

        n_points = len(track['age'])
        osckeys = track['osckey']
        oscs = track['osc']
        dnufits = track['dnufit']

        nu_rows = []
        padding_used = False
        for i in range(n_points):
            nu_row, padded = _radial_mode_row(osckeys[i], oscs[i], dnufits[i], n_min, n_max)
            nu_rows.append(nu_row)
            padding_used = padding_used or padded

        # Explicit reshape keeps the block 2D for single-point and zero-point tracks.
        nu_block = np.array(nu_rows).reshape(n_points, len(nu_headers))

        id_column = np.full(n_points, int(track_id))
        header_columns = [np.asarray(track[header]) for header in headers]
        track_arrays.append(np.column_stack([id_column, *header_columns, nu_block]))

        if padding_used:
            print(f"padding! requested radial order range {n_min}->{n_max} exceeded range present in track{track_id}")

    if not track_arrays:
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(np.vstack(track_arrays), columns=columns)

# Track properties copied into the dataframe alongside the mode frequencies.
headers = [
    'massini', 'zini', 'yini', 'alphaMLT',
    'age', 'TAMS', 'radPhot', 'LPhot',
    'Teff', 'zsur', 'numax', 'dnufit',
]

# Track ids are the group names under 'grid/tracks' with the 'track' text removed
# (leading zeros are kept, as track_df_gen rebuilds the group name from them).
tracks_group = garstec_h5['grid/tracks']
track_ids = [name.replace('track', '') for name in tracks_group.keys()]

garstec_df = track_df_gen(
    garstec_h5,
    track_ids,
    headers=headers,
    n_min=19,
    n_max=21,
)

## check df

In [None]:
# Show the assembled dataframe as this cell's output for a quick visual check.
garstec_df

In [None]:
# Per-column summary statistics, useful for spotting out-of-range or constant columns.
garstec_df.describe()