In [45]:
import os
import json
import pandas as pd

def collect_data(main_folder):
    """Collect the best train/dev losses and key hyperparameters of every run in ``main_folder``.

    Every immediate subfolder that contains both ``progress.csv`` and
    ``params.json`` contributes one record. Subfolders missing either file are
    skipped silently. Subfolders whose files cannot be processed (unreadable
    file, missing loss column, no valid loss value, ...) are reported with a
    printed message and skipped.

    Args:
        main_folder: Path of the directory whose subfolders are scanned.

    Returns:
        list[dict]: One dict per processed run, in sorted subfolder-name order.
        Keys, in insertion order:
          * ``master_neurons``, ``store_embedding_neurons``, ``context_neurons``
            -- each only when ``params.json`` has the matching key
            (``master_neurons``, ``context_store``, ``context``);
          * ``learning_rate`` (``None`` when absent from ``params.json``);
          * ``best_train_loss`` and ``dev_loss(at best_train)``;
          * ``best_dev_loss`` and ``train_loss(at best_dev)``.

    Raises:
        OSError: If ``main_folder`` itself cannot be listed.
    """
    # params.json key -> column name used in the collected record.
    param_columns = {
        'master_neurons': 'master_neurons',
        'context_store': 'store_embedding_neurons',
        'context': 'context_neurons',
    }
    results = []

    # Sorted so the record order does not depend on the filesystem's listing order.
    for subfolder in sorted(os.listdir(main_folder)):
        subfolder_path = os.path.join(main_folder, subfolder)
        progress_file = os.path.join(subfolder_path, 'progress.csv')
        params_file = os.path.join(subfolder_path, 'params.json')

        # Only subfolders that hold both files describe a run.
        if not (os.path.exists(progress_file) and os.path.exists(params_file)):
            continue

        try:
            data = pd.read_csv(progress_file)
            with open(params_file, 'r') as file:
                params = json.load(file)

            result = {
                column: params[key]
                for key, column in param_columns.items()
                if key in params
            }
            result['learning_rate'] = params.get('learning_rate')

            # Missing losses stay NaN: min() skips them, whereas replacing them
            # with 0 would make 0 the reported "best" loss for positive losses.
            best_train = data['train_loss'].min()
            result['best_train_loss'] = best_train
            # First row reaching the best train loss; raises (and is reported
            # below) when the column holds no valid value at all.
            result['dev_loss(at best_train)'] = (
                data.loc[data['train_loss'] == best_train, 'dev_loss'].iloc[0]
            )

            best_dev = data['dev_loss'].min()
            result['best_dev_loss'] = best_dev
            result['train_loss(at best_dev)'] = (
                data.loc[data['dev_loss'] == best_dev, 'train_loss'].iloc[0]
            )

            results.append(result)
        except Exception as e:
            # Best effort: one broken run must not abort the whole scan.
            print(f"Error processing files in {subfolder_path}: {e}")

    return results

def create_results_table(main_folder, remove_dup_hyperparams = False):
    """Build a table of per-run best losses, sorted by hyperparameters.

    The records come from ``collect_data(main_folder)``. Rows are sorted
    ascending by whichever of ``master_neurons``, ``store_embedding_neurons``,
    ``context_neurons`` and ``learning_rate`` are present, with
    ``best_dev_loss`` as the final tie-breaker.

    Args:
        main_folder: Directory whose run subfolders are scanned.
        remove_dup_hyperparams: When truthy, keep a single row per
            hyperparameter combination -- the one with the lowest
            ``best_dev_loss``.

    Returns:
        pandas.DataFrame | None: The sorted table (original row labels are
        kept), or ``None`` after printing a message when no run was collected.

    Side effects:
        Sets the global pandas option ``display.float_format`` to five decimals.
    """
    data = collect_data(main_folder)
    if not data:
        print("No data collected. Check the contents of your directories.")
        return None

    df = pd.DataFrame(data)

    columns_to_sort = ['master_neurons', 'store_embedding_neurons', 'context_neurons', 'learning_rate']
    existing_columns = [col for col in columns_to_sort if col in df.columns]

    # Tie-break on best_dev_loss so that, within one hyperparameter
    # combination, the best run comes first; keep='first' below then retains
    # the best run rather than whichever one happened to be listed first.
    sort_columns = existing_columns + [col for col in ['best_dev_loss'] if col in df.columns]
    sorted_result_df = df.sort_values(by=sort_columns, ascending=True)

    if remove_dup_hyperparams:
        sorted_result_df = sorted_result_df.drop_duplicates(subset=existing_columns, keep='first')

    # Global display option: affects every float printed by pandas afterwards.
    pd.set_option('display.float_format', lambda x: '%.5f' % x)
    return sorted_result_df


In [33]:
# One row per hyperparameter combination for the run folder below.
df = create_results_table(
    '/user/ml4723/Prj/NIC/ray_results_real_symmetry_GNN_real_data/run_2024-07-18_05-16-26',
    remove_dup_hyperparams=True,
)
# Keep only runs whose best dev or best train loss is -250 or lower.
df = df.loc[(df['best_dev_loss'] <= -250) | (df['best_train_loss'] <= -250)]
print(df.to_string(index=False))

 store_embedding_neurons  context_neurons  learning_rate  best_train_loss  dev_loss(at best_train)  best_dev_loss  train_loss(at best_dev)
                      16               16        0.00010       -371.07332               -318.13960     -320.26870               -370.46277
                      16               16        0.00100       -363.71438               -303.73568     -322.61008               -361.45815
                      16               16        0.00100       -371.31272               -324.28394     -325.36754               -368.90999
                      16               32        0.00010       -380.89409               -332.31020     -333.39084               -380.47350
                      16               64        0.00100       -388.41780               -340.98204     -343.72932               -385.36803
                      16              128        0.01000       -309.59894               -286.68678     -286.68678               -309.59894
                      16   

In [47]:
# One row per hyperparameter combination for the run folder below.
df = create_results_table(
    '/user/ml4723/Prj/NIC/ray_results_real_data_driven_net_real_data/run_2024-07-16_21-40-08',
    remove_dup_hyperparams=True,
)
# Keep only runs whose best dev or best train loss is -250 or lower.
df = df.loc[(df['best_dev_loss'] <= -250) | (df['best_train_loss'] <= -250)]
print(df.to_string(index=False))

 master_neurons  learning_rate  best_train_loss  dev_loss(at best_train)  best_dev_loss  train_loss(at best_dev)
             32        0.00010       -326.09080               -294.21318     -294.43363               -325.91324
             32        0.00100       -399.60241               -356.28404     -358.72971               -397.63554
             32        0.01000       -364.92925               -327.85471     -330.94319               -360.64344
             64        0.00010       -366.55641               -327.64441     -328.10669               -365.93256
             64        0.00100       -385.88158               -336.90190     -342.26485               -381.54177
             64        0.01000       -396.50087               -347.48151     -356.73791               -395.11970
            128        0.00010       -400.67679               -354.28443     -356.00071               -397.28431
            128        0.00100       -411.46534               -340.93867     -352.29748         

In [60]:
# Full (non-deduplicated) result tables, one per run folder.
GNN_df = create_results_table(
    '/user/ml4723/Prj/NIC/ray_results/real/GNN/run_2024-07-18_05-16-26',
    remove_dup_hyperparams=False,
)
CTX_df = create_results_table(
    '/user/ml4723/Prj/NIC/ray_results/real/ctx/run_2024-07-16_21-40-19',
    remove_dup_hyperparams=False,
)
Vanilla_df = create_results_table(
    '/user/ml4723/Prj/NIC/ray_results/real/vanilla/run_2024-07-16_21-40-08',
    remove_dup_hyperparams=False,
)
BEN_df = create_results_table(
    '/user/ml4723/Prj/NIC/ray_results/real/bench/run_2024-07-24_21-10-47',
    remove_dup_hyperparams=False,
)


In [61]:
def _best_train_row(results_df, label, dropped_columns):
    """Return a one-row frame for the run with the lowest ``best_train_loss``.

    ``dropped_columns`` are removed from that run's row and a leading ``type``
    column holding ``label`` is inserted.
    """
    best_run = results_df.loc[results_df['best_train_loss'].idxmin()]
    row = pd.DataFrame(best_run.drop(dropped_columns)).T
    row.insert(0, 'type', label)
    return row


gnn_df = _best_train_row(GNN_df, 'GNN', ['store_embedding_neurons', 'context_neurons', 'learning_rate'])
ctx_df = _best_train_row(CTX_df, 'Context', ['learning_rate'])
van_df = _best_train_row(Vanilla_df, 'Vanilla', ['learning_rate', 'master_neurons'])
ben_df = _best_train_row(BEN_df, 'Benchmark', ['learning_rate'])

# Hard-coded reference row; the two cross-loss columns are left empty (None).
oracle_df = pd.DataFrame({
    'type': ['Oracle'],
    'best_train_loss': [-498.58855],
    'dev_loss(at best_train)': [None],
    'best_dev_loss': [-460.50378],
    'train_loss(at best_dev)': [None],
})

# Stack the reference row on top of the best run of each table and print it.
print(pd.concat([oracle_df, gnn_df, ctx_df, van_df, ben_df]).to_string(index=False))

     type  best_train_loss dev_loss(at best_train)  best_dev_loss train_loss(at best_dev)
   Oracle       -498.58855                    None     -460.50378                    None
      GNN       -396.31411              -319.20072     -352.66954              -382.75000
  Context       -423.62699              -321.39473     -351.54730              -405.59714
  Vanilla       -429.19212              -317.88645     -356.51181              -396.97424
Benchmark       -325.10248              -284.37065     -284.37065              -325.10248
