In [6]:
import pandas as pd
from pathlib import Path
from collections import defaultdict
import re
import numpy as np
# HOME = Path.cwd().parent
HOME = Path('.')
workspace = HOME / 'workspaces'

In [7]:
# Helper functions
def format_scores(scores, show_max_min: bool = True):
    mean = np.nanmean(scores)
    std = np.std(scores)
    if show_max_min:
        min_ = np.min(scores)
        max_ = np.max(scores)
        return f'{mean:.0f}±{std:.0f}'
    else:
        return f'{mean:.0f}±{std:.0f}'

def format_df(res_dict, agg_fn):
    new_res = {
        k: {
            key: agg_fn(val) for key, val in v.items()
        } for k, v in res_dict.items()
    }
    return pd.DataFrame(new_res)

def extract_scores(log_filename):
    with open(log_filename, 'r') as f:
        last_line = f.read().splitlines()[-1]
    scores = {}
    try:
        rmse, mae, mape = re.findall(r'[^:]+: (\d+\.\d+)', last_line)
    except:
        print(log_filename, last_line)
        raise
    scores['RMSE'] = float(rmse)
    scores['MAE'] = float(mae)
    scores['MAPE'] = float(mape) * 100
    return scores

In [13]:
#workspace
datasets_to_use = ['clo','cruh','crush','snl','matr_1', 'matr_2', 'hust', 'mix']
sklearn_baseline_names = [
    'dummy',
    'variance_model',
    'discharge_model',
    'full_model',
    'ridge',
    'pcr',
    'plsr',
    'gpr',
    'xgb',
    'rf',
]
nn_baseline_names = ['mlp', 'cnn', 'lstm', 'transformer']

def collect_results(dataset):
    sklearn_results = [
        sklearn_baselines[dataset][name] for name in sklearn_baseline_names
    ]
    nn_results = [
        nn_baselines[dataset][name] for name in nn_baseline_names
    ]
    # our_results = [ours[dataset]['Ours']]
    return sklearn_results + nn_results
    # return sklearn_results + nn_results + our_results

sklearn_baselines = defaultdict(dict)
for method_path in workspace.glob('baselines/sklearn/*'):
    method = method_path.name
    for dataset_res in method_path.glob('*'):
        dataset = dataset_res.name
        score = extract_scores(dataset_res / 'log.0')['RMSE']
        sklearn_baselines[dataset][method] = f'{int(score)}'




nn_baselines = defaultdict(dict)
for method_path in workspace.glob('baselines/nn_models/*'):
    method = method_path.name
    for dataset_res in method_path.glob('*'):
        dataset = dataset_res.name
        scores = []
        for i in range(10):
            try:
                score = extract_scores(dataset_res / f'log.{i}')['RMSE']
            except:
                score = None
            scores.append(score)
        try:
            nn_baselines[dataset][method] = format_scores(scores)
        except:
            nn_baselines[dataset][method] = None




In [14]:
# new
main_table_df = pd.DataFrame({
    dataset: collect_results(dataset) for dataset in datasets_to_use
}, index=sklearn_baseline_names+nn_baseline_names)


main_table_df = main_table_df.rename(columns={
    'snl': 'SNL',
    'clo': 'CLO',
    'hust': 'HUST',
    'matr_1': 'MATR-1',
    'matr_2': 'MATR-2',
    'cruh': 'CRUH',
    'crush': 'CRUSH',
    'mix': 'MIX'
}, index={
    'dummy': 'Training Mean',
    'variance_model': '``Variance\'\' Model',
    'discharge_model': '``Discharge\'\' Model',
    'full_model': '``Full\'\' Model',
    'ridge': 'Ridge Regression',
    'pcr': 'PCR',
    'plsr': 'PLSR',
    'gpr': 'Guassian process',
    'xgb': 'XGBoost',
    'rf': 'Random Forest',
    'mlp': 'MLP',
    'cnn': 'CNN',
    'lstm': 'LSTM',
    'transformer': 'Transformer',
})[['MATR-1', 'MATR-2', 'HUST','SNL', 'CLO',  'CRUH', 'CRUSH', 'MIX']]
main_table_df

Unnamed: 0,MATR-1,MATR-2,HUST,SNL,CLO,CRUH,CRUSH,MIX
Training Mean,398,510,419,466,331,239,576,573
``Variance'' Model,136,211,398,360,179,118,506,521
``Discharge'' Model,329,148,321,266,143,76,13514,1743
``Full'' Model,166,1074,335,433,138,93,1051,330
Ridge Regression,115,183,54194,242,169,65,39804073,372
PCR,123,197,434,199,197,68,559,376
PLSR,103,180,431,241,176,60,535,383
Guassian process,153,223,54202,250,203,115,42861399732314,573
XGBoost,333,798,394,547,214,119,330,205
Random Forest,164,231,358,558,192,81,416,197
