In [1]:
import pickle
import sys
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append("..")
from data.small_context import get_datasets
from data.metrics import calculate_crps

# Global seaborn theme for the figures in this cell: white grid, default font scale.
sns.set(font_scale=1, style="whitegrid")

# Raw model identifier (as used for keys of the metric dictionaries built
# below) -> display label used on the figure.
name_map = {
    # Baselines
    "gp": "SM-GP",
    "arima": "ARIMA",
    "TCN": "TCN",
    "N-BEATS": "N-BEATS",
    "N-HiTS": "N-HiTS",
    # GPT-3. 'GPT3' is the alias the loading loop gives to
    # 'text-davinci-003-tuned' entries; without it the label lookup applied to
    # the CRPS frame raises KeyError for those entries.
    'text-davinci-003':'GPT-3',
    'GPT3': 'GPT-3',
    # Legacy LLaMA identifiers (label identical to the key)
    'LLaMA7B': 'LLaMA7B',
    'LLaMA13B': 'LLaMA13B',
    'LLaMA30B': 'LLaMA30B',
    'LLaMA70B': 'LLaMA70B',
    # LLaMA result directories
    "llama1_7B": "LLaMA 7B",
    "llama1_13B": "LLaMA 13B",
    "llama1_30B": "LLaMA 30B",
    "llama1_70B": "LLaMA 70B",
    "llama2_7B": "LLaMA-2 7B",
    "llama2_13B": "LLaMA-2 13B",
    "llama2_70B": "LLaMA-2 70B",
    "llama2_7B_chat": "LLaMA-2 7B (chat)",
    "llama2_13B_chat": "LLaMA-2 13B (chat)",
    "llama2_70B_chat": "LLaMA-2 70B (chat)",
}

# Display labels that appear in the bar chart, in plotting order: the five
# baselines first, then the two LLMs. Labels not listed here are not drawn.
hue_order = ['N-BEATS','SM-GP','TCN','N-HiTS','ARIMA']#, 'LLaMA70B']
hue_order += ["LLaMA-2 70B", 'GPT-3']
# To plot every LLaMA variant instead, extend the order with:
# hue_order += [
#     "LLaMA 7B", "LLaMA-2 7B", "LLaMA-2 7B (chat)",
#     "LLaMA 13B", "LLaMA-2 13B", "LLaMA-2 13B (chat)",
#     "LLaMA 30B", "LLaMA 70B", "LLaMA-2 70B", "LLaMA-2 70B (chat)"
# ]
# Per-model metric accumulators. Each list receives one value per dataset for
# which the model has an 'NLL/D' entry, in the iteration order of `datasets`.
nlls = defaultdict(list)
crps = defaultdict(list)
mae = defaultdict(list)
datasets = get_datasets()
for dsname,(train,test) in datasets.items():
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at result files you produced yourself or otherwise trust.
    with open(f'eval/small_context_tuned/{dsname}.pkl','rb') as f:
        data_dict = pickle.load(f)
    for model_name,preds in data_dict.items():
        # The smaller OpenAI models are left out of this figure.
        if model_name in ('ada','babbage','curie'):
            continue
        # Entries without a likelihood are skipped for all three metrics.
        if 'NLL/D' not in preds:
            continue
        nll = preds['NLL/D']
        if model_name=='text-davinci-003-tuned':
            model_name='GPT3'

        # Samples are stored either as a NumPy array or as an object exposing
        # `.values` (presumably a pandas DataFrame -- TODO confirm). Normalise
        # to an array once instead of duplicating the metric code per branch.
        samples = preds['samples']
        if not isinstance(samples, np.ndarray):
            samples = samples.values

        nlls[model_name].append(nll)
        # Only the first 10 entries along the leading axis are scored.
        crps[model_name].append(calculate_crps(test.values,samples[:10],10))
        # Absolute error of the median forecast, normalised by mean |target|.
        tmae = np.abs(test.values-preds['median']).mean()/np.abs(test.values).mean()
        mae[model_name].append(tmae)

# LLaMA result directories to load, grouped by model size.
llama_models = [
    "llama1_7B",
    "llama2_7B",
    "llama2_7B_chat",
    "llama1_13B",
    "llama2_13B",
    "llama2_13B_chat",
    "llama1_30B",
    "llama1_70B",
    "llama2_70B",
    "llama2_70B_chat",
]
for dsname,(train,test) in datasets.items():
    for model_name in llama_models:
        # The two LLaMA-2 70B variants are read from a separate sweep
        # directory whose file name starts with 1.0 instead of 0.4
        # (presumably a sampling hyperparameter -- TODO confirm).
        if model_name in ('llama2_70B', 'llama2_70B_chat'):
            fn = f'eval/llama_70B_sweep_sample/{model_name}/darts-{dsname}/1.0_0.9_0.99_0.3_3_,_.pkl'
        else:
            fn = f'eval/llama_2_results/{model_name}/darts-{dsname}/0.4_0.9_0.99_0.3_3_,_.pkl'
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # result files you trust.
        with open(fn,'rb') as f:
            # Unlike the baseline pickles, each file holds one model's
            # predictions directly (no per-model nesting).
            preds = pickle.load(f)

        # Entries without a likelihood are skipped for all three metrics.
        if 'NLL/D' not in preds:
            continue
        nll = preds['NLL/D']

        # Samples are stored either as a NumPy array or as an object exposing
        # `.values`; normalise once instead of duplicating the metric code.
        samples = preds['samples']
        if not isinstance(samples, np.ndarray):
            samples = samples.values

        nlls[model_name].append(nll)
        # Only the first 10 entries along the leading axis are scored.
        crps[model_name].append(calculate_crps(test.values,samples[:10],10))
        # Absolute error of the median forecast, normalised by mean |target|.
        tmae = np.abs(test.values-preds['median']).mean()/np.abs(test.values).mean()
        mae[model_name].append(tmae)


# Freeze each model's accumulated metric list into a NumPy array.
nlls = {model: np.array(values) for model, values in nlls.items()}
crps = {model: np.array(values) for model, values in crps.items()}
mae = {model: np.array(values) for model, values in mae.items()}

# Quick check: number of scored datasets per model.
print(dict(zip(nlls, map(len, nlls.values()))))

# Dataset labels: the dataset names with the 'Dataset' substring removed.
dataset_keys = [name.replace('Dataset', '') for name in datasets]

# Horizontal bar chart of CRPS per model, aggregated over the datasets.
fig, ax = plt.subplots(1, 1, figsize=(3.5, 2.2))

# Long-format frame: one row per (model, dataset). This requires every model's
# CRPS array to have exactly len(dataset_keys) entries.
dfs = [pd.DataFrame({'Dataset':dataset_keys,'CRPS':v,'Type':k}) for k,v in crps.items()]
df = pd.concat(dfs)

# Raw model identifiers -> display labels.
df['Type'] = df['Type'].apply(lambda x: name_map[x])

# Dark2 colours, reordered, with a custom colour in the second-to-last slot.
palette = sns.color_palette('Dark2', len(hue_order))
palette = palette[:2] + palette[3:-1] + ['#a60355'] + palette[2:3]

sns.barplot(
    order=hue_order,  # only labels listed in hue_order are drawn
    y='Type',
    x='CRPS',
    data=df,
    ax=ax,
    palette=palette,
    errwidth=1,
    errorbar='se',
)

# Grey out the first five bars (the baselines lead hue_order). Slicing rather
# than indexing keeps this safe if fewer than five bars were drawn.
for bar in ax.patches[:5]:
    bar.set_facecolor('#D3D3D3')
    bar.set_edgecolor('grey')
    bar.set_linewidth(0.5)

ax.set_ylabel('')

# No gap between x tick labels and the axis, 9pt labels. The font size is set
# through tick_params instead of set_xticklabels(get_xticklabels()): the latter
# freezes the label strings, which then go stale when set_xlim below changes
# the tick locations.
ax.tick_params(axis='x', which='major', pad=0, labelsize=9)

ax.set_xlabel('CRPS', fontsize=14)
ax.set_xlim((0, 0.2))

# Drop any legend seaborn attached, then rebuild it outside the axes only if
# there are labelled artists; this avoids an empty legend and its warning.
existing_legend = ax.get_legend()
if existing_legend is not None:
    existing_legend.remove()
handles, labels = ax.get_legend_handles_labels()
if handles:
    ax.legend(
        handles=handles,
        labels=labels,
        markerscale=1.5,
        bbox_to_anchor=(1.05, 1),
        loc='upper left',
        borderaxespad=0.3,
    )

plt.tight_layout()
plt.savefig('small_DARTS_CRPS.png', dpi=300, bbox_inches='tight')
plt.savefig('small_DARTS_CRPS.pdf', bbox_inches='tight')
plt.show()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


FileNotFoundError: [Errno 2] No such file or directory: 'eval/small_context_tuned/AirPassengersDataset.pkl'

In [None]:
# Write the current `df` to CSV without the index column.
df.to_csv(path_or_buf="crps_top_fig.csv", index=False)

In [None]:
# Global seaborn theme for the figure in this cell: white grid, default font scale.
sns.set(font_scale=1, style="whitegrid")

# OpenAI models read from the per-dataset result pickles.
old_names = ['ada', 'babbage', 'curie', 'text-davinci-003']

# Raw model identifier -> display label for this cell (replaces the earlier
# name_map). OpenAI entries are paired positionally with old_names.
name_map = dict(zip(old_names, ("Ada", "Babbage", "Curie", "Davinci")))
name_map.update(
    {f"llama1_{size}": f"LLaMA {size}" for size in ("7B", "13B", "30B", "70B")}
)
name_map.update(
    {f"llama2_{size}": f"LLaMA-2 {size}" for size in ("7B", "13B", "70B")}
)
name_map.update(
    {f"llama2_{size}_chat": f"LLaMA-2 {size} (chat)" for size in ("7B", "13B", "70B")}
)

# Only its length is used below, to size the colour palette.
hue_order = ['SM-GP', 'N-BEATS', 'TCN', 'N-HiTS', 'ARIMA', 'GPT-3']
# Fresh per-model metric accumulators for this cell. Each list receives one
# value per dataset for which the model has an 'NLL/D' entry.
nlls = defaultdict(list)
crps = defaultdict(list)
mae = defaultdict(list)
datasets = get_datasets()
for dsname,(train,test) in datasets.items():
    # NOTE(security): pickle.load can execute arbitrary code; only load result
    # files you trust.
    with open(f'eval/small_context_tuned/{dsname}.pkl','rb') as f:
        data_dict = pickle.load(f)

    # Only the OpenAI models listed in old_names are read from these pickles.
    for model_name in old_names:

        preds = data_dict[model_name]

        # Entries without a likelihood are skipped for all three metrics.
        if 'NLL/D' not in preds:
            continue
        nll = preds['NLL/D']

        # Samples are stored either as a NumPy array or as an object exposing
        # `.values`; normalise once instead of duplicating the metric code.
        samples = preds['samples']
        if not isinstance(samples, np.ndarray):
            samples = samples.values

        nlls[model_name].append(nll)
        # Only the first 10 entries along the leading axis are scored.
        crps[model_name].append(calculate_crps(test.values,samples[:10],10))
        # Absolute error of the median forecast, normalised by mean |target|.
        tmae = np.abs(test.values-preds['median']).mean()/np.abs(test.values).mean()
        mae[model_name].append(tmae)

# LLaMA result directories to load, grouped by model size.
llama_models = [
    "llama1_7B",
    "llama2_7B",
    "llama2_7B_chat",
    "llama1_13B",
    "llama2_13B",
    "llama2_13B_chat",
    "llama1_30B",
    "llama1_70B",
    "llama2_70B",
    "llama2_70B_chat",
]
for dsname,(train,test) in datasets.items():
    for model_name in llama_models:
        # The two LLaMA-2 70B variants are read from a separate sweep
        # directory whose file name starts with 1.0 instead of 0.4
        # (presumably a sampling hyperparameter -- TODO confirm).
        if model_name in ('llama2_70B', 'llama2_70B_chat'):
            fn = f'eval/llama_70B_sweep_sample/{model_name}/darts-{dsname}/1.0_0.9_0.99_0.3_3_,_.pkl'
        else:
            fn = f'eval/llama_2_results/{model_name}/darts-{dsname}/0.4_0.9_0.99_0.3_3_,_.pkl'
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # result files you trust.
        with open(fn,'rb') as f:
            # Each file holds one model's predictions directly (no per-model
            # nesting).
            preds = pickle.load(f)

        # Entries without a likelihood are skipped for all three metrics.
        if 'NLL/D' not in preds:
            continue
        nll = preds['NLL/D']

        # Samples are stored either as a NumPy array or as an object exposing
        # `.values`; normalise once instead of duplicating the metric code.
        samples = preds['samples']
        if not isinstance(samples, np.ndarray):
            samples = samples.values

        nlls[model_name].append(nll)
        # Only the first 10 entries along the leading axis are scored.
        crps[model_name].append(calculate_crps(test.values,samples[:10],10))
        # Absolute error of the median forecast, normalised by mean |target|.
        tmae = np.abs(test.values-preds['median']).mean()/np.abs(test.values).mean()
        mae[model_name].append(tmae)

# Freeze each model's accumulated metric list into a NumPy array.
nlls = {model: np.array(values) for model, values in nlls.items()}
crps = {model: np.array(values) for model, values in crps.items()}
mae = {model: np.array(values) for model, values in mae.items()}

# Dataset labels: the dataset names with the 'Dataset' substring removed.
dataset_keys = [name.replace('Dataset', '') for name in datasets]

# Dark2 colours with the third entry moved to the end.
palette = sns.color_palette('Dark2', len(hue_order))
palette = [*palette[:2], *palette[3:], *palette[2:3]]

# MMLU accuracy per raw model identifier; these become the 'MMLU Accuracy'
# column of the summary frame. The chat variants have no entry here.
mmlu_numbers = {}

# OpenAI
mmlu_numbers.update({
    'ada': 0.238,
    'babbage': 0.235,
    'curie': 0.237,
    'text-davinci-003': 0.569,
})

# Cohere
mmlu_numbers.update({
    'cohere-medium': 0.279,
    'cohere-base-light': 0.264,
    'cohere-base': 0.324,
    'cohere-command-nightly': 0.452,
    'cohere-command-light': 0.264,
    'cohere-command-light-nightly': 0.264,
})

# Forefront
mmlu_numbers.update({
    'forefront-gpt-j-6b-vanilla': 0.249,
    'forefront-gpt-neox-20b-vanilla': 0.24,
})

# Aleph Alpha
mmlu_numbers.update({
    "alephalpha-luminous-extended": 0.321,
    "alephalpha-luminous-supreme": 0.452,
})

# LLaMA / LLaMA-2 base models
mmlu_numbers.update({
    'llama1_7B': 0.351,
    'llama2_7B': 0.453,
    'llama1_13B': 0.469,
    'llama2_13B': 0.548,
    'llama1_30B': 0.578,
    'llama1_70B': 0.634,
    'llama2_70B': 0.689,
})

# One summary row per non-chat model: each metric averaged over that model's
# datasets, alongside the model's MMLU accuracy.
records = []
for k, nll in nlls.items():
    # Chat variants are excluded from the comparison.
    if "chat" in k:
        continue
    print(k)
    _crps, _mae = crps[k], mae[k]
    records.append({
        'MAE': np.mean(_mae),
        'CRPS': np.mean(_crps),
        'NLL/D': np.mean(nll),
        'Type': k,
        'MMLU Accuracy': mmlu_numbers[k],
    })
df = pd.DataFrame(records)


# Host prefix (the text before the first '-' of a hostified label) -> host
# display name.
host_name_map = dict(
    openai='OpenAI',
    cohere='Cohere',
    forefront='Forefront',
    alephalpha='Aleph Alpha',
    LLaMA='LLaMA',
)

def hostify(x):
    """Rewrite a display label so that its text before the first '-' names the host.

    OpenAI labels ('Ada', 'Babbage', 'Curie', 'Davinci') get an 'openai-'
    prefix. In labels containing 'LLaMA', every 'LLaMA' becomes 'LLaMA-'
    (so 'LLaMA 7B' -> 'LLaMA- 7B' and 'LLaMA-2 7B' -> 'LLaMA--2 7B').
    Any other label is returned unchanged.
    """
    if x in ('Ada', 'Babbage', 'Curie', 'Davinci'):
        return f'openai-{x}'
    return x.replace('LLaMA', 'LLaMA-') if 'LLaMA' in x else x

# Raw identifier -> display label -> host-prefixed label, in a single pass.
df['Type'] = df['Type'].apply(lambda raw: hostify(name_map[raw]))
# The host is whatever precedes the first '-' of the hostified label.
df['host'] = df['Type'].apply(lambda label: host_name_map[label.partition("-")[0]])

# fig, ax = plt.subplots(1, 1, figsize=(3.5, 2.2))

# Scatter of mean CRPS against MMLU accuracy with a first-order fit.
fig, ax = plt.subplots(1, 1, figsize=(3.5, 2), gridspec_kw = {'wspace': 0.6})

# regplot draws the first-order fit; its own scatter points are white, and the
# visible black points come from the scatterplot call below.
sns.regplot(
    data=df,
    x='MMLU Accuracy',
    y='CRPS',
    order=1,
    ax=ax,
    scatter_kws={'color':'white'},
)

sns.scatterplot(
    data=df,
    x='MMLU Accuracy',
    y='CRPS',
    color='black',
    ax=ax,
)

# Remove the padding between tick labels and both axes.
ax.tick_params(axis='both', which='major', pad=0)

# Keep the automatically assigned axis labels, only enlarging the font.
ax.set_xlabel(ax.get_xlabel(), fontsize=16)
ax.set_ylabel(ax.get_ylabel(), fontsize=16)

# Apply the layout before saving so the files on disk match what is shown;
# previously tight_layout ran only after both savefig calls.
plt.tight_layout()
plt.savefig('small_crps_vs_mmlu.png', bbox_inches='tight', dpi=300)
plt.savefig('small_crps_vs_mmlu.svg', bbox_inches='tight')
plt.show()


In [None]:
# Write the current `df` to CSV without the index column.
df.to_csv(path_or_buf="mmlu_top_fig.csv", index=False)