### Memorization

In [None]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict

import sys
sys.path.append("..")
from data.small_context import get_datasets
from data.metrics import calculate_crps
from data.small_context import get_memorization_datasets

# Memorization-study datasets; iterated below as name -> (train, test).
datasets = get_memorization_datasets()

# Metric tables indexed as metric[model_name][dataset_name].
nlls, mae = defaultdict(dict), defaultdict(dict)

# Keys found in the result pickles -> display names used in the figure.
name_map = dict([
    ("gp", "SM-GP"),
    ("arima", "ARIMA"),
    ("TCN", "TCN"),
    ("N-HiTS", "N-HiTS"),
    ("N-BEATS", "N-BEATS"),
    ("text-davinci-003", "GPT-3"),
])

# Order in which the models appear in the bar charts.
hue_order = ['SM-GP', 'N-BEATS', 'TCN', 'N-HiTS', 'ARIMA', 'GPT-3']

# Dark2 colours with the third entry moved to the end, so the last hue
# (GPT-3) gets the same colour as the GPT-3 median line in the forecast panels.
dark2 = sns.color_palette('Dark2', len(hue_order))
palette = [*dark2[:2], *dark2[3:], dark2[2]]

# Collect, for every memorization dataset and every model in its result file,
# the reported NLL/D and the MAE of the median forecast normalised by the mean
# absolute test value.
# NOTE: pickle.load can execute arbitrary code; only load result files that
# were produced locally.
for dsname, (train, test) in datasets.items():
    with open(f'../eval/memorization/{dsname}.pkl', 'rb') as f:
        data_dict = pickle.load(f)

    test_values = test.values if isinstance(test, pd.Series) else test

    for model_name, preds in data_dict.items():
        print(model_name)
        # Best effort: a malformed entry is reported and skipped.  Both
        # metrics are computed before either table is touched, so `nlls` and
        # `mae` always contain the same (model, dataset) keys.
        try:
            if 'NLL/D' not in preds:
                continue
            nll = preds['NLL/D']
            median = preds['median']
            if isinstance(median, np.ndarray):
                median = pd.Series(median)
            tmae = np.abs(test_values - median.values).mean() / np.abs(test_values).mean()
        except Exception as e:
            print(f'skipping {model_name} on {dsname}: {e}')
            continue

        if model_name == 'text-davinci-003-tuned':
            model_name = 'GPT3'
        nlls[model_name][dsname] = nll
        mae[model_name][dsname] = tmae

# One row per (model, dataset) pair.
data = [
    {
        "model_name": model_name,
        "dataset_name": dsname,
        "nll": nll,
        "mae": mae[model_name][dsname],
    }
    for model_name, per_dataset in nlls.items()
    for dsname, nll in per_dataset.items()
]

# Explicit columns keep the frame usable even when no result was collected.
result_df = pd.DataFrame(data, columns=["model_name", "dataset_name", "nll", "mae"])

result_df['NLL/D'] = result_df['nll']
# Models without a display name (e.g. the renamed 'GPT3') keep their raw key
# instead of raising KeyError.
result_df['model_name'] = result_df['model_name'].map(lambda x: name_map.get(x, x))

n_cols = 11
n_rows = 1
import seaborn as sns
sns.set(style="whitegrid", font_scale=1.3)
# One row of [forecast, gap, NLL bars, gap] groups, one group per dataset; the
# last group has no trailing gap, hence 3 * 4 - 1 = 11 columns.
fig, axes = plt.subplots(
    n_rows, n_cols,
    figsize=(20, 4), constrained_layout=True,
    gridspec_kw={'width_ratios': [
        4, 0.1, 1, 0.4,
        4, 0.1, 1, 0.4,
        4, 0.1, 1]}
)
axes = axes.flatten()

# Dataset keys -> panel titles; unmapped keys are shown as-is.
dsname_map = {
    'IstanbulTraffic': 'Istanbul Traffic',
    'TSMCStock': 'TSMC Stock',
    'TurkeyPower': 'Turkey Power',
}

result_df['ll'] = -1 * result_df['nll']

groups_per_row = 3
# Colour of the GPT-3 median line (third Dark2 colour); loop-invariant.
pred_color = sns.color_palette('Dark2')[2]

ax_idx = 0
# Iterate through the datasets and plot the samples
for idx, dsname in enumerate(datasets.keys()):
    train, test = datasets[dsname]
    print(dsname, ax_idx)
    last_in_row = idx % groups_per_row == groups_per_row - 1

    # Reserve every slot of this dataset's group up front, so that skipping
    # the dataset below cannot shift later datasets into the spacer columns.
    forecast_ax = axes[ax_idx]
    gap_ax = axes[ax_idx + 1]
    bar_ax = axes[ax_idx + 2]
    ax_idx += 3
    gap_ax.set_axis_off()
    if not last_in_row:
        axes[ax_idx].set_axis_off()
        ax_idx += 1

    # NOTE(review): the aggregation loop earlier in this cell reads
    # '../eval/memorization/...' while this loop reads 'eval/memorization/...';
    # confirm which directory is intended.
    with open(f'eval/memorization/{dsname}.pkl', 'rb') as f:
        data_dict = pickle.load(f)
    if 'text-davinci-003' not in data_dict:
        print('GPT-3 not available')
        continue

    # 10%-90% quantile band across the GPT-3 samples
    # (presumably one row per sample -- confirm against the result files).
    samples = data_dict['text-davinci-003']['samples']
    lower = samples.quantile(0.1, axis=0)
    upper = samples.quantile(0.9, axis=0)

    forecast_ax.plot(pd.concat([train, test]), color='k', label='Ground Truth', linewidth=2)
    forecast_ax.fill_between(samples.iloc[0].index, lower, upper, alpha=0.5)
    forecast_ax.plot(data_dict['text-davinci-003']['median'], color=pred_color, label='GPT-3 Median', linewidth=2)
    forecast_ax.set_title(dsname_map.get(dsname, dsname), fontsize=20)
    forecast_ax.get_yaxis().set_visible(False)
    forecast_ax.get_xaxis().set_visible(False)

    sns.barplot(
        x='dataset_name',
        y='NLL/D',
        hue='model_name',
        hue_order=hue_order,
        data=result_df[result_df['dataset_name'] == dsname],
        ax=bar_ax,
        palette=palette,
    )
    # The shared legend is drawn in a separate figure; guard against seaborn
    # not having created a per-axes legend.
    legend = bar_ax.get_legend()
    if legend is not None:
        legend.remove()
    if last_in_row:
        bar_ax.set_ylabel("NLL/D", rotation=-90, labelpad=15, fontsize=18)
    else:
        bar_ax.set_ylabel("")
    bar_ax.yaxis.set_label_position("right")
    bar_ax.yaxis.tick_right()

    bar_ax.spines['left'].set_color('black')
    # Black downward triangle at the lower-left corner of the bar axes.
    bar_ax.plot(0, 0, 'vk', transform=bar_ax.transAxes, clip_on=False, zorder=10)
    bar_ax.margins(x=0.2)
    bar_ax.get_xaxis().set_visible(False)

# Legend entries: the two forecast lines from the first panel followed by the
# bar-chart models in plotting order.
handles, labels = axes[0].get_legend_handles_labels()
handles2, labels2 = axes[2].get_legend_handles_labels()
handles += handles2
labels += list(hue_order)

# NOTE(review): the figure was created with constrained_layout=True, which
# matplotlib documents as incompatible with subplots_adjust -- confirm that
# this call still has the intended effect.
plt.subplots_adjust(wspace=0, hspace=.5)

plt.savefig('memorization.pdf', dpi=300, bbox_inches='tight')
plt.savefig('memorization.png', dpi=300, bbox_inches='tight')
plt.show()


# Stand-alone legend figure: an otherwise empty axes that only carries the
# handles/labels gathered from the memorization figure.
legend_style = dict(markerscale=1.5, loc='upper left', fontsize=20, ncol=4)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(2, 2))
ax.legend(handles=handles, labels=labels, **legend_style)
ax.axis("off")

fig.savefig('memorization_legend.pdf', bbox_inches='tight')
plt.show()

### Monash samples

In [None]:
from real_benchmarks import get_benchmark_test_sets

benchmarks = get_benchmark_test_sets()

# Replace every (series, scaler) entry by a shuffled copy of its series; the
# scaler is dropped.  The RNG is re-seeded for each benchmark, so the
# permutation of one benchmark does not depend on the others.
for name in list(benchmarks):
    series, _unused_scaler = benchmarks[name]
    np.random.seed(0)
    benchmarks[name] = np.random.permutation(series)
    
# Rank the benchmark datasets by how far the best published method beats the
# last-value baseline, and keep the ten with the lowest normalised MAE.
df = pd.read_csv('eval/last_value_results.csv')

df_paper = pd.read_csv('eval/paper_mae_raw.csv')  # pdf text -> csv
name_map = {
    'Aus. Electricity Demand': 'australian_electricity_demand',
    'Kaggle Weekly': 'kaggle_web_traffic_weekly',
    'FRED-MD': 'fred_md',
    'Saugeen River Flow': 'saugeenday',
}
# Map paper display names to benchmark keys, then lower-case the result and
# replace spaces with underscores.
datasets = [name_map.get(d, d).lower().replace(' ', '_') for d in df_paper['Dataset']]
df_paper['Dataset'] = datasets

# Keep only the datasets for which a last-value result exists.
df_paper = df_paper[df_paper['Dataset'].isin(df['dataset'])].reset_index(drop=True)

# Attach the last-value MAE (first matching row of `df`) to every dataset.
last_value_mae = df.drop_duplicates(subset='dataset').set_index('dataset')['mae']
df_paper['Last Value'] = df_paper['Dataset'].map(last_value_mae)

# Method columns are fixed by name once, so helper columns added further down
# can never leak into the per-method statistics.
method_cols = [c for c in df_paper.columns if c not in ('Dataset', 'Last Value')]

# '-' marks a missing result in the paper table.
df_paper = df_paper.replace('-', np.nan)
for column in method_cols + ['Last Value']:
    df_paper[column] = df_paper[column].astype(float)
df_paper.to_csv('eval/paper_mae.csv', index=False)

# Normalise every method by the last-value MAE of the same dataset.
df_paper[method_cols] = df_paper[method_cols].div(df_paper['Last Value'], axis=0)

# Both summaries use the normalised method columns only; the raw
# 'Last Value' column is excluded from the median as well as the minimum.
df_paper['normalized_min'] = df_paper[method_cols].min(axis=1)
df_paper['normalized_median'] = df_paper[method_cols].median(axis=1)
df_paper = df_paper.sort_values(by='normalized_min').reset_index(drop=True)
df_paper.to_csv('eval/paper_mae_normalized.csv', index=False)

predictable_datasets = df_paper.head(10)['Dataset']
datasets = {k: benchmarks[k] for k in predictable_datasets}

In [None]:
def plot_predictions(inputs, targets, medians, ax):
    """Draw one series' history, its true continuation and the median forecast.

    ``inputs`` is placed at x = 0 .. len(inputs) - 1 and ``targets`` directly
    after it; ``medians`` is drawn over the same x positions as ``targets``.

    Args:
        inputs: history values of a single series.
        targets: ground-truth values following the history.
        medians: median forecast, one value per target.
        ax: axes-like object to draw on.
    """
    n_inputs, n_targets = len(inputs), len(targets)
    train = pd.Series(inputs, index=range(n_inputs))
    test = pd.Series(targets, index=range(n_inputs, n_inputs + n_targets))
    medians = pd.Series(medians, index=test.index)

    # 'C1' / 'C4' are resolved against the active colour cycle.
    ax.plot(pd.concat([train, test]), color='C1', label='Ground Truth', linewidth=1)
    ax.plot(medians, color='C4', label='GPT-3 Median', linewidth=1.5)

    # Hide all ticks and tick labels and grey out the frame.
    ax.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False, labelbottom=False, labelleft=False)
    for spine in ax.spines.values():
        spine.set_edgecolor('grey')

    

def plot_pred_dataset(dataset, pred_dict, max_series=None, max_num_samples=None, show_median=True, title=None):
    """Plot the first series of *dataset* next to their median forecasts.

    One panel is drawn per series, side by side in a new figure, using
    ``plot_predictions``.

    Args:
        dataset: sequence of ``(history, target)`` pairs.
        pred_dict: result dict; ``pred_dict['preds']``,
            ``pred_dict['medians']`` and
            ``pred_dict['info']['hyper'].max_history`` are read.
        max_series: maximum number of series to plot; ``None`` plots one
            panel per entry of ``pred_dict['preds']``.
        max_num_samples: accepted for backward compatibility; currently unused.
        show_median: accepted for backward compatibility; currently unused.
        title: optional figure title.
    """
    hyper = pred_dict['info']['hyper']
    preds = pred_dict['preds']
    medians = pred_dict['medians']

    n_series = len(preds) if max_series is None else min(max_series, len(preds))
    if n_series <= 0:
        return

    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(1, n_series, figsize=(5.5 * n_series, 4), squeeze=False)
    for xy, median, ax in zip(dataset[:n_series], medians[:n_series], axes[0]):
        # Only the last `hyper.max_history` history points are shown.
        plot_predictions(xy[0][-hyper.max_history:], xy[1], median, ax)
    if title is not None:
        fig.suptitle(title)
    

In [None]:
sns.set_palette("Set1")

D = len(datasets)  # number of datasets
N = 4  # number of series per dataset
np.random.seed(99)  # fixes which series are drawn

# squeeze=False keeps `axes` two-dimensional even when D == 1.
fig, axes = plt.subplots(D, N, figsize=(5.5*N, 4*D), dpi=100, squeeze=False)

# One row per dataset, up to N randomly chosen series per row.
# NOTE: pickle.load can execute arbitrary code; only load local result files.
for row, (ds_name, dataset) in enumerate(datasets.items()):
    print(ds_name)
    path = f'eval/{ds_name}.pkl'
    with open(path, 'rb') as f:
        pred_dict = pickle.load(f)

    hyper = pred_dict['info']['hyper']
    medians = pred_dict['medians']

    # Sample N series without replacement, or take all of them if there are
    # not more than N.
    if N < len(medians):
        indices = np.random.choice(len(medians), N, replace=False)
    else:
        indices = np.arange(len(medians))

    for col, series_idx in enumerate(indices):
        history, target = dataset[series_idx][0], dataset[series_idx][1]
        # Only the last `hyper.max_history` history points are shown.
        plot_predictions(history[-hyper.max_history:], target, medians[series_idx], axes[row, col])

    # Dataset name inside the leftmost panel, left-aligned.
    title = '  ' + ds_name.replace('_', ' ').title()
    title = title.replace('Us', 'US')
    axes[row, 0].set_title(title, fontsize=20, y=0.88, loc='left')

# Also strip the ticks of panels that received no series.
for ax in axes.flat:
    ax.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False, labelbottom=False, labelleft=False)

from matplotlib.lines import Line2D
legend_elements = [Line2D([0], [0], color='C1', lw=2, label='Ground Truth'),
                    Line2D([0], [0], color='C4', lw=2, label='GPT-3 Median')]
fig.legend(handles=legend_elements, loc='lower center', ncol=2, fontsize=24, bbox_to_anchor=(0.5, -0.02))
plt.tight_layout()
plt.savefig('monash_examples.pdf', bbox_inches='tight', pad_inches=0.1)

### Darts complete

In [None]:
# Taken before `datasets` is reassigned at the end of this block.
n_datasets = len(datasets)
n_cols = 15
n_rows = 2
import seaborn as sns
sns.set(style="whitegrid", font_scale=1.3)

# Four [forecast, gap, bars, gap] groups per row, without the trailing gap.
width_ratios = ([3, 0.2, 1.0, 0.2] * 4)[:-1]
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(16, 4),
    constrained_layout=True,
    gridspec_kw={'width_ratios': width_ratios},
)
axes = axes.ravel()

# Keys used in the result files -> display names.
name_map = {
    "gp": "SM-GP",
    "arima": "ARIMA",
    "TCN": "TCN",
    "N-BEATS": "N-BEATS",
    "N-HiTS": "N-HiTS",
    'text-davinci-003': 'GPT-3',
}
for size in ('7B', '13B', '30B', '70B'):
    name_map[f'LLaMA{size}'] = f'LLaMA{size}'
for size in ('7B', '13B', '30B', '70B'):
    name_map[f'llama1_{size}'] = f'LLaMA {size}'
for size in ('7B', '13B', '70B'):
    name_map[f'llama2_{size}'] = f'LLaMA-2 {size}'
for size in ('7B', '13B', '70B'):
    name_map[f'llama2_{size}_chat'] = f'LLaMA-2 {size} (chat)'

# Models shown in the bar charts, in plotting order.
hue_order = ['SM-GP', 'N-BEATS', 'TCN', 'N-HiTS', 'ARIMA', "LLaMA-2 70B", 'GPT-3']

# Per-model metric lists, filled by the aggregation loops that follow.
nlls, crps, mae = (defaultdict(list) for _ in range(3))

datasets = get_datasets()
# Dataset label of every recorded score, kept in step with nlls/crps/mae so
# that a model lacking a result for some dataset cannot misalign the table.
dataset_names = defaultdict(list)


def _record_forecast(dsname, model_name, preds, test):
    """Append the NLL/D, CRPS and normalised MAE of one forecast.

    Forecasts without an 'NLL/D' entry are ignored.  The CRPS is computed
    from the first 10 samples; the MAE of the median forecast is divided by
    the mean absolute test value.
    """
    if 'NLL/D' not in preds:
        return
    samples = preds['samples']
    if not isinstance(samples, np.ndarray):
        samples = samples.values
    nll = preds['NLL/D']
    crps_value = calculate_crps(test.values, samples[:10], 10)
    tmae = np.abs(test.values - preds['median']).mean() / np.abs(test.values).mean()

    # Stored under the dataset name without its "Dataset" suffix, which is
    # the form the plotting code filters on.
    dataset_names[model_name].append(dsname.replace("Dataset", ""))
    nlls[model_name].append(nll)
    crps[model_name].append(crps_value)
    mae[model_name].append(tmae)


# NOTE: pickle.load can execute arbitrary code; only load local result files.
skipped_models = {'ada', 'babbage', 'curie'}
for dsname, (train, test) in datasets.items():
    with open(f'eval/small_context_tuned/{dsname}.pkl', 'rb') as f:
        data_dict = pickle.load(f)
    for model_name, preds in data_dict.items():
        if model_name in skipped_models:
            continue
        if model_name == 'text-davinci-003-tuned':
            model_name = 'GPT3'
        _record_forecast(dsname, model_name, preds, test)

llama_models = [
    "llama1_7B",
    "llama2_7B",
    "llama2_7B_chat",
    "llama1_13B",
    "llama2_13B",
    "llama2_13B_chat",
    "llama1_30B",
    "llama1_70B",
    "llama2_70B",
    "llama2_70B_chat",
]
for dsname, (train, test) in datasets.items():
    for model_name in llama_models:
        # The two LLaMA-2 70B variants are read from a separate sweep
        # directory with a different first hyper-parameter in the file name.
        if model_name in ['llama2_70B', 'llama2_70B_chat']:
            fn = f'eval/llama_70B_sweep_sample/{model_name}/darts-{dsname}/1.0_0.9_0.99_0.3_3_,_.pkl'
        else:
            fn = f'eval/llama_2_results/{model_name}/darts-{dsname}/0.4_0.9_0.99_0.3_3_,_.pkl'
        with open(fn, 'rb') as f:
            preds = pickle.load(f)
        _record_forecast(dsname, model_name, preds, test)


nlls = {k: np.array(v) for k, v in nlls.items()}
crps = {k: np.array(v) for k, v in crps.items()}
mae = {k: np.array(v) for k, v in mae.items()}

# One frame per model, stacked into a single long table.
dfs = [
    pd.DataFrame({'Dataset': dataset_names[k], 'NLL/D': v, 'Type': k, 'CRPS': crps[k], 'MAE': mae[k]})
    for k, v in nlls.items()
]
df = pd.concat(dfs)

# Models without a display name (e.g. the renamed 'GPT3') keep their raw key
# instead of raising KeyError.
df['Type'] = df['Type'].map(lambda x: name_map.get(x, x))

result_df = df
result_df['ll'] = -1 * result_df['NLL/D']

print(result_df)

import matplotlib.dates as mdates

datasets_to_plot = [
    'AirPassengersDataset', 'AusBeerDataset',
    'GasRateCO2Dataset', 'MonthlyMilkDataset',
    'SunspotsDataset', 'WineDataset',
    'WoolyDataset', 'HeartRateDataset'
]
hue_order = ['SM-GP', 'N-BEATS', 'TCN', 'N-HiTS', 'ARIMA', "LLaMA-2 70B", 'GPT-3']

# Loop-invariant colours: the GPT-3 median line uses the third Dark2 colour;
# the bar palette moves that colour to the end (GPT-3) and inserts a custom
# colour before it (LLaMA-2 70B).
pred_color = sns.color_palette('Dark2')[2]
palette = sns.color_palette('Dark2', len(hue_order))
palette = palette[:2] + palette[3:-1] + ['#a60355'] + palette[2:3]

groups_per_row = 4
ax_idx = 0
# Iterate through the datasets and plot the samples
for idx, dsname in enumerate(datasets_to_plot):
    train, test = datasets[dsname]
    last_in_row = idx % groups_per_row == groups_per_row - 1

    # Reserve every slot of this dataset's [forecast, gap, bars, gap] group up
    # front, so that skipping the dataset below cannot shift later datasets
    # into the spacer columns.
    forecast_ax = axes[ax_idx]
    gap_ax = axes[ax_idx + 1]
    bar_ax = axes[ax_idx + 2]
    ax_idx += 3
    gap_ax.set_axis_off()
    if not last_in_row:
        axes[ax_idx].set_axis_off()
        ax_idx += 1

    # NOTE(review): the metrics above are read from 'eval/small_context_tuned'
    # while the samples here come from 'eval/small_context_tuned2'; confirm
    # that two different result directories are intended.
    with open(f'eval/small_context_tuned2/{dsname}.pkl', 'rb') as f:
        data_dict = pickle.load(f)
    if 'text-davinci-003' not in data_dict:
        continue

    # 10%-90% quantile band across the GPT-3 samples
    # (presumably one row per sample -- confirm against the result files).
    samples = data_dict['text-davinci-003']['samples']
    lower = samples.quantile(0.1, axis=0)
    upper = samples.quantile(0.9, axis=0)

    train_test = pd.concat([train, test])
    forecast_ax.plot(train_test, color='k', label='Ground Truth', linewidth=1)
    forecast_ax.fill_between(samples.iloc[0].index, lower, upper, alpha=0.5)
    forecast_ax.plot(samples.iloc[0].index, data_dict['text-davinci-003']['median'], color=pred_color, label='GPT3 Median', linewidth=1)
    forecast_ax.set_title(dsname.replace("Dataset", ""))
    forecast_ax.get_yaxis().set_visible(False)
    forecast_ax.get_xaxis().set_visible(False)

    # A fresh formatter per axes; the x axis itself is hidden above.
    myFmt = mdates.DateFormatter('%d')
    forecast_ax.xaxis.set_major_formatter(myFmt)
    forecast_ax.xaxis.set_major_locator(plt.MaxNLocator(2))

    sns.barplot(
        x='Dataset',
        y='NLL/D',
        hue='Type',
        hue_order=hue_order,
        data=result_df[result_df['Dataset'] == dsname.replace("Dataset", "")],
        ax=bar_ax,
        palette=palette,
    )
    # The shared legend is drawn in separate figures; guard against seaborn
    # not having created a per-axes legend.
    legend = bar_ax.get_legend()
    if legend is not None:
        legend.remove()
    if last_in_row:
        bar_ax.set_ylabel("NLL", rotation=-90, labelpad=15, fontsize=11)
    else:
        bar_ax.set_ylabel("")
    bar_ax.yaxis.set_label_position("right")
    bar_ax.yaxis.tick_right()

    bar_ax.spines['left'].set_color('black')
    # Black downward triangle at the lower-left corner of the bar axes.
    bar_ax.plot(0, 0, 'vk', transform=bar_ax.transAxes, clip_on=False, zorder=10)
    bar_ax.margins(x=0.15)
    bar_ax.set_ylim(bottom=-0.05)
    bar_ax.get_xaxis().set_visible(False)


# Legend entries: the two forecast lines from the first panel followed by the
# bar-chart models in plotting order.
handles, labels = axes[0].get_legend_handles_labels()
handles2, labels2 = axes[2].get_legend_handles_labels()
handles += handles2
labels += list(hue_order)

# NOTE(review): the figure was created with constrained_layout=True, which
# matplotlib documents as incompatible with subplots_adjust -- confirm that
# this call still has the intended effect.
plt.subplots_adjust(wspace=0, hspace=.5)

plt.savefig('darts_qualitative.pdf', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

# Two stand-alone legend figures: the first holds the two forecast-line
# entries, the second the remaining (bar-chart) entries with tighter handle
# and column spacing.
shared_legend_style = dict(
    markerscale=1.5,
    labelspacing=0.1,
    loc='upper left',
    fontsize=25,
    ncol=8,
    frameon=True,
)
legend_specs = [
    (slice(None, 2), {}, 'darts_qualitative_legend_1.pdf'),
    (slice(2, None), {'handlelength': 1., 'columnspacing': 1.}, 'darts_qualitative_legend_2.pdf'),
]

for part, extra_style, out_path in legend_specs:
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(2, 2))
    ax.legend(
        handles=handles[part],
        labels=labels[part],
        **shared_legend_style,
        **extra_style,
    )
    ax.axis("off")
    plt.tight_layout()
    fig.savefig(out_path, bbox_inches='tight')
    plt.show()