# Benchmark Plots

In [None]:
""" 
Generates figures and LaTeX tables for 

SubseasonalClimateUSA: A Dataset for Subseasonal Forecasting and Benchmarking

Soukayna Mouatadid, Paulo Orenstein, Genevieve Flaspohler, Miruna Oprescu, 
Judah Cohen, Franklyn Wang, Sean Knight, Maria Geogdzhayeva, Sam Levang, 
Ernest Fraenkel, and Lester Mackey. 
"""
# Ensure notebook is being run from base repository directory
import os, sys
try:
    # Move to the base repository directory under the current user's home
    os.chdir(f"/home/{os.environ['USER']}/forecast_rodeo_ii/")
except Exception as err:
    # Best effort only: report the failure and keep the current directory
    print(f"Warning: unable to change directory; {err!r}")
    
%load_ext autoreload
%autoreload 2
%matplotlib inline    
    
import itertools
import importlib
import subprocess
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product
from functools import partial

from IPython.display import Markdown, display

import copy
import pdb
import calendar 
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib   
from matplotlib.gridspec import GridSpec

from subseasonal_toolkit.utils.experiments_util import pandas2hdf
from subseasonal_toolkit.utils.general_util import printf, make_directories
from subseasonal_toolkit.utils.eval_util import get_target_dates, score_to_mean_rmse, contest_quarter_start_dates, contest_quarter
from subseasonal_toolkit.utils.models_util import get_selected_submodel_name
from viz_util import *

# Default seaborn figure size and font scale used by all plots below
sns.set(
    rc={'figure.figsize': (8, 6)},
    font_scale=1,
)

# Output directory for this notebook (created if needed by make_directories)
out_dir = f"/home/{os.environ['USER']}/forecast_rodeo_ii/subseasonal_toolkit/viz/benchmark"
make_directories(out_dir)

In [None]:
# 
# Full set of regions, times, and tasks to evaluate
#
# Metric names available for evaluation
metrics = ["rmse", "skill", "score"]

# Ground truth ids grouped by region / grid
contest_gt_ids = ["contest_tmp2m", "contest_precip"]
us_gt_ids = ["us_tmp2m", "us_precip"]
east_gt_ids = ["east_tmp2m", "east_precip"]
us_1_5_gt_ids = ["us_tmp2m_1.5x1.5", "us_precip_1.5x1.5"]

# Ground truth ids for the contest and U.S. regions
gt_ids = contest_gt_ids + us_gt_ids

horizons = ["34w", "56w"]
target_eval_dates = ["std_paper", "std_contest"]

# The full set of models we evaluate in some experiment
all_models = [
    # Raw Baselines
    "raw_cfsv2",
    # Baselines
    "climatology",
    "deb_cfsv2",
    "persistence",
    # ECMWF
    # (trailing comma is required: without it Python concatenates this
    # literal with the next one into a single bogus model name)
    "ecmwf",
    # ABC
    "tuned_climpp",
    "tuned_cfsv2pp",
    "perpp",
    # Learning
    "autoknn",
    "informer",
    "tuned_localboosting",
    "multillr",
    "nbeats",
    "prophet",
    "salient",
    "tuned_salient2",
    # Ensembles
    "linear_ensemble",
    "online_learning",
]

# Main experiment model names
main_experiment_models = [
    # Baselines
    "climatology",
    "deb_cfsv2",
    "persistence",
    # ABC
    "tuned_climpp",
    "tuned_cfsv2pp",
    "perpp_cfsv2",
    # Learning
    "autoknn",
    "informer",
    "tuned_localboosting",
    "multillr",
    "nbeats",
    "prophet",
    "tuned_salient2",
    # Ensembles
    "linear_ensemble",
    "online_learning",
]

# Rodeo experiment model names
rodeo_experiment_models = [
    # Baselines
    "climatology",
    "deb_cfsv2",
    "persistence",
    # ABC
    "tuned_climpp",
    "tuned_cfsv2pp",
    "perpp_cfsv2",
    # Learning
    "autoknn",
    "tuned_localboosting",
    "multillr",
    "prophet",
    "tuned_salient2",
    # Ensembles
    "linear_ensemble",
    "online_learning",
    # Explicit ensemble submodel names that can be used instead:
    #     'linear_ensemble_localFalse_dynamicFalse_stepFalse_LtCtD',
    #     'linear_ensemble_localFalse_dynamicFalse_stepFalse_AMLPtCtDtKtS',
    #     'online_learning-ah_rpNone_R1_recent_g_SC_LtCtD',
    #     'online_learning-ah_rpNone_R1_recent_g_SC_AMLPtCtDtKtS'
]

# Salient experiment model names
salient_experiment_models = [
    # Baselines
    "deb_cfsv2",
    # ABC
    "tuned_cfsv2pp",
    # Learning
    "tuned_salient2",
]

# ECMWF experiment model names
ecmwf_experiment_models = [
    # Baselines
    "climatology",
    "deb_cfsv2",
    "persistence",
    # ABC
    "tuned_climpp",
    "tuned_cfsv2pp",
    "perpp_cfsv2",
    # ECMWF (alternative entry: "deb_ecmwf")
    "ecmwf-years20_leads15-15_lossmse_forecastc_debiasp+c",
    "ecmwf-years20_leads15-15_lossmse_forecastp_debiasp+c",
    # Ensembles
    "online_learning",
    "linear_ensemble",
]

# GraphCast experiment model names
graphcast_experiment_models = [
    # Baseline
    "deb_cfsv2",
    # Model
    "graphcast",
    # Ensembles
    "online_learning",
    "linear_ensemble",
]

In [None]:
#
# Dictionaries mapping all model names and tasks to their display names
#

def _task_display_names(region, grid=""):
    """Return {task id: display name} for one region.

    Task ids have the form "<region>_<variable><grid>_<horizon>"; entries are
    ordered temperature first, then precipitation, each with weeks 3-4
    before weeks 5-6.
    """
    variables = (("tmp2m", "Temp."), ("precip", "Precip."))
    weeks = (("34w", "weeks 3-4"), ("56w", "weeks 5-6"))
    return {
        f"{region}_{var}{grid}_{hor}": f"{var_label} {week_label}"
        for var, var_label in variables
        for hor, week_label in weeks
    }


east_tasks = _task_display_names("east")
contest_tasks = _task_display_names("contest")
us_tasks = _task_display_names("us")
us_1_5_tasks = _task_display_names("us", grid="_1.5x1.5")


## Model list for generating predictions
`tuned_climpp,tuned_cfsv2pp,tuned_localboosting,tuned_salient_fri,perpp,multillr,autoknn,raw_cfsv2,nbeats_final,prophet`
## Model list for tuning
`climpp,cfsv2,catboost,salient_fri`
## Tuned models for metrics
`tuned_climpp,tuned_cfsv2pp,tuned_localboosting,tuned_salient_fri`

## Read in all metrics for all tasks and all models
Reads metrics, generates a summary of missing data, and produces the `all_metrics` dictionary to be used in further analysis. 

In [None]:
"""
Generate a dictionary with metric values for all models and every combination of gt_id, 
horizon, and target dates
"""

# TODO: change to add us_gt_ids once metrics are ready
# Maps (metric, task, target_dates) -> metrics dataframe with groupby columns added
all_metrics = {}

# Get metrics for main experiment, rodeo experiment, salient experiment,
# graphcast experiment and ecmwf experiment
for metric, gt_id, horizon, target_dates in itertools.chain(
        product(['rmse', 'skill'], us_gt_ids, horizons, ['std_paper']),
        product(['rmse'], contest_gt_ids, horizons, ['std_contest']),
        product(['rmse'], contest_gt_ids, horizons, ['std_paper']),
        product(['rmse', 'skill'], us_gt_ids, ['34w'], ['std_paper_graphcast']),
        product(['rmse', 'skill'], us_1_5_gt_ids, horizons, ['std_ecmwf'])):

    # Select the model list that belongs to this region / target date combination
    if 'us' in gt_id:
        if 'graphcast' in target_dates:
            model_names = graphcast_experiment_models
            model_names_str = 'graphcast_experiment_models'
        elif '1.5x1.5' in gt_id:
            model_names = ecmwf_experiment_models
            model_names_str = 'ecmwf_experiment_models'
        else:
            model_names = main_experiment_models
            model_names_str = 'main_experiment_models'
    elif 'contest' in gt_id:
        # Contest ground truth: rodeo models on contest dates, salient models otherwise
        if 'contest' in target_dates:
            model_names = rodeo_experiment_models
            model_names_str = 'rodeo_experiment_models'
        else:
            model_names = salient_experiment_models
            model_names_str = 'salient_experiment_models'
    else:
        model_names = all_models
        model_names_str = 'all_models'

    # Get task
    task = f"{gt_id}_{horizon}"
    display(Markdown(f"### {model_names_str}: {metric}, {task}, {target_dates}"))

    # Get all metrics
    # (compare strings with ==; `is` tests object identity and is not reliable for str)
    if metric == 'skill' and 'climatology' in model_names:
        display(Markdown("#### ===> Warning: skill is not calculated for the climatology baseline model."))
    df = get_metrics_df(gt_id, horizon, metric, target_dates, model_names=model_names)

    # No models exist for this task
    if df is None:
        continue

    # Add yearly and quarterly columns to the dataframe
    df = add_groupby_cols(df, horizon=horizon)
    key = (metric, task, target_dates)
    all_metrics[key] = copy.copy(df)

    if metric in ['rmse', 'skill']:
        # Rows in which at least one requested model has no metric value.
        # missing_df is kept for interactive inspection; it is not displayed.
        try:
            missing_df = all_metrics[key].loc[all_metrics[key][model_names].isnull().any(axis=1), :]
        except KeyError:
            # Some requested models have no column at all: check every column instead
            missing_df = all_metrics[key].loc[all_metrics[key].isnull().any(axis=1), :]

## Getting metrics 
After generating the above `all_metrics` dictionary, you can get the average metric value for a set of models and specific period using the following function:
```
df, model_names = get_per_period_metrics_df(all_metrics, period="quarterly", gt_id="us_tmp2m", horizon="34w", metric="rmse", target_dates="std_paper", model_names=main_experiment_models)
```

The period can be `quarterly`, `yearly`, or `quarterly_yearly` (returns average metrics values in YY1-Q1, YY1-Q2, ..., YY2-Q1, ... etc.).

# Figure 1: 
#### Schematic of the SubseasonalClimateUSA data collection and processing pipeline

In [None]:
# This figure is not generated here; print a pointer to the notebook to consult.
printf("see subseasonal_toolkit/viz/benchmark/figures/flowcharts/perpp-get_weights.ipynb")

# Figure 2: 
#### Example of SubseasonalClimateUSA observations and dynamical model forecasts.

In [None]:
# Tasks, target dates, metric, and models shown in the figure
figure_gt_ids = us_1_5_gt_ids
figure_horizons = ['34w']
figure_target_dates = 'std_paper_forecast'
figure_metrics = ['lat_lon_pred']
figure_models = [
    # Ground truth
    "gt",
    # Raw dynamical model forecasts
    "raw_cfsv2",
    "raw_ecmwf",
    "raw_ccsm4",
    "raw_geos_v2p1",
    "raw_nesm",
    "raw_fimr1p1",
    "raw_gefs",
    "raw_gem",
]

# Load the lat/lon metric data for every task, keyed by "<gt_id>_<horizon>"
metric_dfs_rda = {}
for gt_id, horizon in product(figure_gt_ids, figure_horizons):
    task = f"{gt_id}_{horizon}"
    display(Markdown(f"#### Getting metrics for {gt_id} {horizon}"))
    metric_dfs_rda[task] = get_models_metric_lat_lon(
        gt_id=gt_id,
        horizon=horizon,
        target_dates=figure_target_dates,
        metrics=figure_metrics,
        model_names=figure_models,
        first_target_date=True,
    )

# Plot settings
figure_metric = 'lat_lon_pred'
figure_mean_metric_df = None
figure_source_data = False
figure_show = True
figure_model_names = figure_models

# One map panel set per ground truth variable
display(Markdown(f'#### Models: {", ".join(figure_model_names)}'))
for gt_id in figure_gt_ids:
    display(Markdown(f"#### {gt_id}"))
    # Colorbar range depends on the variable being plotted
    if 'tmp2m' in gt_id:
        figure_CB_minmax = (-20, 20)
    else:
        figure_CB_minmax = (0, 80)
    plot_metric_maps_task(
        metric_dfs_rda,
        model_names=figure_model_names,
        gt_ids=[gt_id],
        horizons=figure_horizons,
        metric=figure_metric,
        target_dates=figure_target_dates,
        mean_metric_df=figure_mean_metric_df,
        show=figure_show,
        scale_type='linear',
        CB_colors_customized=["white", "#dede00", "#ff7f00", "blueviolet", "indigo", "yellowgreen", "lightgreen", "darkgreen"],
        CB_minmax=figure_CB_minmax,
        source_data=figure_source_data,
    )

# Table 1: 
#### Average percentage skill and percentage improvement over mean debiased CFSv2 RMSE across 2011-2020 in the contiguous U.S. along with a 95% bootstrap confidence interval. The best performing model in each model group is bolded, and the best performing model overall is shown in green.

In [None]:
"""
Paper experiments; can be configured to generate metrics for any subset of models, averaged over
a set of periods, for a given set of target dates. Produces a table and saves to tex. 
"""

target_dates = "std_paper"

# Period options: quarterly (seasonal quarters), contest_quarterly (contest quarters),
# monthly, yearly, individual (return full dataframe), overall (return mean of full dataframe),
# monthly_yearly (every month in every year), quarterly_yearly (every quarter in every year)
period = "overall"  # <- must be overall to merge with rodeo dataframe
figure_metrics = ["rmse", "skill"]
table_models = main_experiment_models
relative_to = 'deb_cfsv2'  # compute value relative to baseline value: 1 - metric(model)/metric(baseline)
dropna = True  # if true, compute average metrics only on dates where predictions have all values
task_ids = us_gt_ids  # contest_gt_ids (for contest), us_gt_ids (for us), gt_ids (for all)
region = 'us'  # either us, east or contest
include_overall = True  # include overall row in the dataframe

# End experiment parameters

for metric in figure_metrics:
    # NOTE: strings are compared with == / != throughout; `is` tests object
    # identity and only works for str by accident of interning.
    if metric == "skill":
        # Climatology is excluded from skill tables and skill is reported without a baseline.
        # These reassignments persist for the rest of the cell.
        table_models = [m for m in table_models if m != "climatology"]
        relative_to = None
    # Highlight/bold the maximum for baseline-relative RMSE and for non-relative
    # non-RMSE metrics; otherwise highlight/bold the minimum.
    if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
        highlight_func = highlight_max
        bold_func = bold_max
    else:
        highlight_func = highlight_min
        bold_func = bold_min

    #
    # Metric table
    #
    # Get set of tasks
    tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, horizons)]
    metrics_sub = None

    # Generate metrics dataframe for each task
    for gt_id, horizon in product(task_ids, horizons):
        task = f"{gt_id}_{horizon}"
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period=period, gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)
        # Create metrics dataframe template
        if period == "overall":
            index = pd.Index([task], name="task")
        else:
            index = pd.MultiIndex.from_product(
                [[task], m_sub.index],
                names=('task', 'period'))
        # Need to form task-by-task, since some tasks are missing target dates, so index differs
        if metrics_sub is None:
            metrics_sub = pd.DataFrame(index=index, columns=experiment_models)
        else:
            metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=experiment_models)])
        if period == "overall":
            metrics_sub.loc[task, :] = m_sub
        else:
            metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    # maintain ordered list of model names
    metrics_sub = metrics_sub[[m for m in main_experiment_models if m in table_models]]
    if metric == "skill":
        # Report skill as a percentage
        metrics_sub = metrics_sub.multiply(100)

    if period == 'overall':
        # Rows become (model type, model display name); columns become task display names
        metrics_sub = metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    else:
        metrics_sub = metrics_sub.reindex([m for m in table_models if m in metrics_sub.columns], axis=1).T

    # save dataframe in latex table format
    table_to_tex(metrics_sub.astype(float), tables_dir, f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)

    #
    # Metric table with standard error
    #
    display(Markdown(f"##### {metric} +/- SE -- {target_dates}"))
    np.random.seed(123)
    # Read in metrics and reset experiment_models based on available metrics
    # (gt_id and horizon still hold the last task of the loop above)
    m_sub, experiment_models = get_per_period_metrics_df(
        all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates,
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)
    metrics_sub_se = metrics_sub.copy()
    # Standard errors, one row per model and one column per column of metrics_sub
    m_sub_se_or = pd.DataFrame(index=experiment_models, columns=metrics_sub.columns)

    # Generate standard errors for each task
    for gt_id, horizon in product(task_ids, horizons):
        task = f"{gt_id}_{horizon}"
        printf(f"Processing {task}")
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)
        task = us_tasks[task]
        for col in experiment_models:
            mean_col = m_sub[col].mean()
            std_col = m_sub[col].std()
            n_col = m_sub[col].notna().sum()
            t1 = [simulate_sample_mean(n_col, mean_col, std_col) for _ in range(1000)]
            summary1 = summarize(t1, digits=6)
            # Single .loc call: chained `.loc[col][task] = ...` may assign to a
            # temporary copy and leave the frame unchanged
            m_sub_se_or.loc[col, task] = summary1['SE'][0]

    # Build the final SE table once all tasks are filled in,
    # maintaining the ordered list of model names
    m_sub_se = m_sub_se_or.T
    m_sub_se = m_sub_se[[m for m in main_experiment_models if m in table_models]].T
    if metric == "skill":
        m_sub_se = m_sub_se.multiply(100)
    if period == 'overall':
        m_sub_se['model_type'] = [all_model_types[m] for m in m_sub_se.index]
        m_sub_se = m_sub_se.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        m_sub_se.columns = m_sub_se.columns.get_level_values(0)

    # display and save dataframe in latex table format
    # (raw string: '\p' is not a valid escape sequence in a normal string literal)
    m_sub = metrics_sub.astype(float).round(2).astype(str).add(r' $\pm$ ').add(m_sub_se.astype(float).round(2).astype(str))
    display(m_sub.style.apply(lambda x: metrics_sub.apply(highlight_func, axis=0), axis=None).apply(lambda x: metrics_sub.apply(bold_func, axis=0), axis=None))
    filename_table = f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}_se"
    table_to_tex(m_sub, tables_dir, filename_table, precision=2)
    printf(f"Table saved in {os.path.join(tables_dir, f'{filename_table}.tex')}\n")


# Table 2: 
#### Average percentage skill and percentage improvement over mean debiased CFSv2 RMSE across 2016-2020 in the contiguous U.S. along with a 95% bootstrap confidence interval. The best performing model in each model group is bolded, and the best performing model overall is shown in green

In [None]:
# Experiment parameters (same meaning as in the Table 1 cell)
target_dates = "std_ecmwf"
period = "overall"
figure_metrics = ["rmse", "skill"]
table_models = ecmwf_experiment_models
relative_to = 'deb_cfsv2'
dropna = True
task_ids = us_1_5_gt_ids
task_horizons_list = [["34w"], ["56w"]]  # one table per horizon
region = 'us_1.5x1.5'
include_overall = True

# End experiment parameters

for metric, task_horizons in product(figure_metrics, task_horizons_list):

    # NOTE: strings are compared with == / != throughout; `is` tests object
    # identity and only works for str by accident of interning.
    if metric == "skill":
        # Climatology is excluded from skill tables and skill is reported without a baseline.
        # These reassignments persist for the rest of the cell.
        table_models = [m for m in table_models if m != "climatology"]
        relative_to = None
    # Highlight/bold the maximum for baseline-relative RMSE and for non-relative
    # non-RMSE metrics; otherwise highlight/bold the minimum.
    if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
        highlight_func = highlight_max
        bold_func = bold_max
    else:
        highlight_func = highlight_min
        bold_func = bold_min

    #
    # Metric table
    #
    # Get set of tasks
    tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, task_horizons)]
    metrics_sub = None

    # Generate metrics dataframe for each task
    for gt_id, horizon in product(task_ids, task_horizons):
        task = f"{gt_id}_{horizon}"
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period=period, gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        # Create metrics dataframe template
        if period == "overall":
            index = pd.Index([task], name="task")
        else:
            index = pd.MultiIndex.from_product(
                [[task], m_sub.index],
                names=('task', 'period'))
        # Need to form task-by-task, since some tasks are missing target dates, so index differs
        if metrics_sub is None:
            metrics_sub = pd.DataFrame(index=index, columns=experiment_models)
        else:
            metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=experiment_models)])
        if period == "overall":
            metrics_sub.loc[task, :] = m_sub
        else:
            metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    # maintain ordered list of model names
    metrics_sub = metrics_sub[table_models]

    if metric == "skill":
        # Report skill as a percentage
        metrics_sub = metrics_sub.multiply(100)

    if period == 'overall':
        # NOTE(review): us_tasks has no keys for the 1.5x1.5 task ids, so this rename
        # appears to leave the task labels unchanged; the SE loop below relies on the
        # raw task ids as column names. Confirm whether us_1_5_tasks was intended.
        metrics_sub = metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    else:
        metrics_sub = metrics_sub.reindex([m for m in table_models if m in metrics_sub.columns], axis=1).T

    # save dataframe in latex table format
    # (horizon still holds the last horizon of the loop above)
    table_to_tex(metrics_sub.astype(float), tables_dir, f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)

    #
    # Metric table with standard error
    #
    display(Markdown(f"##### {metric} +/- SE -- {target_dates}"))
    np.random.seed(123)
    # Read in metrics and reset experiment_models based on available metrics
    m_sub, experiment_models = get_per_period_metrics_df(
        all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates,
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)
    # Standard errors, one row per model and one column per column of metrics_sub
    m_sub_se_or = pd.DataFrame(index=experiment_models, columns=metrics_sub.columns)

    # Generate standard errors for each task
    for gt_id, horizon in product(task_ids, task_horizons):
        task = f"{gt_id}_{horizon}"
        printf(f"Processing {task}")
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        for col in experiment_models:
            mean_col = m_sub[col].mean()
            std_col = m_sub[col].std()
            n_col = m_sub[col].notna().sum()
            t1 = [simulate_sample_mean(n_col, mean_col, std_col) for _ in range(1000)]
            summary1 = summarize(t1, digits=6)
            # Single .loc call: chained `.loc[col][task] = ...` may assign to a
            # temporary copy and leave the frame unchanged
            m_sub_se_or.loc[col, task] = summary1['SE'][0]

    # Build the final SE table once all tasks are filled in,
    # maintaining the ordered list of model names
    m_sub_se = m_sub_se_or.T
    m_sub_se = m_sub_se[table_models].T
    if metric == "skill":
        m_sub_se = m_sub_se.multiply(100)
    if period == 'overall':
        m_sub_se['model_type'] = [all_model_types[m] for m in m_sub_se.index]
        m_sub_se = m_sub_se.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        m_sub_se.columns = m_sub_se.columns.get_level_values(0)

    # display and save dataframe in latex table format
    # (raw string: '\p' is not a valid escape sequence in a normal string literal)
    m_sub = metrics_sub.astype(float).round(2).astype(str).add(r' $\pm$ ').add(m_sub_se.astype(float).round(2).astype(str))
    display(m_sub.style.apply(lambda x: metrics_sub.apply(highlight_func, axis=0), axis=None).apply(lambda x: metrics_sub.apply(bold_func, axis=0), axis=None))
    filename_table = f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}_se"
    table_to_tex(m_sub, tables_dir, filename_table, precision=2)
    printf(f"Table saved in {os.path.join(tables_dir, f'{filename_table}.tex')}\n")

# Figure 3: 
### % improvement: ABC vs. LEARNING MODELS
#### Per season and per year average skill and improvement over mean debiased CFSv2 RMSE across the contiguous U.S. and the years 2011-2020. Despite their simplicity, the ABC models (solid lines) consistently outperform debiased CFSv2 and the state-of-the-art learners (dotted lines).

In [None]:
figure_models = [
    # baseline the RMSE improvement is measured against
    'deb_cfsv2',
    # ABC models
    'tuned_cfsv2pp',
    'tuned_climpp',
    'perpp_cfsv2',
    # learning models
    'autoknn',
    'multillr',
    'prophet',
    'tuned_localboosting',
    'tuned_salient2',
]

# Paper experiments; can be configured to generate plots for any subset of models,
# averaged over a set of periods, for a given set of target dates.

# Figure experiment parameters
target_dates = "std_paper"
task_ids = us_gt_ids
task_horizons = horizons

# Four subfigures, generated in this order:
#   RMSE improvement over debiased CFSv2 by season, then by year;
#   skill (no baseline) by season, then by year.
# For RMSE the value is relative to the baseline: 1 - metric(model)/metric(baseline)
for metric, relative_to in (('rmse', 'deb_cfsv2'), ('skill', None)):
    for period in ("quarterly", "yearly"):
        # saves to file with suffix file_str
        file_str = f"{period}_over_{relative_to}"
        print(target_dates)
        fh = partial(get_per_period_metrics_df, all_metrics, period, relative_to)
        plot_ABC_vs_learner_quadruple(
            get_metrics_fh=fh,
            gt_id_list=task_ids,
            horizon_list=task_horizons,
            metric=metric,
            target_dates=target_dates,
            model_names=figure_models,
            file_str=file_str,
        )

# Figure 4: 
#### Percentage improvement over mean debiased CFSv2 RMSE in the contiguous U.S. over 2011-2020. White grid points indicate negative or 0% improvement.

In [None]:
# Tasks, target dates, metric, and models shown in the figure
figure_gt_ids = us_gt_ids
figure_horizons = horizons
figure_target_dates = 'std_paper'
figure_metrics = ['lat_lon_rmse']
figure_models = [
    # Baseline
    'deb_cfsv2',
    # Compared models
    "tuned_climpp",
    "tuned_cfsv2pp",
    "perpp_cfsv2",
    "prophet",
    "tuned_salient2",
    "online_learning",
]

# Load the lat/lon metric data for every task, keyed by "<gt_id>_<horizon>"
metric_dfs_rda = {}
for gt_id, horizon in product(figure_gt_ids, figure_horizons):
    task = f"{gt_id}_{horizon}"
    display(Markdown(f"#### Getting metrics for {gt_id} {horizon}"))
    metric_dfs_rda[task] = get_models_metric_lat_lon(
        gt_id=gt_id,
        horizon=horizon,
        target_dates=figure_target_dates,
        metrics=figure_metrics,
        model_names=figure_models,
    )

# Plot settings
figure_metric = 'lat_lon_rmse'
figure_relative_to = 'deb_cfsv2'
figure_mean_metric_df = None
figure_cb_skip = 4
figure_source_data = False
figure_show = True
figure_model_names = figure_models

# One set of maps per (ground truth variable, horizon) pair
display(Markdown(f'#### Models: {", ".join(figure_model_names)}'))
for figure_gt_id, figure_horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### {figure_gt_id} {figure_horizon}"))
    # Same colorbar range for every task
    figure_cb_minmax = (0, 20)
    plot_metric_maps_task_ds(
        metric_dfs_rda,
        model_names=figure_model_names,
        gt_id=figure_gt_id,
        horizon=figure_horizon,
        metric=figure_metric,
        target_dates=figure_target_dates,
        relative_to=figure_relative_to,
        mean_metric_df=figure_mean_metric_df,
        show=figure_show,
        scale_type='linear',
        CB_colors_customized=["white", "green", "darkgreen"],
        CB_minmax=figure_cb_minmax,
        CB_skip=figure_cb_skip,
        source_data=figure_source_data,
    )

# Supplementary figures
 
## B. Model Implementation Details
## Figure 5: 
#### Climatology++ hyperparameters automatically selected for each target date in 2011-2020.
Tuning plots: This code produces plots to analyze submodels selected by the tuner. 

In [None]:
# Tuning plot configuration: Climatology++ on the U.S. tasks
task_ids, task_horizons = us_gt_ids, horizons
target_dates = "std_paper"
figure_models = ["tuned_climpp"]

# Generate subfigures
plot_tuning(
    gt_ids=task_ids,
    horizons=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
)


## Figure 6: 
#### CFSv2++ hyperparameters automatically selected for each target date in 2011-2020.

In [None]:
# Tuning plot configuration: CFSv2++ on the U.S. tasks
task_ids, task_horizons = us_gt_ids, horizons
target_dates = "std_paper"
figure_models = ["tuned_cfsv2pp"]

# Generate subfigures
plot_tuning(
    gt_ids=task_ids,
    horizons=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
)


## Figure 7: 
#### Spatial variation in Persistence++ learned regression weights when forecasting temperature in weeks 3-4 for the final target date, December 23, 2020.

In [None]:
# This figure is not generated here; print a pointer to the notebook to consult.
printf("see subseasonal_toolkit/examples/benchmark/perpp-get_weights.ipynb")

## Figure 8: 
#### Spatial variation in Persistence++ learned regression weights when forecasting temperature in weeks 5-6 for the final target date, December 23, 2020.

In [None]:
# This figure is not generated here; print a pointer to the notebook to consult.
printf("see subseasonal_toolkit/examples/benchmark/perpp-get_weights.ipynb")

## Figure 9: 
#### Spatial variation in Persistence++ learned regression weights when forecasting precipitation in weeks 3-4 for the final target date, December 23, 2020.

In [None]:
# This figure is not generated here; print a pointer to the notebook to consult.
printf("see subseasonal_toolkit/examples/benchmark/perpp-get_weights.ipynb")

## Figure 10: 
#### Spatial variation in Persistence++ learned regression weights when forecasting precipitation in weeks 5-6 for the final target date, December 23, 2020

In [None]:
# This figure is not generated here; print a pointer to the notebook to consult.
printf("see subseasonal_toolkit/examples/benchmark/perpp-get_weights.ipynb")

## Figure 11: 
#### LocalBoosting hyperparameters automatically selected for each target date in 2011-2020.

In [None]:
# Tuning plot configuration: LocalBoosting on the U.S. tasks
task_ids, task_horizons = us_gt_ids, horizons
target_dates = "std_paper"
figure_models = ["tuned_localboosting"]

# Generate subfigures
plot_tuning(
    gt_ids=task_ids,
    horizons=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
)

## Figure 12: 
#### Salient 2.0 hyperparameters automatically selected for each target date in 2011-2020.

In [None]:
# Tuning plot configuration: Salient 2.0 on the U.S. tasks
task_ids, task_horizons = us_gt_ids, horizons
target_dates = "std_paper"
figure_models = ['tuned_salient2']

# Generate subfigures
plot_tuning(
    gt_ids=task_ids,
    horizons=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
)

## C. Supplementary Results
## C.1 Percentage Improvement over Meteorological Baselines
## Figure 13: 
#### Per season improvement of each ABC model over its corresponding baseline across the contiguous U.S. and the years 2011-2020. The learned ABC benchmarks yield consistent improvements in mean RMSE.

In [None]:
# Figure experiment parameters
target_dates = "std_paper"
period = "quarterly"
metric = "rmse"
task_ids = us_gt_ids
region = 'us'

# RMSE improvement by season: one subfigure per [baseline, ABC model] pair,
# with the improvement measured relative to the first model of the pair
for figure_models in (
    ['climatology', 'tuned_climpp'],
    ['deb_cfsv2', 'tuned_cfsv2pp'],
    ['persistence', 'perpp_cfsv2'],
):
    relative_to = figure_models[0]
    # saves to file with suffix file_str
    file_str = f"{region}_{period}_over_{relative_to}"
    # Generate subfigure
    fh = partial(get_per_period_metrics_df, all_metrics, period, relative_to)
    plot_models_and_metrics_plus(
        get_metrics_fh=fh,
        gt_id_list=task_ids,
        horizon_list=horizons,
        metric=metric,
        target_dates=target_dates,
        model_names=figure_models,
        file_str=file_str,
    )

## Figure 14: 
#### Per year improvement of each ABC model over its corresponding baseline across the contiguous U.S. and the years 2011-2020. The learned ABC benchmarks yield consistent improvements in mean RMSE.

In [None]:
# Settings shared by every subfigure of this figure
region = 'us'
task_ids = us_gt_ids
metric = "rmse"
period = "yearly"
target_dates = "std_paper"

# One per-year subfigure per [baseline, model] pair. The first entry of each
# pair is passed as relative_to, so the metric is reported relative to it.
for figure_models in (
    ['climatology', 'tuned_climpp'],
    ['deb_cfsv2', 'tuned_cfsv2pp'],
    ['persistence', 'perpp_cfsv2'],
):
    relative_to = figure_models[0]
    # Suffix of the file the subfigure is saved to
    file_str = f"{region}_{period}_over_{relative_to}"
    # Metrics getter with the metrics store, period, and baseline pre-bound
    fh = partial(get_per_period_metrics_df, all_metrics, period, relative_to)
    plot_models_and_metrics_plus(
        get_metrics_fh=fh,
        gt_id_list=task_ids,
        horizon_list=horizons,
        metric=metric,
        target_dates=target_dates,
        model_names=figure_models,
        file_str=file_str,
    )

## C.2 Yearly Percentage Improvement over Mean Debiased CFSv2 RMSE
## Table 3: 
#### Percentage improvement over mean debiased CFSv2 RMSE when forecasting temperature in the contiguous U.S. The best performing models within each class of models are shown in bold, while the best performing models overall are shown in green.

In [None]:
# Experiment parameters: per-year RMSE of each model relative to debiased
# CFSv2 on the contiguous U.S. temperature tasks, one table per horizon.
target_dates = "std_paper"
period = "yearly"
figure_metrics = ["rmse"]
table_models = main_experiment_models
relative_to = 'deb_cfsv2'
dropna = True
task_ids = [g for g in us_gt_ids if "tmp2m" in g]
task_horizons = horizons
region = 'us'
include_overall = True

# End experiment parameters

for metric, horizon in product(figure_metrics, task_horizons):

    # NOTE: strings are compared with ==/!= throughout. `is` tests object
    # identity and only matches equal strings when they happen to be interned.
    if metric == "skill":
        # For skill, climatology is excluded and no baseline is used
        table_models = [m for m in table_models if m != "climatology"]
        relative_to = None
    # Highlight/bold the maximum for RMSE relative to a baseline and for other
    # metrics without a baseline; otherwise highlight/bold the minimum
    if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
        highlight_func = highlight_max
        bold_func = bold_max
    else:
        highlight_func = highlight_min
        bold_func = bold_min

    #
    # Metric table
    #
    # Get set of tasks
    tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, task_horizons)]

    # Generate metrics dataframe for each task. The accumulator is created
    # once, before the loop, so rows from earlier tasks are not discarded.
    metrics_sub = None
    for gt_id in task_ids:
        task = f"{gt_id}_{horizon}"
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period=period, gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        # Create metrics dataframe template
        if period == "overall":
            index = pd.Index([task], name="task")
        else:
            index = pd.MultiIndex.from_product(
                [[task], m_sub.index],
                names=('task', 'period'))
        # Need to form task-by-task, since some tasks are missing target dates, so index differs
        if metrics_sub is None:
            metrics_sub = pd.DataFrame(index=index, columns=experiment_models)
        else:
            metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=experiment_models)])
        if period == "overall":
            metrics_sub.loc[task, :] = m_sub
        else:
            metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    # maintain ordered list of model names
    metrics_sub = metrics_sub[table_models]

    if metric == "skill":
        # Skill values are scaled by 100 for the table
        metrics_sub = metrics_sub.multiply(100)

    if period == 'overall':
        metrics_sub = metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    else:
        # Models become rows; (task, period) pairs become columns
        metrics_sub = metrics_sub.reindex([m for m in table_models if m in metrics_sub.columns], axis=1).T

    # save dataframe in latex table format
    table_to_tex(
        metrics_sub.astype(float), tables_dir,
        f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}",
        precision=2)

    #
    # Metric table with standard error
    #
    display(Markdown(f"##### {metric} +/- SE -- {target_dates}"))
    np.random.seed(123)
    # Preliminary read, used only to obtain the available models for the SE
    # template; gt_id is the last task processed by the loop above.
    m_sub, experiment_models = get_per_period_metrics_df(all_metrics,
                                                         period="individual",
                                                         gt_id=gt_id,
                                                         horizon=horizon,
                                                         metric=metric,
                                                         target_dates=target_dates,
                                                         relative_to=relative_to,
                                                         model_names=table_models,
                                                         include_overall=include_overall,
                                                         dropna=dropna)
    m_sub_se_or = pd.DataFrame(index=experiment_models, columns=metrics_sub.columns)

    # Estimate a standard error per model and task from the individual metrics
    for gt_id in task_ids:
        task = f"{gt_id}_{horizon}"
        printf(f"Processing {task}")
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        for col in experiment_models:
            mean_col = m_sub[col].mean()
            std_col = m_sub[col].std()
            n_col = m_sub[col].notna().sum()
            # 1000 simulated sample means, summarized to obtain the SE
            t1 = [simulate_sample_mean(n_col, mean_col, std_col) for _ in range(1000)]
            summary1 = summarize(t1, digits=6)
            # Single .loc call: chained indexing (.loc[col][task] = ...) may
            # assign to a temporary copy and leave the frame unchanged.
            m_sub_se_or.loc[col, task] = summary1['SE'][0]

    # maintain ordered list of model names (done once, after all tasks are filled in)
    m_sub_se = m_sub_se_or.T
    m_sub_se = m_sub_se[table_models].T
    if metric == "skill":
        m_sub_se = m_sub_se.multiply(100)
    if period == 'overall':
        m_sub_se['model_type'] = [all_model_types[m] for m in m_sub_se.index]
        m_sub_se = m_sub_se.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        m_sub_se.columns = m_sub_se.columns.get_level_values(0)

    # display and save dataframe in latex table format
    # (raw string keeps the LaTeX backslash without an invalid escape sequence)
    m_sub = (metrics_sub.astype(float).round(2).astype(str)
             .add(r' $\pm$ ')
             .add(m_sub_se.astype(float).round(2).astype(str)))
    display(m_sub.style
            .apply(lambda x: metrics_sub.apply(highlight_func, axis=0), axis=None)
            .apply(lambda x: metrics_sub.apply(bold_func, axis=0), axis=None))
    filename_table = f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}_se"
    table_to_tex(m_sub, tables_dir, filename_table, precision=2)
    printf(f"Table saved in {os.path.join(tables_dir, f'{filename_table}.tex')}\n")

## Table 4: 
#### Percentage improvement over mean debiased CFSv2 RMSE when forecasting precipitation in the contiguous U.S. The best performing models within each class of models are shown in bold, while the best performing models overall are shown in green.

In [None]:
"""
Paper experiments; can be configured to generate metrics for any subset of models, averaged over
a set of periods, for a given set of target dates. Produces a table and saves it to tex. 
"""
# Experiment parameters: per-year RMSE of each model relative to debiased
# CFSv2 on the contiguous U.S. precipitation tasks, one table per horizon.
target_dates = "std_paper"
period = "yearly"
figure_metrics = ["rmse"]
table_models = main_experiment_models
relative_to = 'deb_cfsv2'
dropna = True
task_ids = [g for g in us_gt_ids if "precip" in g]
task_horizons = horizons
region = 'us'
include_overall = True

# End experiment parameters

for metric, horizon in product(figure_metrics, task_horizons):

    # NOTE: strings are compared with ==/!= throughout. `is` tests object
    # identity and only matches equal strings when they happen to be interned.
    if metric == "skill":
        # For skill, climatology is excluded and no baseline is used
        table_models = [m for m in table_models if m != "climatology"]
        relative_to = None
    # Highlight/bold the maximum for RMSE relative to a baseline and for other
    # metrics without a baseline; otherwise highlight/bold the minimum
    if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
        highlight_func = highlight_max
        bold_func = bold_max
    else:
        highlight_func = highlight_min
        bold_func = bold_min

    #
    # Metric table
    #
    # Get set of tasks
    tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, task_horizons)]

    # Generate metrics dataframe for each task. The accumulator is created
    # once, before the loop, so rows from earlier tasks are not discarded.
    metrics_sub = None
    for gt_id in task_ids:
        task = f"{gt_id}_{horizon}"
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period=period, gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        # Create metrics dataframe template
        if period == "overall":
            index = pd.Index([task], name="task")
        else:
            index = pd.MultiIndex.from_product(
                [[task], m_sub.index],
                names=('task', 'period'))
        # Need to form task-by-task, since some tasks are missing target dates, so index differs
        if metrics_sub is None:
            metrics_sub = pd.DataFrame(index=index, columns=experiment_models)
        else:
            metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=experiment_models)])
        if period == "overall":
            metrics_sub.loc[task, :] = m_sub
        else:
            metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    # maintain ordered list of model names
    metrics_sub = metrics_sub[table_models]

    if metric == "skill":
        # Skill values are scaled by 100 for the table
        metrics_sub = metrics_sub.multiply(100)

    if period == 'overall':
        metrics_sub = metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    else:
        # Models become rows; (task, period) pairs become columns
        metrics_sub = metrics_sub.reindex([m for m in table_models if m in metrics_sub.columns], axis=1).T

    # save dataframe in latex table format
    table_to_tex(
        metrics_sub.astype(float), tables_dir,
        f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}",
        precision=2)

    #
    # Metric table with standard error
    #
    display(Markdown(f"##### {metric} +/- SE -- {target_dates}"))
    np.random.seed(123)
    # Preliminary read, used only to obtain the available models for the SE
    # template; gt_id is the last task processed by the loop above.
    m_sub, experiment_models = get_per_period_metrics_df(all_metrics,
                                                         period="individual",
                                                         gt_id=gt_id,
                                                         horizon=horizon,
                                                         metric=metric,
                                                         target_dates=target_dates,
                                                         relative_to=relative_to,
                                                         model_names=table_models,
                                                         include_overall=include_overall,
                                                         dropna=dropna)
    m_sub_se_or = pd.DataFrame(index=experiment_models, columns=metrics_sub.columns)

    # Estimate a standard error per model and task from the individual metrics
    for gt_id in task_ids:
        task = f"{gt_id}_{horizon}"
        printf(f"Processing {task}")
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        for col in experiment_models:
            mean_col = m_sub[col].mean()
            std_col = m_sub[col].std()
            n_col = m_sub[col].notna().sum()
            # 1000 simulated sample means, summarized to obtain the SE
            t1 = [simulate_sample_mean(n_col, mean_col, std_col) for _ in range(1000)]
            summary1 = summarize(t1, digits=6)
            # Single .loc call: chained indexing (.loc[col][task] = ...) may
            # assign to a temporary copy and leave the frame unchanged.
            m_sub_se_or.loc[col, task] = summary1['SE'][0]

    # maintain ordered list of model names (done once, after all tasks are filled in)
    m_sub_se = m_sub_se_or.T
    m_sub_se = m_sub_se[table_models].T
    if metric == "skill":
        m_sub_se = m_sub_se.multiply(100)
    if period == 'overall':
        m_sub_se['model_type'] = [all_model_types[m] for m in m_sub_se.index]
        m_sub_se = m_sub_se.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        m_sub_se.columns = m_sub_se.columns.get_level_values(0)

    # display and save dataframe in latex table format
    # (raw string keeps the LaTeX backslash without an invalid escape sequence)
    m_sub = (metrics_sub.astype(float).round(2).astype(str)
             .add(r' $\pm$ ')
             .add(m_sub_se.astype(float).round(2).astype(str)))
    display(m_sub.style
            .apply(lambda x: metrics_sub.apply(highlight_func, axis=0), axis=None)
            .apply(lambda x: metrics_sub.apply(bold_func, axis=0), axis=None))
    filename_table = f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}_se"
    table_to_tex(m_sub, tables_dir, filename_table, precision=2)
    printf(f"Table saved in {os.path.join(tables_dir, f'{filename_table}.tex')}\n")

## C.3 Yearly Average Skill
## Table 5: 
#### Average percentage skill when forecasting temperature in the contiguous U.S. The best performing models within each group are shown in bold, while the best performing models overall are shown in green.

In [None]:
# Experiment parameters: per-year skill of each model on the contiguous U.S.
# temperature tasks, one table per horizon. relative_to is reset to None
# inside the loop below because the metric is "skill".
target_dates = "std_paper"
period = "yearly"
figure_metrics = ["skill"]
table_models = main_experiment_models
relative_to = 'deb_cfsv2'
dropna = True
task_ids = [g for g in us_gt_ids if "tmp2m" in g]
task_horizons = horizons
region = 'us'
include_overall = True

# End experiment parameters

for metric, horizon in product(figure_metrics, task_horizons):

    # NOTE: strings are compared with ==/!= throughout. `is` tests object
    # identity and only matches equal strings when they happen to be interned.
    if metric == "skill":
        # For skill, climatology is excluded and no baseline is used
        table_models = [m for m in table_models if m != "climatology"]
        relative_to = None
    # Highlight/bold the maximum for RMSE relative to a baseline and for other
    # metrics without a baseline; otherwise highlight/bold the minimum
    if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
        highlight_func = highlight_max
        bold_func = bold_max
    else:
        highlight_func = highlight_min
        bold_func = bold_min

    #
    # Metric table
    #
    # Get set of tasks
    tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, task_horizons)]

    # Generate metrics dataframe for each task. The accumulator is created
    # once, before the loop, so rows from earlier tasks are not discarded.
    metrics_sub = None
    for gt_id in task_ids:
        task = f"{gt_id}_{horizon}"
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period=period, gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        # Create metrics dataframe template
        if period == "overall":
            index = pd.Index([task], name="task")
        else:
            index = pd.MultiIndex.from_product(
                [[task], m_sub.index],
                names=('task', 'period'))
        # Need to form task-by-task, since some tasks are missing target dates, so index differs
        if metrics_sub is None:
            metrics_sub = pd.DataFrame(index=index, columns=experiment_models)
        else:
            metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=experiment_models)])
        if period == "overall":
            metrics_sub.loc[task, :] = m_sub
        else:
            metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    # maintain ordered list of model names
    metrics_sub = metrics_sub[table_models]

    if metric == "skill":
        # Skill values are scaled by 100 for the table
        metrics_sub = metrics_sub.multiply(100)

    if period == 'overall':
        metrics_sub = metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    else:
        # Models become rows; (task, period) pairs become columns
        metrics_sub = metrics_sub.reindex([m for m in table_models if m in metrics_sub.columns], axis=1).T

    # save dataframe in latex table format
    table_to_tex(
        metrics_sub.astype(float), tables_dir,
        f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}",
        precision=2)

    #
    # Metric table with standard error
    #
    display(Markdown(f"##### {metric} +/- SE -- {target_dates}"))
    np.random.seed(123)
    # Preliminary read, used only to obtain the available models for the SE
    # template; gt_id is the last task processed by the loop above.
    m_sub, experiment_models = get_per_period_metrics_df(all_metrics,
                                                         period="individual",
                                                         gt_id=gt_id,
                                                         horizon=horizon,
                                                         metric=metric,
                                                         target_dates=target_dates,
                                                         relative_to=relative_to,
                                                         model_names=table_models,
                                                         include_overall=include_overall,
                                                         dropna=dropna)
    m_sub_se_or = pd.DataFrame(index=experiment_models, columns=metrics_sub.columns)

    # Estimate a standard error per model and task from the individual metrics
    for gt_id in task_ids:
        task = f"{gt_id}_{horizon}"
        printf(f"Processing {task}")
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        for col in experiment_models:
            mean_col = m_sub[col].mean()
            std_col = m_sub[col].std()
            n_col = m_sub[col].notna().sum()
            # 1000 simulated sample means, summarized to obtain the SE
            t1 = [simulate_sample_mean(n_col, mean_col, std_col) for _ in range(1000)]
            summary1 = summarize(t1, digits=6)
            # Single .loc call: chained indexing (.loc[col][task] = ...) may
            # assign to a temporary copy and leave the frame unchanged.
            m_sub_se_or.loc[col, task] = summary1['SE'][0]

    # maintain ordered list of model names (done once, after all tasks are filled in)
    m_sub_se = m_sub_se_or.T
    m_sub_se = m_sub_se[table_models].T
    if metric == "skill":
        m_sub_se = m_sub_se.multiply(100)
    if period == 'overall':
        m_sub_se['model_type'] = [all_model_types[m] for m in m_sub_se.index]
        m_sub_se = m_sub_se.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        m_sub_se.columns = m_sub_se.columns.get_level_values(0)

    # display and save dataframe in latex table format
    # (raw string keeps the LaTeX backslash without an invalid escape sequence)
    m_sub = (metrics_sub.astype(float).round(2).astype(str)
             .add(r' $\pm$ ')
             .add(m_sub_se.astype(float).round(2).astype(str)))
    display(m_sub.style
            .apply(lambda x: metrics_sub.apply(highlight_func, axis=0), axis=None)
            .apply(lambda x: metrics_sub.apply(bold_func, axis=0), axis=None))
    filename_table = f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}_se"
    table_to_tex(m_sub, tables_dir, filename_table, precision=2)
    printf(f"Table saved in {os.path.join(tables_dir, f'{filename_table}.tex')}\n")

## Table 6: 
#### Average percentage skill when forecasting precipitation in the contiguous U.S. The best performing models within each group are shown in bold, while the best performing models overall are shown in green.

In [None]:
# Experiment parameters: per-year skill of each model on the contiguous U.S.
# precipitation tasks, one table per horizon. relative_to is reset to None
# inside the loop below because the metric is "skill".
target_dates = "std_paper"
period = "yearly"
figure_metrics = ["skill"]
table_models = main_experiment_models
relative_to = 'deb_cfsv2'
dropna = True
task_ids = [g for g in us_gt_ids if "precip" in g]
task_horizons = horizons
region = 'us'
include_overall = True

# End experiment parameters

for metric, horizon in product(figure_metrics, task_horizons):

    # NOTE: strings are compared with ==/!= throughout. `is` tests object
    # identity and only matches equal strings when they happen to be interned.
    if metric == "skill":
        # For skill, climatology is excluded and no baseline is used
        table_models = [m for m in table_models if m != "climatology"]
        relative_to = None
    # Highlight/bold the maximum for RMSE relative to a baseline and for other
    # metrics without a baseline; otherwise highlight/bold the minimum
    if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
        highlight_func = highlight_max
        bold_func = bold_max
    else:
        highlight_func = highlight_min
        bold_func = bold_min

    #
    # Metric table
    #
    # Get set of tasks
    tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, task_horizons)]

    # Generate metrics dataframe for each task. The accumulator is created
    # once, before the loop, so rows from earlier tasks are not discarded.
    metrics_sub = None
    for gt_id in task_ids:
        task = f"{gt_id}_{horizon}"
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period=period, gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        # Create metrics dataframe template
        if period == "overall":
            index = pd.Index([task], name="task")
        else:
            index = pd.MultiIndex.from_product(
                [[task], m_sub.index],
                names=('task', 'period'))
        # Need to form task-by-task, since some tasks are missing target dates, so index differs
        if metrics_sub is None:
            metrics_sub = pd.DataFrame(index=index, columns=experiment_models)
        else:
            metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=experiment_models)])
        if period == "overall":
            metrics_sub.loc[task, :] = m_sub
        else:
            metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    # maintain ordered list of model names
    metrics_sub = metrics_sub[table_models]

    if metric == "skill":
        # Skill values are scaled by 100 for the table
        metrics_sub = metrics_sub.multiply(100)

    if period == 'overall':
        metrics_sub = metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    else:
        # Models become rows; (task, period) pairs become columns
        metrics_sub = metrics_sub.reindex([m for m in table_models if m in metrics_sub.columns], axis=1).T

    # save dataframe in latex table format
    table_to_tex(
        metrics_sub.astype(float), tables_dir,
        f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}",
        precision=2)

    #
    # Metric table with standard error
    #
    display(Markdown(f"##### {metric} +/- SE -- {target_dates}"))
    np.random.seed(123)
    # Preliminary read, used only to obtain the available models for the SE
    # template; gt_id is the last task processed by the loop above.
    m_sub, experiment_models = get_per_period_metrics_df(all_metrics,
                                                         period="individual",
                                                         gt_id=gt_id,
                                                         horizon=horizon,
                                                         metric=metric,
                                                         target_dates=target_dates,
                                                         relative_to=relative_to,
                                                         model_names=table_models,
                                                         include_overall=include_overall,
                                                         dropna=dropna)
    m_sub_se_or = pd.DataFrame(index=experiment_models, columns=metrics_sub.columns)

    # Estimate a standard error per model and task from the individual metrics
    for gt_id in task_ids:
        task = f"{gt_id}_{horizon}"
        printf(f"Processing {task}")
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        for col in experiment_models:
            mean_col = m_sub[col].mean()
            std_col = m_sub[col].std()
            n_col = m_sub[col].notna().sum()
            # 1000 simulated sample means, summarized to obtain the SE
            t1 = [simulate_sample_mean(n_col, mean_col, std_col) for _ in range(1000)]
            summary1 = summarize(t1, digits=6)
            # Single .loc call: chained indexing (.loc[col][task] = ...) may
            # assign to a temporary copy and leave the frame unchanged.
            m_sub_se_or.loc[col, task] = summary1['SE'][0]

    # maintain ordered list of model names (done once, after all tasks are filled in)
    m_sub_se = m_sub_se_or.T
    m_sub_se = m_sub_se[table_models].T
    if metric == "skill":
        m_sub_se = m_sub_se.multiply(100)
    if period == 'overall':
        m_sub_se['model_type'] = [all_model_types[m] for m in m_sub_se.index]
        m_sub_se = m_sub_se.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        m_sub_se.columns = m_sub_se.columns.get_level_values(0)

    # display and save dataframe in latex table format
    # (raw string keeps the LaTeX backslash without an invalid escape sequence)
    m_sub = (metrics_sub.astype(float).round(2).astype(str)
             .add(r' $\pm$ ')
             .add(m_sub_se.astype(float).round(2).astype(str)))
    display(m_sub.style
            .apply(lambda x: metrics_sub.apply(highlight_func, axis=0), axis=None)
            .apply(lambda x: metrics_sub.apply(bold_func, axis=0), axis=None))
    filename_table = f"table_{region}_{horizon}_{period}_{metric}_over_{relative_to}_{target_dates}_se"
    table_to_tex(m_sub, tables_dir, filename_table, precision=2)
    printf(f"Table saved in {os.path.join(tables_dir, f'{filename_table}.tex')}\n")

## C.4 Spatial Improvement over Mean Debiased CFSv2 RMSE
## Figure 15: 
#### Percentage improvement over mean debiased CFSv2 RMSE when forecasting temperature in the contiguous U.S. over 2011-2020. White grid points indicate negative or 0% improvement.

In [None]:
# Figure 15 inputs: U.S. temperature tasks at every horizon, evaluated with the
# 'lat_lon_rmse' metric on the 'std_paper' target dates
figure_target_dates = 'std_paper'
figure_metrics = ['lat_lon_rmse']
figure_gt_ids = [g for g in us_gt_ids if 'tmp2m' in g]
figure_horizons = horizons
figure_models = [
    'deb_cfsv2',  # relative_to
    # Row 1
    'tuned_cfsv2pp', 'tuned_climpp', 'perpp_cfsv2', 'online_learning',
    # Row 2
    'autoknn', 'climatology', 'persistence', 'linear_ensemble',
    # Row 3
    'informer', 'tuned_localboosting', 'multillr', 'nbeats',
    # Row 4
    'prophet', 'tuned_salient2',
]

# Load one metric collection per task, keyed by "<gt_id>_<horizon>"
metric_dfs_rda = {}
for gt_id, horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### Getting metrics for {gt_id} {horizon}"))
    task = f"{gt_id}_{horizon}"
    metric_dfs_rda[task] = get_models_metric_lat_lon(
        gt_id=gt_id, horizon=horizon,
        target_dates=figure_target_dates,
        metrics=figure_metrics,
        model_names=figure_models)

# Plot settings shared by every map in this figure
figure_metric = 'lat_lon_rmse'
figure_relative_to = 'deb_cfsv2'
figure_mean_metric_df = None
figure_cb_minmax = (0, 20)
figure_cb_skip = 4
figure_source_data = False
figure_show = True
figure_model_names = figure_models

# Arguments that do not change from one task to the next
_map_kwargs = dict(
    model_names=figure_model_names,
    metric=figure_metric,
    target_dates=figure_target_dates,
    relative_to=figure_relative_to,
    mean_metric_df=figure_mean_metric_df,
    show=figure_show,
    scale_type='linear',
    CB_colors_customized=["white", "green", "darkgreen"],
    CB_minmax=figure_cb_minmax,
    CB_skip=figure_cb_skip,
    source_data=figure_source_data,
)

display(Markdown(f'#### Models: {", ".join(figure_model_names)}'))
for figure_gt_id, figure_horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### {figure_gt_id} {figure_horizon}"))
    plot_metric_maps_task_ds(metric_dfs_rda,
                             gt_id=figure_gt_id,
                             horizon=figure_horizon,
                             **_map_kwargs)

## Figure 16: 
#### Percentage improvement over mean debiased CFSv2 RMSE when forecasting precipitation in the contiguous U.S. over 2011-2020. White grid points indicate negative or 0% improvement.

In [None]:
# Figure 16 inputs: U.S. precipitation tasks at every horizon, evaluated with
# the 'lat_lon_rmse' metric on the 'std_paper' target dates
figure_target_dates = 'std_paper'
figure_metrics = ['lat_lon_rmse']
figure_gt_ids = [g for g in us_gt_ids if 'precip' in g]
figure_horizons = horizons
figure_models = [
    'deb_cfsv2',  # relative_to
    # Row 1
    'tuned_cfsv2pp', 'tuned_climpp', 'perpp_cfsv2', 'online_learning',
    # Row 2
    'autoknn', 'climatology', 'persistence', 'linear_ensemble',
    # Row 3
    'informer', 'tuned_localboosting', 'multillr', 'nbeats',
    # Row 4
    'prophet', 'tuned_salient2',
]

# Load one metric collection per task, keyed by "<gt_id>_<horizon>"
metric_dfs_rda = {}
for gt_id, horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### Getting metrics for {gt_id} {horizon}"))
    task = f"{gt_id}_{horizon}"
    metric_dfs_rda[task] = get_models_metric_lat_lon(
        gt_id=gt_id, horizon=horizon,
        target_dates=figure_target_dates,
        metrics=figure_metrics,
        model_names=figure_models)

# Plot settings shared by every map in this figure
figure_metric = 'lat_lon_rmse'
figure_relative_to = 'deb_cfsv2'
figure_mean_metric_df = None
figure_cb_minmax = (0, 20)
figure_cb_skip = 4
figure_source_data = False
figure_show = True
figure_model_names = figure_models

# Arguments that do not change from one task to the next
_map_kwargs = dict(
    model_names=figure_model_names,
    metric=figure_metric,
    target_dates=figure_target_dates,
    relative_to=figure_relative_to,
    mean_metric_df=figure_mean_metric_df,
    show=figure_show,
    scale_type='linear',
    CB_colors_customized=["white", "green", "darkgreen"],
    CB_minmax=figure_cb_minmax,
    CB_skip=figure_cb_skip,
    source_data=figure_source_data,
)

display(Markdown(f'#### Models: {", ".join(figure_model_names)}'))
for figure_gt_id, figure_horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### {figure_gt_id} {figure_horizon}"))
    plot_metric_maps_task_ds(metric_dfs_rda,
                             gt_id=figure_gt_id,
                             horizon=figure_horizon,
                             **_map_kwargs)

## C.5 Spatial Bias Maps
## Figure 17: 
#### Model bias when forecasting temperature in the contiguous U.S. over 2011-2020.

In [None]:
# Figure 17 inputs: U.S. temperature tasks at every horizon, evaluated with the
# 'lat_lon_error' metric on the 'std_paper' target dates
figure_target_dates = 'std_paper'
figure_metrics = ['lat_lon_error']
figure_gt_ids = [g for g in us_gt_ids if 'tmp2m' in g]
figure_horizons = horizons
figure_models = [
    # Row 1
    'tuned_cfsv2pp', 'tuned_climpp', 'perpp_cfsv2', 'online_learning',
    # Row 2
    'deb_cfsv2', 'climatology', 'persistence', 'linear_ensemble',
    # Row 3
    'autoknn', 'informer', 'tuned_localboosting', 'multillr',
    # Row 4
    'nbeats', 'prophet', 'tuned_salient2',
]

# Load one metric collection per task, keyed by "<gt_id>_<horizon>"
metric_dfs_rda = {}
for gt_id, horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### Getting metrics for {gt_id} {horizon}"))
    task = f"{gt_id}_{horizon}"
    metric_dfs_rda[task] = get_models_metric_lat_lon(
        gt_id=gt_id, horizon=horizon,
        target_dates=figure_target_dates,
        metrics=figure_metrics,
        model_names=figure_models)

# Plot settings shared by every map in this figure; no baseline is used here
figure_metric = 'lat_lon_error'
figure_relative_to = None
figure_mean_metric_df = None
figure_cb_minmax = (-5, 5)
figure_cb_skip = 1
figure_source_data = False
figure_show = True
figure_model_names = figure_models

# Arguments that do not change from one task to the next
_map_kwargs = dict(
    model_names=figure_model_names,
    metric=figure_metric,
    target_dates=figure_target_dates,
    relative_to=figure_relative_to,
    mean_metric_df=figure_mean_metric_df,
    show=figure_show,
    scale_type='linear',
    CB_colors_customized=["darkblue", "blue", "white", "red", "darkred"],
    CB_minmax=figure_cb_minmax,
    CB_skip=figure_cb_skip,
    source_data=figure_source_data,
)

display(Markdown(f'#### Models: {", ".join(figure_model_names)}'))
for figure_gt_id, figure_horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### {figure_gt_id} {figure_horizon}"))
    plot_metric_maps_task_ds(metric_dfs_rda,
                             gt_id=figure_gt_id,
                             horizon=figure_horizon,
                             **_map_kwargs)

## Figure 18: 
#### Model bias when forecasting precipitation in the contiguous U.S. over 2011-2020.

In [None]:
# Figure 18 inputs: U.S. precipitation tasks at every horizon, evaluated with
# the 'lat_lon_error' metric on the 'std_paper' target dates
figure_target_dates = 'std_paper'
figure_metrics = ['lat_lon_error']
figure_gt_ids = [g for g in us_gt_ids if 'precip' in g]
figure_horizons = horizons
figure_models = [
    # Row 1
    'tuned_cfsv2pp', 'tuned_climpp', 'perpp_cfsv2', 'online_learning',
    # Row 2
    'deb_cfsv2', 'climatology', 'persistence', 'linear_ensemble',
    # Row 3
    'autoknn', 'informer', 'tuned_localboosting', 'multillr',
    # Row 4
    'nbeats', 'prophet', 'tuned_salient2',
]

# Load one metric collection per task, keyed by "<gt_id>_<horizon>"
metric_dfs_rda = {}
for gt_id, horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### Getting metrics for {gt_id} {horizon}"))
    task = f"{gt_id}_{horizon}"
    metric_dfs_rda[task] = get_models_metric_lat_lon(
        gt_id=gt_id, horizon=horizon,
        target_dates=figure_target_dates,
        metrics=figure_metrics,
        model_names=figure_models)

# Plot settings shared by every map in this figure; no baseline is used here
figure_metric = 'lat_lon_error'
figure_relative_to = None
figure_mean_metric_df = None
figure_cb_minmax = (-15, 15)
figure_cb_skip = 5
figure_source_data = False
figure_show = True
figure_model_names = figure_models

# Arguments that do not change from one task to the next
_map_kwargs = dict(
    model_names=figure_model_names,
    metric=figure_metric,
    target_dates=figure_target_dates,
    relative_to=figure_relative_to,
    mean_metric_df=figure_mean_metric_df,
    show=figure_show,
    scale_type='linear',
    CB_colors_customized=["orangered", "orange", "white", "green", "darkgreen"],
    CB_minmax=figure_cb_minmax,
    CB_skip=figure_cb_skip,
    source_data=figure_source_data,
)

display(Markdown(f'#### Models: {", ".join(figure_model_names)}'))
for figure_gt_id, figure_horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### {figure_gt_id} {figure_horizon}"))
    plot_metric_maps_task_ds(metric_dfs_rda,
                             gt_id=figure_gt_id,
                             horizon=figure_horizon,
                             **_map_kwargs)

## C.6 GraphCast Comparison Details
## Table 7: 
#### Average percentage skill and percentage improvement over mean debiased CFSv2 RMSE across 2018–2020 in the contiguous U.S. along with a 95% bootstrap confidence interval. The best performing model overall is shown in green.

In [None]:
# Table 7 experiment parameters
target_dates = "std_paper_graphcast"
period = "overall"  # <- must be overall to merge with rodeo dataframe
figure_metrics = ["rmse", "skill"]
table_models = graphcast_experiment_models
# Baseline model passed as relative_to when requesting metrics; it is reset to
# None below once the skill metric is processed
relative_to = 'deb_cfsv2'
dropna = True  # if true, compute average metrics only on dates where predictions have all values
task_ids = us_gt_ids  # contest_gt_ids (for contest), us_gt_ids (for us), gt_ids (for all)
horizons = ['34w']
region = 'us'  # either us, east or contest
include_overall = True  # include overall row in the dataframe
# End experiment parameters

for metric in figure_metrics:
    # Strings are compared with ==/!= throughout: `is` tests object identity
    # and only matches equal strings when the interpreter happens to share them
    if metric == "skill":
        # NOTE: these two rebindings persist for any later metric in the loop
        table_models = [m for m in table_models if m != "climatology"]
        relative_to = None
    # Highlight the maximum for relative RMSE and for non-RMSE metrics reported
    # without a baseline; highlight the minimum otherwise
    if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
        highlight_func = highlight_max
        bold_func = bold_max
    else:
        highlight_func = highlight_min
        bold_func = bold_min

    #
    # Metric table
    #
    # Get set of tasks
    tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, horizons)]
    metrics_sub = None

    # Generate metrics dataframe for each task
    for gt_id, horizon in product(task_ids, horizons):
        task = f"{gt_id}_{horizon}"
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period=period, gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)

        # Create metrics dataframe template
        if period == "overall":
            index = pd.Index([task], name="task")
        else:
            index = pd.MultiIndex.from_product(
                [[task], m_sub.index],
                names=('task', 'period'))
        # Need to form task-by-task, since some tasks are missing target dates, so index differs
        if metrics_sub is None:
            metrics_sub = pd.DataFrame(index=index, columns=experiment_models)
        else:
            metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=experiment_models)])
        if period == "overall":
            metrics_sub.loc[task, :] = m_sub
        else:
            metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    # maintain ordered list of model names
    metrics_sub = metrics_sub[table_models]
    if metric == "skill":
        metrics_sub = metrics_sub.multiply(100)

    if period == 'overall':
        # Models become rows, indexed by (model_type, display name); tasks become columns
        metrics_sub = metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    else:
        metrics_sub = metrics_sub.reindex([m for m in table_models if m in metrics_sub.columns], axis=1).T

    # save dataframe in latex table format
    table_to_tex(metrics_sub.astype(float), tables_dir, f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)

    #
    # Metric table with standard error
    #
    display(Markdown(f"##### {metric} +/- SE -- {target_dates}"))
    np.random.seed(123)
    # Read in metrics and reset experiment_models based on available metrics.
    # NOTE: gt_id and horizon still hold the last task of the loop above; this
    # call only supplies experiment_models for the standard-error template
    m_sub, experiment_models = get_per_period_metrics_df(
        all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates,
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)
    metrics_sub_se = metrics_sub.copy()
    m_sub_se_or = pd.DataFrame(index=experiment_models, columns=metrics_sub.columns)

    # Estimate a standard error per (model, task) from the individual-period metrics
    for gt_id, horizon in product(task_ids, horizons):
        task = f"{gt_id}_{horizon}"
        printf(f"Processing {task}")
        # Read in metrics and reset experiment_models based on available metrics
        m_sub, experiment_models = get_per_period_metrics_df(
            all_metrics, period="individual", gt_id=gt_id, horizon=horizon,
            metric=metric, target_dates=target_dates,
            relative_to=relative_to,
            model_names=table_models, include_overall=include_overall, dropna=dropna)
        # Switch to the display name used for the columns of metrics_sub
        task = us_tasks[task]
        for col in experiment_models:
            mean_col = m_sub[col].mean()
            std_col = m_sub[col].std()
            n_col = m_sub[col].notna().sum()
            t1 = [simulate_sample_mean(n_col, mean_col, std_col) for _ in range(1000)]
            summary1 = summarize(t1, digits=6)
            # Single .loc call: the chained form .loc[col][task] may assign to
            # a temporary copy and leave the dataframe unchanged
            m_sub_se_or.loc[col, task] = summary1['SE'][0]

    # Arrange the standard errors like metrics_sub. This only depends on the
    # fully populated m_sub_se_or, so it is done once after the task loop
    # maintain ordered list of model names
    m_sub_se = m_sub_se_or.T
    m_sub_se = m_sub_se[table_models].T
    if metric == "skill":
        m_sub_se = m_sub_se.multiply(100)
    if period == 'overall':
        m_sub_se['model_type'] = [all_model_types[m] for m in m_sub_se.index]
        m_sub_se = m_sub_se.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        m_sub_se.columns = m_sub_se.columns.get_level_values(0)

    # display and save dataframe in latex table format
    # (raw string so that \p is not parsed as an escape sequence)
    m_sub = metrics_sub.astype(float).round(2).astype(str).add(r' $\pm$ ').add(m_sub_se.astype(float).round(2).astype(str))
    display(m_sub.style.apply(lambda x: metrics_sub.apply(highlight_func, axis=0), axis=None).apply(lambda x: metrics_sub.apply(bold_func, axis=0), axis=None))
    filename_table = f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}_se"
    table_to_tex(m_sub, tables_dir, filename_table, precision=2)
    printf(f"Table saved in {os.path.join(tables_dir, f'{filename_table}.tex')}\n")


## Figure 19: 
#### Percentage improvement over mean debiased CFSv2 RMSE when forecasting temperature and precipitation in the contiguous U.S. over 2018-2020. White grid points indicate negative or 0% improvement.

In [None]:
# Figure 19 inputs: all U.S. tasks at the '34w' horizon, evaluated with the
# 'lat_lon_rmse' metric on the 'std_paper_graphcast' target dates
figure_target_dates = 'std_paper_graphcast'
figure_metrics = ['lat_lon_rmse']
figure_gt_ids = us_gt_ids
figure_horizons = ['34w']
figure_models = [
    'deb_cfsv2',        # relative_to
    'graphcast',        # Model 1
    'linear_ensemble',  # Model 2
    'online_learning',  # Model 3
]

# Load one metric collection per task, keyed by "<gt_id>_<horizon>"
metric_dfs_rda = {}
for gt_id, horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### Getting metrics for {gt_id} {horizon}"))
    task = f"{gt_id}_{horizon}"
    metric_dfs_rda[task] = get_models_metric_lat_lon(
        gt_id=gt_id, horizon=horizon,
        target_dates=figure_target_dates,
        metrics=figure_metrics,
        model_names=figure_models)

# Plot settings shared by every map in this figure
figure_metric = 'lat_lon_rmse'
figure_relative_to = 'deb_cfsv2'
figure_mean_metric_df = None
figure_cb_minmax = (0, 30)
figure_cb_skip = 5
figure_source_data = False
figure_show = True
figure_model_names = figure_models

# Arguments that do not change from one task to the next
_map_kwargs = dict(
    model_names=figure_model_names,
    metric=figure_metric,
    target_dates=figure_target_dates,
    relative_to=figure_relative_to,
    mean_metric_df=figure_mean_metric_df,
    show=figure_show,
    scale_type='linear',
    CB_colors_customized=["white", "green", "darkgreen"],
    CB_minmax=figure_cb_minmax,
    CB_skip=figure_cb_skip,
    source_data=figure_source_data,
)

display(Markdown(f'#### Models: {", ".join(figure_model_names)}'))
for figure_gt_id, figure_horizon in product(figure_gt_ids, figure_horizons):
    display(Markdown(f"#### {figure_gt_id} {figure_horizon}"))
    plot_metric_maps_task_ds(metric_dfs_rda,
                             gt_id=figure_gt_id,
                             horizon=figure_horizon,
                             **_map_kwargs)


## Figure 20: 
#### Model bias when forecasting temperature and precipitation in the contiguous U.S. over 2018-2020.

In [None]:
# Generate figure metrics: all U.S. tasks at the '34w' horizon, evaluated with
# the 'lat_lon_error' metric on the 'std_paper_graphcast' target dates
figure_gt_ids = us_gt_ids
figure_horizons = ['34w']
figure_target_dates = 'std_paper_graphcast'
figure_metrics = ['lat_lon_error']
figure_models = [
    # Model 1
    'graphcast',
    # Model 2
    'linear_ensemble',
    # Model 3
    'online_learning',
]

# Load one metric collection per task, keyed by "<gt_id>_<horizon>"
metric_dfs_rda = {}
for gt_id, horizon in product(figure_gt_ids, figure_horizons):
    task = f"{gt_id}_{horizon}"
    display(Markdown(f"#### Getting metrics for {gt_id} {horizon}"))
    metric_dfs_rda[task] = get_models_metric_lat_lon(gt_id=gt_id, horizon=horizon,
                                                     target_dates=figure_target_dates,
                                                     metrics=figure_metrics,
                                                     model_names=figure_models)

# Set figure parameters
figure_metric = 'lat_lon_error'
figure_relative_to = None
figure_mean_metric_df = None
figure_source_data = False
figure_show = True

figure_model_names = figure_models
display(Markdown(f'#### Models: {", ".join(figure_model_names)}'))
for figure_gt_id, figure_horizon in product(figure_gt_ids, figure_horizons):
    # Choose the colorbar range from the task being plotted. Testing the
    # loading loop's leftover gt_id instead would give every map the range of
    # whichever task was loaded last.
    if 'tmp2m' in figure_gt_id:
        figure_cb_minmax = (-6, 6)
        figure_cb_skip = 3
    elif 'precip' in figure_gt_id:
        figure_cb_minmax = (-15, 15)
        figure_cb_skip = 5

    display(Markdown(f"#### {figure_gt_id} {figure_horizon}"))
    plot_metric_maps_task_ds(metric_dfs_rda, model_names=figure_model_names,
                             gt_id=figure_gt_id,
                             horizon=figure_horizon,
                             metric=figure_metric,
                             target_dates=figure_target_dates,
                             relative_to=figure_relative_to,
                             mean_metric_df=figure_mean_metric_df,
                             show=figure_show,
                             scale_type='linear',
                             CB_colors_customized=["darkblue", "blue", "white", "red", "darkred"],
                             CB_minmax=figure_cb_minmax,
                             CB_skip=figure_cb_skip,
                             source_data=figure_source_data)

## C.7 Western U.S. Competition Results
## Table 8: 
#### Percentage improvement over mean debiased CFSv2 RMSE over 26 contest dates (2019-2020) in the Western U.S. The best performing models within each class of models are shown in bold, while the best performing models overall are shown in green.

In [None]:
#
# Rodeo experiment tables
#
target_dates = "std_contest"
period = "overall"  # <- must be overall to merge with rodeo dataframe
metric = "rmse"
table_models = rodeo_experiment_models
relative_to = 'TC_CFSv2'  # leaderboard column used as the baseline
dropna = True  # if true, compute average metrics only on dates where predictions have all values
tasks = contest_tasks
horizons = ['34w', '56w']
region = 'contest'
include_overall = True  # include overall row in the dataframe
# End experiment parameters

# Strings are compared with ==/!=: `is` tests object identity and only matches
# equal strings when the interpreter happens to share them.
# Highlight the maximum for relative RMSE and for non-RMSE metrics reported
# without a baseline; highlight the minimum otherwise
if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
    highlight_func = highlight_max
    bold_func = bold_max
else:
    highlight_func = highlight_min
    bold_func = bold_min

# Create metrics dataframe template
metric_tasks = pd.DataFrame(index=tasks, columns=table_models)

# Get Topcoder CFSv2 baseline
baseline_TC_df = get_leaderboard(metric, drop_columns=['Mouatadid'])[relative_to]

# Generate metrics dataframe for each task
for gt_id, horizon in product(contest_gt_ids, horizons):
    task = f"{gt_id}_{horizon}"

    # Read in metrics and reset table_models based on available metrics.
    # relative_to=None here: the comparison to the baseline is applied below
    metric_tasks.loc[task], table_models = get_per_period_metrics_df(
        all_metrics, period=period, gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates,
        relative_to=None,
        model_names=table_models, include_overall=include_overall, dropna=dropna)

# Get leaderboard dataframe
leaderboard = get_leaderboard(metric,
                              relative_to=relative_to,
                              baseline_df=baseline_TC_df,
                              drop_columns=['Mouatadid'])

# Calculate percentage improvement over Topcoder CFSv2 RMSE
metric_tasks = pd.merge(baseline_TC_df, metric_tasks, left_index=True, right_index=True).astype(float)
metric_tasks = metric_tasks.apply(partial(bss_score, metric_tasks.columns, relative_to), axis=1)

# Concat metric dataframe for model_names with leaderboard dataframe
metric_tasks = pd.merge(leaderboard, metric_tasks, left_index=True, right_index=True).T.astype(float)
metric_tasks = metric_tasks[list(tasks)]

# Map input names to display names
metric_tasks = metric_tasks.rename(contest_tasks, axis=1).rename(all_model_names, axis=0)
# Keep only these rows, in this order
metric_tasks = metric_tasks.loc[[
    'TC_Salient', 'TC_Climatology',
    '1st place', '2nd place', '3rd place',
    'AutoKNN', 'LocalBoosting', 'MultiLLR', 'Prophet', 'Salient 2.0',
    'Climatology++', 'CFSv2++', 'Persistence++',
    'Uniform ABC', 'Online ABC',
]]

print(f"{period} {target_dates}")

# Apply the highlight/bold functions along axis 0 for the overall table and
# along axis 1 otherwise
if period == 'overall':
    display(metric_tasks.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
else:
    display(metric_tasks.style.apply(highlight_func, axis=1).apply(bold_func, axis=1).set_table_styles(styles))

# save dataframe in latex table format
table_to_tex(metric_tasks.astype(float).round(2), out_dir, f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)


## C.8 Salient 2.0 Dry Bias

## Figure 21: 
#### Temporal plot (left) and scatter plot (right) of yearly total precipitation and percentage improvement over mean debiased CFSv2 RMSE in the Western U.S. across 2011-2020.

In [None]:
#
# Build a dictionary of prediction dataframes keyed by (task, target_dates)
# for the models compared in the Salient 2.0 dry-bias figure
#
all_preds = {}

# Only the last contest gt_id and the first horizon are processed, on 'std_paper'
for gt_id, horizon, target_dates in product(contest_gt_ids[-1:], horizons[:1], ['std_paper']):
    model_names = ["deb_cfsv2", "tuned_cfsv2pp", "tuned_salient2"]
    model_names_str = "salient_dry_bias_models"
    task = f"{gt_id}_{horizon}"

    display(Markdown(f"### {model_names_str}: {task}, {target_dates}"))

    # Only the first element returned by get_trio_df is kept
    print(f"Creating dataframes for models:\n {model_names}\n")
    df_preds, _, _ = get_trio_df(gt_id=gt_id, horizon=horizon,
                                 target_dates=target_dates,
                                 model_names=model_names)
    print("DONE!\n")

    # Nothing to store when no models exist for this task
    if df_preds is not None:
        # Add yearly and quarterly columns to the dataframe
        print("\nAdding group-by columns to dataframes...")
        tic()
        df_preds = add_groupby_cols(df_preds, horizon=horizon)
        toc()
        print("DONE!\n")

        all_preds[task, target_dates] = copy.copy(df_preds.reset_index('start_date'))


# Figure parameters
figure_models = ['deb_cfsv2', 'tuned_cfsv2pp', 'gt', 'tuned_salient2']
target_dates = "std_paper"
period = "yearly"
metric = "rmse"
relative_to = 'deb_cfsv2'
task_ids = ['contest_precip']
task_horizons = ['34w']
file_str = f"contest_{period}_over_{relative_to}"
# End experiment parameters

print(target_dates)

# Both subfigures receive exactly the same arguments
_subfigure_kwargs = dict(
    gt_id_list=task_ids,
    horizon_list=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
    period=period,
    relative_to=relative_to,
    metric=metric,
    file_str=file_str,
)
# Plot line subfigure
plot_models_metrics_preds_line(all_metrics, all_preds, **_subfigure_kwargs)
# Plot scatter subfigure
plot_models_metrics_preds_scatter(all_metrics, all_preds, **_subfigure_kwargs)