# Benchmark Plots

In [None]:
""" 
Generates figures and LaTeX tables for 

Learned Benchmarks for Subseasonal Forecasting

Soukayna Mouatadid, Paulo Orenstein, Genevieve Flaspohler, Miruna Oprescu, 
Judah Cohen, Franklyn Wang, Sean Knight, Maria Geogdzhayeva, Sam Levang, 
Ernest Fraenkel, and Lester Mackey. 
"""
# Ensure notebook is being run from base repository directory
import os, sys
try:
    # The repository is expected to live in the current user's /home directory.
    os.chdir(f"/home/{os.environ['USER']}/forecast_rodeo_ii/")
except Exception as chdir_error:
    # Best effort: report the failure and keep running from the current directory.
    print(f"Warning: unable to change directory; {chdir_error!r}")
    
%load_ext autoreload
%autoreload 2
%matplotlib inline    
    
import itertools
import importlib
import subprocess
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product
from functools import partial

from IPython.display import Markdown, display

import copy
import pdb
import calendar 
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib   
from matplotlib.gridspec import GridSpec

from subseasonal_toolkit.utils.experiments_util import pandas2hdf
from subseasonal_toolkit.utils.general_util import printf
from subseasonal_toolkit.utils.eval_util import get_target_dates, score_to_mean_rmse, contest_quarter_start_dates, contest_quarter
from subseasonal_toolkit.utils.models_util import get_selected_submodel_name
from subseasonal_toolkit.utils.viz_util import *

# Seaborn defaults for every plot in this notebook: 8x6 figures, base font scale
sns.set(font_scale=1, rc={"figure.figsize": (8, 6)})

#
# Directory for saving output (tables and figures are written here)
#
out_dir = f"/home/{os.environ['USER']}/forecast_rodeo_ii/subseasonal_toolkit/viz"


In [None]:
#
# Regions, metrics, horizons, and target-date sets used throughout the notebook
#
metrics = ["rmse", "skill", "score"]

# Ground truth ids per region: one temperature and one precipitation id each
contest_gt_ids = [f"contest_{variable}" for variable in ("tmp2m", "precip")]
us_gt_ids = [f"us_{variable}" for variable in ("tmp2m", "precip")]
east_gt_ids = [f"east_{variable}" for variable in ("tmp2m", "precip")]
us_1_5_gt_ids = [f"us_{variable}_1.5x1.5" for variable in ("tmp2m", "precip")]

# All ground truth ids (contest ids first, then U.S. ids)
gt_ids = [*contest_gt_ids, *us_gt_ids]

horizons = ["34w", "56w"]
target_eval_dates = ["std_paper", "std_contest"]

# The full set of models we may evaluate in some experiment.
# Every entry ends with a comma: adjacent string literals without one are
# silently concatenated into a single (wrong) model name.
all_models = [
    # Raw Baselines
    'raw_cfsv2',
    # Baselines
    "climatology",
    'deb_cfsv2',
    'persistence',
    # ECMWF
    'ecmwf',
    # Toolkit
    'tuned_climpp',
    'tuned_cfsv2pp',
    'perpp',
    # Learning
    'autoknn',
    'informer',
    'tuned_localboosting',
    'multillr',
    'nbeats',
    'prophet',
    'salient',
    'tuned_salient2',
    # Ensembles
    'linear_ensemble',
    'online_learning',
]

# Models compared in the main (U.S.) experiment
main_experiment_models = [
    # Baselines
    "climatology", "deb_cfsv2", "persistence",
    # Toolkit
    "tuned_climpp", "tuned_cfsv2pp", "perpp",
    # Learning
    "autoknn", "tuned_localboosting", "multillr", "nbeats",
    "informer", "prophet", "tuned_salient2",
    # Ensembles
    "linear_ensemble", "online_learning",
]

# Models compared in the Rodeo (contest) experiment
rodeo_experiment_models = [
    # Baselines
    "climatology", "deb_cfsv2", "persistence",
    # Toolkit
    "tuned_climpp", "tuned_cfsv2pp", "perpp",
    # Learning
    "autoknn", "tuned_localboosting", "multillr", "prophet", "tuned_salient2",
    # Ensembles (explicitly selected submodel names)
    "linear_ensemble_localFalse_dynamicFalse_stepFalse_LtCtD",
    "linear_ensemble_localFalse_dynamicFalse_stepFalse_AMLPtCtDtKtS",
    "online_learning-ah_rpNone_R1_recent_g_SC_LtCtD",
    "online_learning-ah_rpNone_R1_recent_g_SC_AMLPtCtDtKtS",
]

# Models compared in the Salient experiment
salient_experiment_models = [
    "deb_cfsv2",       # Baselines
    "tuned_cfsv2pp",   # Toolkit
    "tuned_salient2",  # Learning
]


# Models compared in the ECMWF experiment
ecmwf_experiment_models = [
    # Baselines
    "climatology", "deb_cfsv2", "persistence",
    # Toolkit
    "tuned_climpp", "tuned_cfsv2pp", "perpp",
    # ECMWF (explicitly selected submodel names)
    "ecmwf-years20_leads15-15_lossmse_forecastc_debiasp+c",
    "ecmwf-years20_leads15-15_lossmse_forecastp_debiasp+c",
    # Ensembles
    "online_learning", "linear_ensemble",
]

In [None]:
#
# Dictionaries mapping task names to their display names, one per region.
# Every region has the same four tasks (temperature/precipitation at weeks
# 3-4 and 5-6), so the three dictionaries are generated from one template.
#
east_tasks, contest_tasks, us_tasks = (
    {
        f"{region}_{variable}_{horizon}": f"{label} weeks {weeks}"
        for variable, label in (("tmp2m", "Temp."), ("precip", "Precip."))
        for horizon, weeks in (("34w", "3-4"), ("56w", "5-6"))
    }
    for region in ("east", "contest", "us")
)


## Model list for generating predictions
`tuned_climpp,tuned_cfsv2pp,tuned_localboosting,tuned_salient_fri,perpp,multillr,autoknn,raw_cfsv2,nbeats_final,prophet`
## Model list for tuning
`climpp,cfsv2,catboost,salient_fri`
## Tuned models for metrics
`tuned_climpp,tuned_cfsv2pp,tuned_localboosting,tuned_salient_fri`

## Read in all metrics for all tasks and all models
Reads metrics, generates a summary of missing data, and produces the `all_metrics` dictionary to be used in further analysis. 

In [None]:
"""
Generate a dictionary with metric values for all models and every combination of gt_id, 
horizon, and target dates
"""


# TODO: change to add us_gt_ids once metrics are ready
all_metrics = {}

# Get metrics for main experiment, rodeo experiment, salient experiment and ecmwf experiment
for metric, gt_id, horizon, target_dates in \
        [x for x in product(['rmse', 'skill'], us_gt_ids, horizons, ['std_paper'])] \
        +[x for x in product(['rmse'], contest_gt_ids, horizons, ['std_contest'])] \
        +[x for x in product(['rmse'], contest_gt_ids, horizons, ['std_paper'])] \
        +[x for x in product(['rmse', 'skill'], us_1_5_gt_ids, horizons, ['std_ecmwf'])]: 
   
    
    #Set model names   
    if 'us' in gt_id:
        model_names = ecmwf_experiment_models if '1.5x1.5' in gt_id else main_experiment_models
        model_names_str = 'ecmwf_experiment_models' if '1.5x1.5' in gt_id else 'main_experiment_models'
    elif 'contest' in gt_id:
        model_names = rodeo_experiment_models if 'contest' in target_dates else salient_experiment_models
        model_names_str = 'rodeo_experiment_models' if 'contest' in target_dates else 'salient_experiment_models'
    else:
        model_names = all_models
        model_names_str = 'all_models'


    # Get task
    task = f"{gt_id}_{horizon}"

    #display(Markdown(f"### Loading metric {metric} for task {task} and dates {target_dates}"))
    display(Markdown(f"### {model_names_str}: {metric}, {task}, {target_dates}"))

    # Get all metrics
    df = get_metrics_df(gt_id, horizon, metric, target_dates, model_names=model_names)

    # No models exist for this task    
    if df is None: 
        continue

    # Add yearly and quarterly columns to the dataframe
    df = add_groupby_cols(df, horizon=horizon)

    all_metrics[(metric, task, target_dates)] = copy.copy(df)
    #print(all_metrics)

    if metric in ['rmse', 'skill']:
        key = (metric, task, target_dates)
        try:        
            missing_df = all_metrics[key].loc[(all_metrics[key][model_names].isnull().any(axis=1)),:]
        except:        
            missing_df = all_metrics[key].loc[(all_metrics[key].isnull().any(axis=1)),:]            
        if missing_df.shape[0] != 0:
            True
            display(Markdown(f"#### Missing metrics"))
            display(missing_df)    
        else:
            printf("All metrics present.")

## Getting metrics 
After generating the above `all_metrics` dictionary, you can get the average metric value for a set of models and specific period using the following function:
```
df, available_models = get_per_period_metrics_df(all_metrics, period="quarterly", gt_id="us_tmp2m", horizon="34w", metric="rmse", target_dates="std_paper", model_names=main_experiment_models)
```

The period can be `quarterly`, `yearly`, or `quarterly_yearly` (returns average metric values for YY1-Q1, YY1-Q2, ..., YY2-Q1, etc.).

# TABLES
## Table 1: 
### U.S. 2011-2020: % improvement over deb. CFSv2
This code produces tables to analyze model performance over different periods. 

In [None]:
"""
Paper experiments; can be configured to generate metrics for any subset of models, averaged over
a set of periods, for a given target data. Produces a table and saves to tex. 
"""

target_dates = "std_paper" 

# quarterly (seasonal quarters), contest_quarterly (contest quarters), 
# monthly, yearly, individual (return full dataframe), overall (return mean of full dataframe)
# monthly_yearly (every month in every year), quarterly_yearly (every quarter in every year)
period = "overall" # <- must be overall to merge with rodeo dataframe
metric = "rmse" 
table_models = main_experiment_models 
relative_to = 'deb_cfsv2' # compute value relative to climatology value: 1 - metric(model)/metric(climatology) 
dropna = True # if true, compute average metrics only on dates where predictions have all values 
task_ids = us_gt_ids # contest_gt_ids (for contest), us_gt_ids (for us), gt_ids (for all)
horizons = horizons
region = 'us' #either us, east or contest
include_overall = True # include overall row in the dataframe


"""
End experiment parameters 
"""
if (metric is 'rmse' and relative_to is not None) or (metric is not 'rmse' and relative_to is None):
    highlight_func = highlight_max
    bold_func = bold_max
else:
    highlight_func = highlight_min
    bold_func = bold_min


# Get set of tasks 
tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, horizons)]

metrics_sub = None

# Generate metrics dataframe for each task
for i, (gt_id, horizon) in enumerate(product(task_ids, horizons)):
    task = f"{gt_id}_{horizon}"
    print(task)
    
    
    # Read in metrics and reset experiment_models based on avaliable metrics
    m_sub, experiment_models = get_per_period_metrics_df(
        all_metrics, period=period, gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates, 
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)
    
    
   # Create metrics dataframe template
    if period == "overall":
        index = pd.Index([task], name="task")            
    else:
        index = pd.MultiIndex.from_product(
            [[task], m_sub.index], 
            names=('task', 'period'))   
        
    # Need to form task-by-task, since some tasks are missing target dates, so index differs
    if metrics_sub is None:
        metrics_sub = pd.DataFrame(index=index, columns=experiment_models)           
    else:
        metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=experiment_models)])
        
    if period == "overall":
        metrics_sub.loc[task, :] = m_sub
    else:
        metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

print(f"{metric} - {target_dates}");
#if relative_to is not None:
#    metrics_sub.drop(relative_to, axis=1, inplace=True)

# maintain ordered list of model names
metrics_sub = metrics_sub[[m for m in main_experiment_models if m in table_models]]

if period is 'overall':
    metrics_sub=metrics_sub.rename(us_tasks, axis=0).T
    metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
    metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
    metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
    #for group in ['Baselines', 'Toolkit', 'Learning', 'Ensembles']:
        #display(metrics_sub.loc[group].style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
else:
    metrics_sub = metrics_sub.reindex([m for m in table_models if m in metrics_sub.columns], axis=1).T
    display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))

#save dataframe in latex table format
table_to_tex(metrics_sub.astype(float), out_dir, f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)



## Table 1: 
### U.S. 2011-2020: % skill
This code produces tables to analyze model performance over different periods. 

In [None]:
"""
Paper experiments; can be configured to generate metrics for any subset of models, averaged over
a set of periods, for a given target data. Produces a table and saves to tex. 
"""

target_dates = "std_paper" 

# quarterly (seasonal quarters), contest_quarterly (contest quarters), 
# monthly, yearly, individual (return full dataframe), overall (return mean of full dataframe)
# monthly_yearly (every month in every year), quarterly_yearly (every quarter in every year)
period = "overall" # <- must be overall to merge with rodeo dataframe
metric = "skill" # rmse, skill, score
table_models = main_experiment_models #rodeo_main_models # rodeo_appendix_models (all models), rodeo_main_models (toolkit models)
relative_to = None # compute value relative to climatology value: 1 - metric(model)/metric(climatology) 
dropna = True # if true, compute average metrics only on dates where predictions have all values 
task_ids = us_gt_ids # contest_gt_ids (for contest), us_gt_ids (for us), gt_ids (for all)
horizons = horizons
region = 'us' #either us, east or contest
include_overall = True # include overall row in the dataframe


"""
End experiment parameters 
"""
if (metric is 'rmse' and relative_to is not None) or (metric is not 'rmse' and relative_to is None):
    highlight_func = highlight_max
    bold_func = bold_max
else:
    highlight_func = highlight_min
    bold_func = bold_min

    
    
# Include overall performance?
include_overall = True

# Get set of tasks 
tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, horizons)]

metrics_sub = None

# Generate metrics dataframe for each task
for i, (gt_id, horizon) in enumerate(product(task_ids, horizons)):
    task = f"{gt_id}_{horizon}"
    print(task)
    
    
    # Read in metrics and reset table_models based on avaliable metrics
    m_sub, table_models = get_per_period_metrics_df(
        all_metrics, period=period, gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates, 
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)
    
    
   # Create metrics dataframe template
    if period == "overall":
        index = pd.Index([task], name="task")            
    else:
        index = pd.MultiIndex.from_product(
            [[task], m_sub.index], 
            names=('task', 'period'))   
        
    # Need to form task-by-task, since some tasks are missing target dates, so index differs
    if metrics_sub is None:
        metrics_sub = pd.DataFrame(index=index, columns=table_models)           
    else:
        metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=table_models)])
        
    if period == "overall":
        metrics_sub.loc[task, :] = m_sub
    else:
        metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

print(f"{metric} - {target_dates}");

metrics_sub = metrics_sub[[m for m in main_experiment_models if m in table_models]]

if period is 'overall':
    metrics_sub=metrics_sub.rename(us_tasks, axis=0).T
    metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
    metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
    metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    metrics_sub = metrics_sub.apply(lambda x: 100*x)#T
    #print(metrics_sub.round(2))
    display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
    #for group in ['Baselines', 'Toolkit', 'Learning', 'Ensembles']:
        #display(metrics_sub.loc[group].style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))

else:
    metrics_sub = metrics_sub.reindex([m for m in main_experiment_models if m in metrics_sub.columns], axis=1).T
    display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))

#save dataframe in latex table format
table_to_tex(metrics_sub.astype(float), out_dir, f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)


## Table 2: 
### ECMWF U.S. 2011-2020: % improvement over deb. CFSv2
This code produces tables to analyze model performance over different periods. 

In [None]:
"""
Paper experiments; can be configured to generate metrics for any subset of models, averaged over
a set of periods, for a given target data. Produces a table and saves to tex. 
"""

target_dates = "std_ecmwf" 

# quarterly (seasonal quarters), contest_quarterly (contest quarters), 
# monthly, yearly, individual (return full dataframe), overall (return mean of full dataframe)
# monthly_yearly (every month in every year), quarterly_yearly (every quarter in every year)
period = "overall" # <- must be overall to merge with rodeo dataframe
metric = "rmse" # rmse, skill, score
table_models = ecmwf_experiment_models #rodeo_main_models # rodeo_appendix_models (all models), rodeo_main_models (toolkit models)
relative_to = 'deb_cfsv2' # compute value relative to climatology value: 1 - metric(model)/metric(climatology) 
dropna = False # if true, compute average metrics only on dates where predictions have all values 
task_ids = us_1_5_gt_ids # contest_gt_ids (for contest), us_gt_ids (for us), gt_ids (for all)
task_horizons = horizons
region = 'us_1.5x1.5' #either us, east or contest
include_overall = True # include overall row in the dataframe


"""
End experiment parameters 
"""
if (metric is 'rmse' and relative_to is not None) or (metric is not 'rmse' and relative_to is None):
    highlight_func = highlight_max
    bold_func = bold_max
else:
    highlight_func = highlight_min
    bold_func = bold_min

    
    
# Include overall performance?
include_overall = True

# Get set of tasks 
tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, task_horizons)]

metrics_sub = None

# Generate metrics dataframe for each task
for i, (gt_id, horizon) in enumerate(product(task_ids, task_horizons)):
    task = f"{gt_id}_{horizon}"
    print(task)
    
    
    # Read in metrics and reset table_models based on avaliable metrics
    m_sub, table_models = get_per_period_metrics_df(
        all_metrics, period=period, gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates, 
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)

    
   # Create metrics dataframe template
    if period == "overall":
        index = pd.Index([task], name="task")            
    else:
        index = pd.MultiIndex.from_product(
            [[task], m_sub.index], 
            names=('task', 'period'))   
        
    # Need to form task-by-task, since some tasks are missing target dates, so index differs
    if metrics_sub is None:
        metrics_sub = pd.DataFrame(index=index, columns=table_models)           
    else:
        metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=table_models)])
     
    
#    print(metrics_sub)
    
    if period == "overall":
        metrics_sub.loc[task, :] = m_sub
    else:
        metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

print(f"{metric} - {target_dates}");

metrics_sub = metrics_sub[[m for m in ecmwf_experiment_models if 'ecmwf' not in m]+[m for m in metrics_sub.columns if 'ecmwf' in m]] 



if period is 'overall':
    metrics_sub=metrics_sub.rename(us_tasks, axis=0).T
    metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
    metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
    metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    #metrics_sub = metrics_sub.apply(lambda x: 100*x)#T
    display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
    #for group in ['Baselines', 'Toolkit', 'ECMWF', 'Ensembles']:
    #    display(metrics_sub.loc[group].style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
else:
    metrics_sub = metrics_sub.reindex([m for m in ecmwf_experiment_models if m in metrics_sub.columns], axis=1).T
    display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))

#save dataframe in latex table format
table_to_tex(metrics_sub.astype(float), out_dir, f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)


## Table 2:
### ECMWF U.S. 2011-2020: % skill
This code produces tables to analyze model performance over different periods. 

In [None]:
"""
Paper experiments; can be configured to generate metrics for any subset of models, averaged over
a set of periods, for a given target data. Produces a table and saves to tex. 
"""

target_dates = "std_ecmwf" 

# quarterly (seasonal quarters), contest_quarterly (contest quarters), 
# monthly, yearly, individual (return full dataframe), overall (return mean of full dataframe)
# monthly_yearly (every month in every year), quarterly_yearly (every quarter in every year)
period = "overall" # <- must be overall to merge with rodeo dataframe
metric = "skill" # rmse, skill, score
table_models = ecmwf_experiment_models #rodeo_main_models # rodeo_appendix_models (all models), rodeo_main_models (toolkit models)
relative_to = None # compute value relative to climatology value: 1 - metric(model)/metric(climatology) 
dropna = True # if true, compute average metrics only on dates where predictions have all values 
task_ids = us_1_5_gt_ids # contest_gt_ids (for contest), us_gt_ids (for us), gt_ids (for all)
task_horizons = horizons
region = 'us_1.5x1.5' #either us, east or contest
include_overall = True # include overall row in the dataframe


"""
End experiment parameters 
"""
if (metric is 'rmse' and relative_to is not None) or (metric is not 'rmse' and relative_to is None):
    highlight_func = highlight_max
    bold_func = bold_max
else:
    highlight_func = highlight_min
    bold_func = bold_min

    
    
# Include overall performance?
include_overall = True

# Get set of tasks 
tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, task_horizons)]

metrics_sub = None

# Generate metrics dataframe for each task
for i, (gt_id, horizon) in enumerate(product(task_ids, task_horizons)):
    task = f"{gt_id}_{horizon}"
    print(task)
    
    
    # Read in metrics and reset table_models based on avaliable metrics
    m_sub, table_models = get_per_period_metrics_df(
        all_metrics, period=period, gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates, 
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)
    
    
   # Create metrics dataframe template
    if period == "overall":
        index = pd.Index([task], name="task")            
    else:
        index = pd.MultiIndex.from_product(
            [[task], m_sub.index], 
            names=('task', 'period'))   
        
    # Need to form task-by-task, since some tasks are missing target dates, so index differs
    if metrics_sub is None:
        metrics_sub = pd.DataFrame(index=index, columns=table_models)           
    else:
        metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=table_models)])
        
    if period == "overall":
        metrics_sub.loc[task, :] = m_sub
    else:
        metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

print(f"{metric} - {target_dates}");

metrics_sub = metrics_sub[[m for m in ecmwf_experiment_models if 'ecmwf' not in m and 'climatology' not in m]+[m for m in metrics_sub.columns if 'ecmwf' in m]] 


if period is 'overall':
    metrics_sub=metrics_sub.rename(us_tasks, axis=0).T
    metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
    metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
    metrics_sub.columns = metrics_sub.columns.get_level_values(0)
    metrics_sub = metrics_sub.apply(lambda x: 100*x)
    display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
    #for group in ['Baselines', 'Toolkit', 'ECMWF', 'Ensembles']:
    #    display(metrics_sub.loc[group].style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))

else:
    metrics_sub = metrics_sub.reindex([m for m in ecmwf_experiment_models if m in metrics_sub.columns], axis=1).T
    display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))

#save dataframe in latex table format
table_to_tex(metrics_sub.astype(float), out_dir, f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)


## Table C1, C2:  
### %RMSE improv. per year
This code produces tables to analyze model performance over different periods for a single task. 

In [None]:
"""
Paper experiments; can be configured to generate metrics for any subset of models, averaged over
a set of periods, for a given target data. Produces a table and saves to tex. 
"""

target_dates = "std_paper" 
period = "yearly" # <- must be overall to merge with rodeo dataframe
metric = "rmse"
table_models = main_experiment_models 
relative_to = 'deb_cfsv2' # compute value relative to climatology value: 1 - metric(model)/metric(climatology) 
dropna = True # if true, compute average metrics only on dates where predictions have all values 
task_ids = us_gt_ids 
task_horizons = horizons
region = 'us' 
include_overall = False # include overall row in the dataframe

"""
End experiment parameters 
"""
if (metric is 'skill') or (metric is 'rmse' and relative_to is not None) or (metric is not 'rmse' and relative_to is None):
    highlight_func = highlight_max
    bold_func = bold_max
else:
    highlight_func = highlight_min
    bold_func = bold_min
    
    
# Include overall performance?
include_overall = True

# Get set of tasks 
tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, horizons)]



# Generate metrics dataframe for each task
for i, (gt_id, horizon) in enumerate(product(task_ids, task_horizons)):
    task = f"{gt_id}_{horizon}"
    print(task)
    print(metric)
    
    metrics_sub = None
    
    # Read in metrics and reset table_models based on avaliable metrics
    m_sub, table_models = get_per_period_metrics_df(
        all_metrics, period=period, gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates, 
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)
    
   # Create metrics dataframe template
    if period == "overall":
        index = pd.Index([task], name="task")            
    else:
        index = pd.MultiIndex.from_product(
            [[task], m_sub.index], 
            names=('task', 'period'))   
        
    # Need to form task-by-task, since some tasks are missing target dates, so index differs
    if metrics_sub is None:
        metrics_sub = pd.DataFrame(index=index, columns=table_models)           
    else:
        metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=table_models)])
        
    if period == "overall":
        metrics_sub.loc[task, :] = m_sub
    else:
        metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    print(f"{target_dates}");
    #if relative_to is not None:
    #   metrics_sub.drop(relative_to, axis=1, inplace=True)

    metrics_sub = metrics_sub[[m for m in main_experiment_models if m in table_models]]
    if period is 'overall':
        metrics_sub=metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
        display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
    else:
        metrics_sub = metrics_sub.reindex([m for m in main_experiment_models if m in metrics_sub.columns], axis=1).reset_index().drop('task', axis=1).set_index('period').T    
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index()
        metrics_sub.set_index(['model_type', 'index'], inplace=True)
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
        metrics_sub = metrics_sub
        #for group in ['Baselines', 'Toolkit', 'Learning', 'Ensembles']:
         #   display(metrics_sub.loc[group].style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
        display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
    #save dataframe in latex table format
    table_to_tex(metrics_sub.astype(float).round(2), out_dir, f"table_{task}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)


## Table C3, C4: 
### %SKILL per year
This code produces tables to analyze model performance over different periods for a single task. 

In [None]:
"""
Paper experiments; can be configured to generate metrics for any subset of models, averaged over
a set of periods, for a given target data. Produces a table and saves to tex. 
"""

target_dates = "std_paper" 
period = "yearly" # <- must be overall to merge with rodeo dataframe
metric = "skill" 
table_models = main_experiment_models 
relative_to = None 
dropna = True # if true, compute average metrics only on dates where predictions have all values 
task_ids = us_gt_ids 
task_horizons = horizons
region = 'us' 
include_overall = False # include overall row in the dataframe

"""
End experiment parameters 
"""
if (metric is 'skill') or (metric is 'rmse' and relative_to is not None) or (metric is not 'rmse' and relative_to is None):
    highlight_func = highlight_max
    bold_func = bold_max
else:
    highlight_func = highlight_min
    bold_func = bold_min
    
    
# Include overall performance?
include_overall = True

# Get set of tasks 
tasks = [f"{gt_id}_{horizon}" for (gt_id, horizon) in product(task_ids, horizons)]



# Generate metrics dataframe for each task
for i, (gt_id, horizon) in enumerate(product(task_ids, task_horizons)):
    task = f"{gt_id}_{horizon}"
    print(task)
    print(metric)
    
    metrics_sub = None
    
    # Read in metrics and reset table_models based on avaliable metrics
    m_sub, table_models = get_per_period_metrics_df(
        all_metrics, period=period, gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates, 
        relative_to=relative_to,
        model_names=table_models, include_overall=include_overall, dropna=dropna)
    
   # Create metrics dataframe template
    if period == "overall":
        index = pd.Index([task], name="task")            
    else:
        index = pd.MultiIndex.from_product(
            [[task], m_sub.index], 
            names=('task', 'period'))   
        
    # Need to form task-by-task, since some tasks are missing target dates, so index differs
    if metrics_sub is None:
        metrics_sub = pd.DataFrame(index=index, columns=table_models)           
    else:
        metrics_sub = pd.concat([metrics_sub, pd.DataFrame(index=index, columns=table_models)])
        
    if period == "overall":
        metrics_sub.loc[task, :] = m_sub
    else:
        metrics_sub[metrics_sub.index.get_level_values("task") == task] = m_sub.values

    print(f"{target_dates}");
    #if relative_to is not None:
    #    metrics_sub.drop(relative_to, axis=1, inplace=True)

    metrics_sub = metrics_sub[[m for m in main_experiment_models if m in table_models]]
    if period is 'overall':
        metrics_sub=metrics_sub.rename(us_tasks, axis=0).T
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index().set_index(['model_type', 'index'])
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
        display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
    else:
        metrics_sub = metrics_sub.reindex([m for m in main_experiment_models if m in metrics_sub.columns], axis=1).reset_index().drop('task', axis=1).set_index('period').T    
        metrics_sub['model_type'] = [all_model_types[m] for m in metrics_sub.index]
        metrics_sub = metrics_sub.rename(all_model_names, axis=0).reset_index()
        metrics_sub.set_index(['model_type', 'index'], inplace=True)
        metrics_sub.columns = metrics_sub.columns.get_level_values(0)
        metrics_sub = metrics_sub#.T
        metrics_sub = metrics_sub.apply(lambda x: 100*x)
        #print(metrics_sub.round(2))
        #for group in ['Baselines', 'Toolkit', 'Learning', 'Ensembles']:
         #   display(metrics_sub.loc[group].style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
        display(metrics_sub.style.apply(highlight_func, axis=0).apply(bold_func, axis=0).set_table_styles(styles))
    #save dataframe in latex table format
    table_to_tex(metrics_sub.astype(float).round(2), out_dir, f"table_{task}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)


## Table C5:
### Rodeo II Analysis - Table relative to Topcoder CFSv2
This code produces tables for model performance and merges with Rodeo performance 

In [None]:
"""
Rodeo experiment tables
"""
# Builds the Table C5 dataframe: per-task metrics for `table_models`, expressed
# relative to the Topcoder CFSv2 baseline and merged with the Rodeo leaderboard,
# then displays it and writes it out as a LaTeX table.
target_dates = "std_contest"
period = "overall"  # <- must be overall to merge with rodeo dataframe
metric = "rmse"
table_models = rodeo_experiment_models
relative_to = 'TC_CFSv2'
dropna = True  # if true, compute average metrics only on dates where predictions have all values
tasks = contest_tasks
region = 'contest'
include_overall = True  # include overall row in the dataframe



"""
End experiment parameters 
"""
# Choose whether the largest or the smallest entry gets highlighted.
# Strings are compared with ==/!= (value equality); `is` tests object identity,
# which only works by accident of interning and warns on Python >= 3.8.
if (metric == 'rmse' and relative_to is not None) or (metric != 'rmse' and relative_to is None):
    highlight_func = highlight_max
    bold_func = bold_max
else:
    highlight_func = highlight_min
    bold_func = bold_min


# Create metrics dataframe template (one row per task, one column per model)
metric_tasks = pd.DataFrame(index=tasks, columns=table_models)

# Get Topcoder CFSv2 baseline: the `relative_to` column of the leaderboard
baseline_TC_df = get_leaderboard(metric, drop_columns=['Mouatadid'])[relative_to]


# Generate metrics dataframe for each task
for gt_id, horizon in product(contest_gt_ids, horizons):
    task = f"{gt_id}_{horizon}"

    # Read in metrics and reset table_models based on available metrics.
    # relative_to=None here: the comparison against the baseline is applied
    # after the loop via bss_score.
    metric_tasks.loc[task], table_models = get_per_period_metrics_df(
        all_metrics, period=period, gt_id=gt_id, horizon=horizon,
        metric=metric, target_dates=target_dates,
        relative_to=None,
        model_names=table_models, include_overall=include_overall, dropna=dropna)


# Get leaderboard dataframe
leaderboard = get_leaderboard(metric,
                              relative_to=relative_to,
                              baseline_df=baseline_TC_df,
                              drop_columns=['Mouatadid'])


# Calculate percentage improvement over Topcoder CFSv2 RMSE
metric_tasks = pd.merge(baseline_TC_df, metric_tasks, left_index=True, right_index=True).astype(float)
metric_tasks = metric_tasks.apply(partial(bss_score, metric_tasks.columns, relative_to), axis=1)

# Concat metric dataframe for model_names with leaderboard dataframe;
# after the transpose, rows are models and columns are tasks
metric_tasks = pd.merge(leaderboard, metric_tasks, left_index=True, right_index=True).T.astype(float)
metric_tasks = metric_tasks[list(tasks)]

# Map input names to display names
metric_tasks = metric_tasks.rename(contest_tasks, axis=1).rename(all_model_names, axis=0)

print(f"{period} {target_dates}")

# Style column-wise for the overall table, row-wise otherwise
style_axis = 0 if period == 'overall' else 1
display(
    metric_tasks.style
    .apply(highlight_func, axis=style_axis)
    .apply(bold_func, axis=style_axis)
    .set_table_styles(styles)
)

# Save dataframe in latex table format
table_to_tex(metric_tasks.astype(float).round(2), out_dir, f"table_{region}_{period}_{metric}_over_{relative_to}_{target_dates}", precision=2)


# FIGURES
## Figure 1 :  
### % improvement: TOOLKIT vs. LEARNING MODELS
This code produces plots to analyze model performance over different periods.

In [None]:
figure_models = [
    # baseline the RMSE subfigures are measured against (relative_to)
    'deb_cfsv2',
    # toolkit models
    'tuned_cfsv2pp',
    'tuned_climpp',
    'perpp',
    # learning models
    'autoknn',
    'multillr',
    'prophet',
    'tuned_localboosting',
    'tuned_salient2',
]


"""
Paper experiments; can be configured to generate plots for any subset of models, averaged over
a set of periods, for a given target data. Produces a figure and saves to pdf.. 
"""
# Settings shared by all four subfigures
target_dates = "std_paper"
task_ids = us_gt_ids
task_horizons = horizons

# One subfigure per (period, metric, relative_to) triple, in plotting order:
#   RMSE by quarter, RMSE by year, skill by quarter, skill by year.
# For RMSE the value is computed relative to the baseline:
#   1 - metric(model)/metric(baseline)
# For skill, relative_to is None (no baseline is passed).
for period, metric, relative_to in [
    ("quarterly", 'rmse', 'deb_cfsv2'),
    ("yearly", 'rmse', 'deb_cfsv2'),
    ("quarterly", 'skill', None),
    ("yearly", 'skill', None),
]:
    # The figure is saved to a file carrying this suffix
    file_str = f"{period}_over_{relative_to}"
    print(target_dates)
    # Pre-bind the metrics table, period and baseline for the plotting routine
    fh = partial(get_per_period_metrics_df, all_metrics, period, relative_to)
    plot_toolkit_vs_learner_quadruple(
        get_metrics_fh=fh,
        gt_id_list=task_ids,
        horizon_list=task_horizons,
        metric=metric,
        target_dates=target_dates,
        model_names=figure_models,
        file_str=file_str,
    )


## Figure C1:  
### Quarterly % improvement: TOOLKIT vs. BASELINES
This code produces plots to analyze model performance over different periods.

In [None]:
# Settings shared by every subfigure in this cell
target_dates = "std_paper"
period = "quarterly"
metric = "rmse"
task_ids = us_gt_ids
region = 'us'

# One subfigure per [baseline, toolkit model] pair; the first entry of each
# pair is the baseline the improvement is computed against.
for figure_models in (
    ['climatology', 'tuned_climpp'],
    ['deb_cfsv2', 'tuned_cfsv2pp'],
    ['persistence', 'perpp'],
):
    relative_to = figure_models[0]
    # The subfigure is saved to a file carrying this suffix
    file_str = f"{region}_{period}_over_{relative_to}"
    # Pre-bind the metrics table, period and baseline for the plotting routine
    fh = partial(get_per_period_metrics_df, all_metrics, period, relative_to)
    plot_models_and_metrics_plus(
        get_metrics_fh=fh,
        gt_id_list=task_ids,
        horizon_list=horizons,
        metric=metric,
        target_dates=target_dates,
        model_names=figure_models,
        file_str=file_str,
    )

## Figure C2:  
### Yearly % improvement: TOOLKIT vs. BASELINES
This code produces plots to analyze model performance over different periods.

In [None]:
# Settings shared by every subfigure in this cell
target_dates = "std_paper"
period = "yearly"
metric = "rmse"
task_ids = us_gt_ids
region = 'us'

# RMSE improvement by year: one subfigure per [baseline, toolkit model] pair;
# the first entry of each pair is the baseline the improvement is computed against.
for figure_models in (
    ['climatology', 'tuned_climpp'],
    ['deb_cfsv2', 'tuned_cfsv2pp'],
    ['persistence', 'perpp'],
):
    relative_to = figure_models[0]
    # The subfigure is saved to a file carrying this suffix
    file_str = f"{region}_{period}_over_{relative_to}"
    # Pre-bind the metrics table, period and baseline for the plotting routine
    fh = partial(get_per_period_metrics_df, all_metrics, period, relative_to)
    plot_models_and_metrics_plus(
        get_metrics_fh=fh,
        gt_id_list=task_ids,
        horizon_list=horizons,
        metric=metric,
        target_dates=target_dates,
        model_names=figure_models,
        file_str=file_str,
    )

## PREDICTIONS, ANOMALIES and ERRORS
### Generate all preds, anoms and errors for all tasks and all models
Reads predictions, generates a summary of missing data, and produces the dictionary to be used in further analysis.

In [None]:
"""
Generate a dictionary with anomalies values for all models and every combination of gt_id, 
horizon, and target dates
"""
# Three dictionaries keyed by (task, target_dates): predictions, anomalies, errors
all_preds = {}
all_anoms = {}
all_errors = {}

# Main experiment (every US task) followed by the salient experiment
# (contest_precip, 34w), all on the std_paper target dates
for gt_id, horizon, target_dates in itertools.chain(
        product(us_gt_ids, horizons, ['std_paper']),
        product(['contest_precip'], ['34w'], ['std_paper'])):

    # Select the model list (and its label for the heading) from the region
    # named in gt_id
    if 'us' in gt_id:
        if '1.5x1.5' in gt_id:
            model_names = ecmwf_experiment_models
            model_names_str = 'ecmwf_experiment_models'
        else:
            model_names = main_experiment_models
            model_names_str = 'main_experiment_models'
    elif 'contest' in gt_id:
        if 'contest' in target_dates:
            model_names = rodeo_experiment_models
            model_names_str = 'rodeo_experiment_models'
        else:
            model_names = salient_experiment_models
            model_names_str = 'salient_experiment_models'
    else:
        model_names = all_models
        model_names_str = 'all_models'

    # Task identifier, e.g. "<gt_id>_<horizon>"
    task = f"{gt_id}_{horizon}"

    display(Markdown(f"### {model_names_str}: {task}, {target_dates}"))

    # Load the prediction / anomaly / error dataframes for the selected models
    print(f"Creating dataframes for models:\n {model_names}\n")
    df_preds, df_anoms, df_errors = get_trio_df(
        gt_id=gt_id,
        horizon=horizon,
        target_dates=target_dates,
        model_names=model_names,
    )
    print("DONE!\n")
    # Nothing to store when no models exist for this task
    if df_preds is None:
        continue

    # Add yearly and quarterly columns to each dataframe (timed with tic/toc)
    print("\nAdding group-by columns to dataframes...")
    tic()
    df_preds, df_anoms, df_errors = [
        add_groupby_cols(frame, horizon=horizon)
        for frame in (df_preds, df_anoms, df_errors)
    ]
    toc()
    print("DONE!\n")

    # Store each frame with 'start_date' moved out of the index
    task_key = (task, target_dates)
    all_preds[task_key] = copy.copy(df_preds.reset_index('start_date'))
    all_anoms[task_key] = copy.copy(df_anoms.reset_index('start_date'))
    all_errors[task_key] = copy.copy(df_errors.reset_index('start_date'))

## Figure 2: 
### % improvement RMSE map
This code produces maps to analyze RMSE percentage improvement over different periods.

In [None]:
figure_models = [
    # Baseline
    'deb_cfsv2',
    # Toolkit
    'tuned_climpp',
    'tuned_cfsv2pp',
    'perpp',
    # Learning
    'prophet',
    'tuned_salient2',
    # Ensemble
    'online_learning'
]

# Figure parameter values
target_dates = 'std_paper'
period = "overall"
dropna = True
relative_to = 'deb_cfsv2'
show = True

# Keyword arguments passed identically to the metrics lookup and the map plot
shared_kwargs = dict(
    period=period,
    target_dates=target_dates,
    relative_to=relative_to,
    model_names=figure_models,
    dropna=dropna,
)

for gt_id, horizon in product(us_gt_ids, horizons):
    # RMSE metrics relative to the baseline, handed to the map plot below
    mean_metric_df, _ = get_per_period_metrics_df(
        all_metrics,
        gt_id=gt_id,
        horizon=horizon,
        metric='rmse',
        include_overall=False,
        **shared_kwargs,
    )

    # Plot the error maps for this gt_id / horizon
    plot_errormaps(
        all_errors,
        gt_id=gt_id,
        horizon=horizon,
        mean_metric_df=mean_metric_df,
        show=show,
        **shared_kwargs,
    )

## Figure C3, C4: 
### % improvement RMSE map
This code produces maps to analyze RMSE percentage improvement over different periods.

In [None]:
from subseasonal_toolkit.utils.viz_util import *
figure_models = [
    # Baseline
    'deb_cfsv2',
    # Row 1
    'tuned_cfsv2pp', 'tuned_climpp', 'perpp', 'online_learning',
    # Row 2
    'autoknn', 'climatology', 'persistence', 'linear_ensemble',
    # Row 3
    'informer', 'tuned_localboosting', 'multillr', 'nbeats',
    # Row 4
    'prophet', 'tuned_salient2',
]


"""
Plot overall errors maps  for all models for std_contest
"""
# NOTE(review): the banner above says std_contest, but target_dates is set to
# 'std_paper' on the next line.
target_dates = 'std_paper'
period = "overall"
dropna = True
relative_to = 'deb_cfsv2'
show = True

# Keyword arguments passed identically to the metrics lookup and the map plot
shared_kwargs = dict(
    period=period,
    target_dates=target_dates,
    relative_to=relative_to,
    model_names=figure_models,
    dropna=dropna,
)

for gt_id, horizon in product(us_gt_ids, horizons):
    # RMSE metrics relative to the baseline, handed to the map plot below
    mean_metric_df, _ = get_per_period_metrics_df(
        all_metrics,
        gt_id=gt_id,
        horizon=horizon,
        metric='rmse',
        include_overall=False,
        **shared_kwargs,
    )
    # Plot the error maps for this gt_id / horizon
    plot_errormaps(
        all_errors,
        gt_id=gt_id,
        horizon=horizon,
        mean_metric_df=mean_metric_df,
        show=show,
        **shared_kwargs,
    )

## Figure C5, C6: 
### Model bias maps
This code produces maps to analyze model bias over different periods. 

In [None]:
figure_models = [
    # Row 1
    'tuned_cfsv2pp', 'tuned_climpp', 'perpp', 'online_learning',
    # Row 2 (baselines and linear ensemble)
    'deb_cfsv2', 'climatology', 'persistence', 'linear_ensemble',
    # Row 3
    'autoknn', 'informer', 'tuned_localboosting', 'multillr',
    # Row 4
    'nbeats', 'prophet', 'tuned_salient2',
]

"""
Plot overall mean model bias maps  for all models for std_contest
"""
# NOTE(review): the banner above says std_contest, but target_dates is set to
# "std_paper" on the next line.
target_dates = "std_paper"
period = "overall"
include_overall = False
dropna = True
show = True



"""
End experiment parameters 
"""
# One bias-map figure per (gt_id, horizon) pair, drawn from the anomalies dictionary
for gt_id, horizon in product(us_gt_ids, horizons):
    plot_biasmaps(
        all_anoms,
        period=period,
        gt_id=gt_id,
        horizon=horizon,
        target_dates=target_dates,
        model_names=figure_models,
        include_overall=include_overall,
        dropna=dropna,
        show=show,
    )

## Figure C7:  
### SALIENT 2.0 vs PRECIP 
This code produces plots to analyze model performance over different periods. 

In [None]:
figure_models = [
    'deb_cfsv2',
    'tuned_cfsv2pp',
    'gt',
    'tuned_salient2'
]


# Experiment parameters: a single task (contest_precip, 34w), yearly RMSE
# relative to deb_cfsv2
target_dates = "std_paper"
period = "yearly"
metric = "rmse"
relative_to = 'deb_cfsv2'
task_ids = ['contest_precip']
task_horizons = ['34w']
file_str = f"contest_{period}_over_{relative_to}"

"""
End experiment parameters 
"""
print(target_dates)

# Both subfigures receive exactly the same keyword arguments
shared_plot_kwargs = dict(
    gt_id_list=task_ids,
    horizon_list=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
    period=period,
    relative_to=relative_to,
    metric=metric,
    file_str=file_str,
)

# Subfigure 1: line plot
plot_models_metrics_preds_line(all_metrics, all_preds, **shared_plot_kwargs)
# Subfigure 2: scatter plot
plot_models_metrics_preds_scatter(all_metrics, all_preds, **shared_plot_kwargs)

## Figure B1, B2, B7, B8:  
### Tuning plots
This code produces plots to analyze submodels selected by the tuner. 

In [None]:
from src.visualize.vis_util_grl import *

# Tuned model whose tuning plots are generated
figure_models = ['tuned_climpp']

# Figure parameters: every US task and horizon on the paper target dates
target_dates = "std_paper"
task_ids = us_gt_ids
task_horizons = horizons

# Generate subfigures
plot_tuning(
    gt_ids=task_ids,
    horizons=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
)



In [None]:
# Tuned model whose tuning plots are generated
figure_models = ['tuned_cfsv2pp']

# Figure parameters: every US task and horizon on the paper target dates
target_dates = "std_paper"
task_ids = us_gt_ids
task_horizons = horizons

# Generate subfigures
plot_tuning(
    gt_ids=task_ids,
    horizons=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
)

In [None]:
# Tuned model whose tuning plots are generated
figure_models = ['tuned_localboosting']

# Figure parameters: every US task and horizon on the paper target dates
target_dates = "std_paper"
task_ids = us_gt_ids
task_horizons = horizons

# Generate subfigures
plot_tuning(
    gt_ids=task_ids,
    horizons=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
)

In [None]:
# Tuned model whose tuning plots are generated
figure_models = ['tuned_salient2']

# Figure parameters: every US task and horizon on the paper target dates
target_dates = "std_paper"
task_ids = us_gt_ids
task_horizons = horizons

# Generate subfigures
plot_tuning(
    gt_ids=task_ids,
    horizons=task_horizons,
    target_dates=target_dates,
    model_names=figure_models,
)