# Tuned ECMWF++

Adaptive ensembling and debiasing of ECMWF forecasts

In [None]:
# Imports; run this notebook from the base repository directory
import os, sys, ntpath, re
from subseasonal_toolkit.utils.notebook_util import isnotebook
if isnotebook():
    # Autoreload packages that are modified
    %load_ext autoreload
    %autoreload 2
else:
    from argparse import ArgumentParser
import pandas as pd
import numpy as np
import shutil
import subprocess
from datetime import datetime, timedelta
from filelock import FileLock
from pkg_resources import resource_filename
from subseasonal_data.utils import get_measurement_variable
from subseasonal_toolkit.utils.general_util import printf, make_directories, symlink, tic, toc
from subseasonal_toolkit.utils.experiments_util import get_first_year, get_start_delta
from subseasonal_toolkit.utils.models_util import (get_submodel_name, start_logger, log_params, get_forecast_filename,
                                                   save_forecasts)
from subseasonal_toolkit.utils.eval_util import get_target_dates, mean_rmse_to_score
from subseasonal_toolkit.models.tuner.util import *
pd.set_option('display.max_rows', None)

from subseasonal_data import data_loaders

In [None]:
#
# Specify model parameters
#
# After this section the following names are defined in both modes:
#   gt_id, horizon, target_dates, num_years, margin_in_days,
#   debias_with, forecast_with
if not isnotebook():
    # If notebook run as a script, parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("pos_vars",nargs="*")  # gt_id and horizon 
    parser.add_argument('--target_dates', '-t', default="std_test")
    parser.add_argument('--num_years', '-y', default="all",
                       help="Number of years to use in training (all or integer)")
    parser.add_argument('--margin_in_days', '-m', default="None", 
                       help="Number of month-day combinations on either side of the target combination to include; "
                            "set to 0 to include only target month-day combo; set to None to include entire year; "
                            "None by default")
    parser.add_argument('--forecast_with', '-fw', default="p+c", 
                        help="Generate forecast using the control (c), "
                        "average perturbed (p), single perturbed (p1, ..., p50), "
                        "or perturbed-control ensemble (p+c) ECMWF forecast.")
    parser.add_argument('--debias_with', '-dw', default="p+c", 
                        help="Debias using the control (c), average perturbed (p), "
                        "or perturbed-control ensemble (p+c) ECMWF reforecast.")  
    args = parser.parse_args()

    # pos_vars accepts any number of values (nargs="*"), so verify that both
    # required positionals are present; parser.error prints the usage message
    # and exits instead of failing later with a bare IndexError
    if len(args.pos_vars) < 2:
        parser.error("expected two positional arguments: gt_id and horizon")

    # Assign variables
    gt_id = args.pos_vars[0] # e.g., "contest_precip" or "contest_tmp2m"
    horizon = args.pos_vars[1] # e.g., "12w", "34w", or "56w"
    target_dates = args.target_dates
    # num_years stays the string "all" or becomes an integer
    num_years = args.num_years if args.num_years == "all" else int(args.num_years)
    # margin_in_days is the string "None" (mapped to None) or an integer
    margin_in_days = (None if args.margin_in_days == "None"
                      else int(args.margin_in_days))
    debias_with = args.debias_with
    forecast_with = args.forecast_with
else:
    # Otherwise, specify arguments interactively
    gt_id = "global_tmp2m_p1_1.5x1.5" 
    horizon = "34w"
    target_dates = "s2s"
    num_years = 3
    margin_in_days = None
    debias_with = "p+c"
    forecast_with = "p+c"


#
# Process model parameters
#
# Record base model, output model, and submodel name
base_model_name = "ecmwfpp"
output_model_name = f"tuned_{base_model_name}"
task = f"{gt_id}_{horizon}"

submodel_name = get_submodel_name(
    output_model_name, num_years=num_years, 
    margin_in_days=margin_in_days,
    forecast_with=forecast_with,
    debias_with=debias_with)

# Create directory for storing forecasts if one does not already exist
# (path is relative to the current working directory)
out_dir = os.path.join("models", output_model_name, "submodel_forecasts", 
                       submodel_name, task)
if not os.path.exists(out_dir):
    make_directories(out_dir)
    
if not isnotebook():
    # Save output to log file
    logger = start_logger(model=output_model_name,submodel=submodel_name,gt_id=gt_id,
                          horizon=horizon,target_dates=target_dates)
    # Store parameter values in log; values are listed explicitly, in the
    # same order as the names, rather than looked up with eval()
    params_names = ['gt_id', 'horizon',  
                    'target_dates', 'num_years', 'margin_in_days',
                    'forecast_with', 'debias_with']
    params_values = [gt_id, horizon,
                     target_dates, num_years, margin_in_days,
                     forecast_with, debias_with]
    log_params(params_names, params_values)

# Select the target dates to forecast for this horizon
target_date_objs = get_target_dates(target_dates, horizon=horizon)


In [None]:
#
# Generate predictions
#

# For each target date, reuse the base model parameters behind the
# tuned_ecmwfpp submodel obtained with the default settings of
# forecast_with and debias_with (neither is passed to get_submodel_name here)
default_submodel_name = get_submodel_name(output_model_name, 
    num_years=num_years, margin_in_days=margin_in_days)

printf(f"\ndefault_submodel = {default_submodel_name}")

# Script for running base model
predict_script = resource_filename("subseasonal_toolkit", 
    os.path.join("models",base_model_name,"batch_predict.py"))

forecast_debias_with_str = f" --forecast_with {forecast_with} --debias_with {debias_with}"

# Pattern extracting base model parameters from a linked submodel name;
# compiled once because it does not change across target dates
linked_submodel_pattern = re.compile(
    r'.*-debias(?P<fit_intercept>.*)_'
    r'years(?P<train_years>.*)_margin(?P<margin_in_days>.*)_'
    r'days(?P<first_day>.*)-(?P<last_day>.*)_leads(?P<first_lead>.*)-'
    r'(?P<last_lead>.*)_loss(?P<loss>.*)_forecast.*')

for target_date_obj in target_date_objs:
    target_date_str = target_date_obj.strftime('%Y%m%d')
    
    # Skip if forecast already exists for this target date
    dst_file = os.path.join(out_dir, f"{gt_id}_{horizon}-{target_date_str}.h5")
    if os.path.isfile(dst_file):
        printf(f"\nprior forecast exists for target={target_date_obj}")
        continue
    
    # Check if forecast exists for default submodel
    default_file = get_forecast_filename(
        model=output_model_name, submodel=default_submodel_name, 
        gt_id=gt_id, horizon=horizon, 
        target_date_str=target_date_str)
    if not os.path.isfile(default_file):
        printf(f"\nno selected submodel forecast for target={target_date_obj}; skipping")
        continue
    
    tic()
    # Tuned forecasts are softlinked; identify submodel directory 
    # to which forecast file is linked
    try:
        link_target = os.readlink(default_file)
    except OSError as err:
        # os.readlink fails when the file is not a symbolic link; without a
        # link target the selected submodel cannot be identified
        printf(f"\nWarning: cannot read link target of {default_file} ({err}); skipping")
        continue
    # The submodel directory is two levels above the linked forecast file
    linked_submodel = ntpath.basename(
        os.path.dirname(os.path.dirname(link_target)))
    printf(f"\n{target_date_str} linked_submodel = {linked_submodel}")

    # Extract base model parameters used to produce selected submodel forecast
    param_match = linked_submodel_pattern.match(linked_submodel)
    if param_match is None:
        # Submodel name does not follow the expected naming scheme
        printf(f"Warning: cannot parse base model parameters from "
               f"{linked_submodel}; skipping")
        continue
    params = param_match.groupdict()
    
    # Create argument string from base model parameters plus
    # user-provided forecast_with and debias_with parameters
    args_list = [f"--{name} {value}" for name, value in params.items()]
    args_str = " ".join(args_list) + forecast_debias_with_str

    # Run base model with this argument string
    cmd = f"python \"{predict_script}\" {gt_id} {horizon} -t {target_date_str} {args_str}"
    printf(f"Running {cmd}")
    return_code = subprocess.call(cmd, shell=True)
    if return_code != 0:
        # Report the failure but still check for the forecast file below
        printf(f"Warning: base model command exited with status {return_code}")
    
    # Identify submodel name associated with this base model forecast
    base_submodel_name = get_submodel_name(
        model=base_model_name, forecast_with=forecast_with,
        debias_with=debias_with, **params)
    
    # Form tuned forecast by soft linking to base model forecast
    src_file = get_forecast_filename(
        model = base_model_name, submodel = base_submodel_name,
        gt_id = gt_id, horizon = horizon, target_date_str = target_date_str)
    if os.path.isfile(src_file):
        # NOTE(review): this tic() is nested inside the per-date tic() above;
        # if tic/toc share a single timer, the "Total processing time" below
        # only measures from here -- confirm against general_util
        tic()
        printf(f"Linking {dst_file} to {src_file}")
        symlink(src_file, dst_file, use_abs_path=True)
        toc()
    else:
        printf(f"Warning: Missing file:\n{src_file}")
    printf("Total processing time")
    toc()