# Deterministic to probabilistic forecasting

### Produces probabilistic forecasts from a collection of deterministic forecasts

In [None]:
# The notebook is expected to be run from the base repository directory
# NOTE(review): nothing in this cell checks or changes the working directory
import os, sys
from subseasonal_toolkit.utils.notebook_util import isnotebook
if isnotebook():
    # Autoreload packages that are modified
    %load_ext autoreload
    %autoreload 2
else:
    # ArgumentParser is only imported when the notebook is run as a script
    from argparse import ArgumentParser
import pandas as pd
import numpy as np
import shutil
from datetime import datetime
from pkg_resources import resource_filename
from subseasonal_data.utils import get_measurement_variable
from subseasonal_toolkit.utils.general_util import printf, make_directories, tic, toc
from subseasonal_toolkit.utils.models_util import (get_submodel_name, start_logger, 
                                                   log_params, get_forecast_filename,
                                                   save_forecasts, get_d2p_submodel_names)
from subseasonal_toolkit.utils.eval_util import get_target_dates

from subseasonal_data import data_loaders

In [None]:
#
# Specify model parameters
#
if not isnotebook():
    # If notebook run as a script, parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("pos_vars", nargs="*")  # gt_id and horizon
    parser.add_argument('--model_name', '-mn', default="raw_ecmwf")
    parser.add_argument('--target_dates', '-t', default="std_paper_forecast")
    parser.add_argument('--first_year', '-fy', default=1981, type=int,
                        help="first year of climatological period")
    parser.add_argument('--last_year', '-ly', default=2010, type=int,
                        help="last year of climatological period")
    args = parser.parse_args()

    # Both positional arguments are required; report a usage error here
    # instead of failing below with a bare IndexError
    if len(args.pos_vars) < 2:
        parser.error("expected two positional arguments: gt_id and horizon")

    # Assign variables
    # gt_id must contain "_p1" or "_p3" (checked in a later cell),
    # e.g., "us_tmp2m_p1_1.5x1.5"
    gt_id = args.pos_vars[0]
    horizon = args.pos_vars[1]  # e.g., "12w", "34w", or "56w"
    model_name = args.model_name
    target_dates = args.target_dates
    first_year = args.first_year
    last_year = args.last_year
else:
    # Otherwise, specify arguments interactively
    gt_id = "us_tmp2m_p1_1.5x1.5"
    horizon = "34w"
    model_name = "raw_ecmwf"
    target_dates = "std_paper_forecast"
    first_year = 1981
    last_year = 2010

#
# Process model parameters
#
# Name of the probabilistic output model and of the forecasting task
output_model_name = f"d2p_{model_name}"
task = f"{gt_id}_{horizon}"

# Set up the output model's folder (attributes and configuration files)
# the first time this output model is used
src_dir = resource_filename(__name__, ".")
dst_dir = resource_filename(
    __name__, os.path.join("..", "..", "models", output_model_name))
if not os.path.exists(dst_dir):
    tic()
    printf(f'\nCreating {dst_dir}')
    make_directories(dst_dir)
    # Copy the attributes and selected-submodel files into the new folder
    for config_file in ("attributes.py", "selected_submodel.json"):
        shutil.copy(os.path.join(src_dir, config_file),
                    os.path.join(dst_dir, config_file))
    # Replace every occurrence of "d2p" in the copied attributes file with
    # the output model name
    filename = os.path.join(dst_dir, "attributes.py")
    with open(filename, "r") as f:
        original_text = f.read()
    newText = original_text.replace("d2p", output_model_name)
    with open(filename, "w") as f:
        f.write(newText)
    toc()
    
# Look up the submodel name of the output model for the chosen
# climatological period
submodel_name = get_submodel_name(output_model_name,
                                  first_year=first_year,
                                  last_year=last_year)

# Forecasts are stored in a per-task folder; create it if it is missing
out_dir_parts = ("models", output_model_name, "submodel_forecasts",
                 submodel_name, f"{gt_id}_{horizon}")
out_dir = os.path.join(*out_dir_parts)
if not os.path.exists(out_dir):
    make_directories(out_dir)
    
if not isnotebook():
    # Save output to log file
    logger = start_logger(model=output_model_name, submodel=submodel_name,
                          gt_id=gt_id, horizon=horizon,
                          target_dates=target_dates)
    # Store parameter values in log; the values are listed explicitly (in the
    # same order as the names) rather than resolved from strings with eval()
    params_names = ['gt_id', 'horizon', 'model_name',
                    'target_dates', 'first_year', 'last_year']
    params_values = [gt_id, horizon, model_name,
                     target_dates, first_year, last_year]
    log_params(params_names, params_values)

# Select the target dates to forecast for this horizon
target_date_objs = get_target_dates(target_dates, horizon=horizon)


In [None]:
#
# Identify target tercile and associated deterministic gt_id
#
# Each supported probabilistic tag selects the climatological tercile to load:
#   "_p1" -> tercile 1 (forecasts are later tested for being no larger)
#   "_p3" -> tercile 2 (forecasts are later tested for being greater)
for prob_tag, tercile in (("_p1", 1), ("_p3", 2)):
    if prob_tag in gt_id:
        load_tercile = tercile
        # The deterministic gt_id is the probabilistic one with the tag removed
        det_gt_id = gt_id.replace(prob_tag, '')
        break
else:
    # Neither tag was found in gt_id
    raise ValueError(f"unsupported probabilistic gt_id {gt_id}")

#
# Load climatological terciles
#
printf(f"\nLoading tercile data")
tic()
var = get_measurement_variable(det_gt_id)
# Keep only the date, grid-point and measurement columns
terc_columns = ["start_date", "lat", "lon", var]
terc_data = data_loaders.get_tercile(
    det_gt_id, tercile=load_tercile,
    first_year=first_year, last_year=last_year)
terc_data = terc_data.loc[:, terc_columns]
# Pivot to one row per (lat, lon) and one column per start date
terc_data = terc_data.set_index(terc_columns[:3]).squeeze()
terc_data = terc_data.unstack('start_date')
# Relabel each column by the (month, day) of its start date
terc_data.columns = [(date.month, date.day) for date in terc_data.columns]
toc()

In [None]:
# Get list of deterministic submodels to ensemble in forming
# probabilistic forecasts; each name is used below to locate that
# submodel's forecast file for model_name on the deterministic task
det_submodel_names = get_d2p_submodel_names(model_name, det_gt_id, horizon)

In [None]:
#
# Generate predictions
#
def _forecast_template(model, submodel, task_gt_id):
    """Return the forecast filename for the given model, submodel and gt_id
    at the current horizon, with "{}" in place of the target date string."""
    return get_forecast_filename(
        model=model, submodel=submodel,
        gt_id=task_gt_id, horizon=horizon,
        target_date_str="{}")


# Template for the probabilistic forecasts written by this notebook
forecast_template = _forecast_template(output_model_name, submodel_name, gt_id)
# One template per deterministic submodel contributing to the ensemble
det_forecast_templates = [
    _forecast_template(model_name, det_submodel_name, det_gt_id)
    for det_submodel_name in det_submodel_names]
for target_date_obj in target_date_objs:
    target_date_str = datetime.strftime(target_date_obj, '%Y%m%d')
    forecast_file = forecast_template.format(target_date_str)
    # Nothing to do when a forecast file for this target already exists
    if os.path.isfile(forecast_file):
        printf(f"\nprior forecast exists for target={target_date_obj}")
        continue

    tic()
    # Climatological tercile values for the target's month-day combination
    target_terc = terc_data[(target_date_obj.month, target_date_obj.day)]

    # Count, per (lat, lon), the deterministic submodel forecasts falling
    # into the tercile bin; the count is turned into a fraction further down
    prob_pred = pd.Series(0., index=terc_data.index, dtype='float', name="pred")
    num_det_forecasts = 0
    for det_forecast_template in det_forecast_templates:
        det_forecast_file = det_forecast_template.format(target_date_str)
        # Submodels without a forecast file for this target do not contribute
        if not os.path.isfile(det_forecast_file):
            continue
        # Load the deterministic forecast as predictions indexed by (lat, lon)
        det_pred = pd.read_hdf(det_forecast_file)
        det_pred = det_pred.loc[:, ['lat', 'lon', 'pred']]
        det_pred = det_pred.set_index(['lat', 'lon']).squeeze()
        # Forecasts containing nans are excluded from the ensemble
        if det_pred.isnull().values.any():
            printf(f"\n{det_forecast_file} contains nans; skipping")
            continue
        num_det_forecasts += 1
        if load_tercile == 1:
            # First tercile bin: prediction no larger than the loaded tercile
            in_bin = det_pred <= target_terc
        else:
            # Third tercile bin: prediction greater than the loaded tercile
            in_bin = det_pred > target_terc
        prob_pred += in_bin

    # At least two contributing forecasts are required to form a prediction
    if num_det_forecasts < 2:
        if num_det_forecasts == 0:
            printf(f"\nno deterministic forecasts for target={target_date_obj}; skipping")
        else:
            printf(f"\nonly one deterministic forecast for target={target_date_obj}; skipping")
        toc()
        continue
    printf(f"\nForming predictions for target={target_date_obj} based on {num_det_forecasts} forecasts")
    # Convert counts into the fraction of contributing forecasts in the bin
    prob_pred /= num_det_forecasts

    # Save prediction to file in standard format: lat, lon, start_date, pred
    prob_pred = prob_pred.reset_index()
    prob_pred.insert(2, 'start_date', target_date_obj)
    save_forecasts(
        prob_pred,
        model=output_model_name, submodel=submodel_name,
        gt_id=gt_id, horizon=horizon,
        target_date_str=target_date_str)
    toc()