# Batch regression

Carry out a regression experiment for a fixed set of predictors and a set of target dates.

In [None]:
import os, sys
from subseasonal_toolkit.utils.notebook_util import isnotebook
if isnotebook():
    # Interactive session: enable the IPython autoreload extension so that
    # packages modified on disk are reloaded
    %load_ext autoreload
    %autoreload 2
else:
    # Script run: ArgumentParser is only needed here, to read the model
    # parameters from the command line
    from argparse import ArgumentParser

# Imports
import json
import re
import subprocess
import sys
from datetime import datetime, timedelta
from functools import partial
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
from sklearn import *
from ttictoc import tic, toc

from subseasonal_data import data_loaders
from subseasonal_data.utils import get_measurement_variable, df_merge, shift_df
from subseasonal_toolkit.models.perpp.perpp_util import fit_and_predict, years_ago
from subseasonal_toolkit.utils.eval_util import get_target_dates, mean_rmse_to_score, save_metric
from subseasonal_toolkit.utils.experiments_util import (get_id_name, get_th_name, get_first_year,
                                                        get_start_delta, clim_merge, month_day_subset)
from subseasonal_toolkit.utils.fit_and_predict import apply_parallel
from subseasonal_toolkit.utils.general_util import printf
from subseasonal_toolkit.utils.models_util import (get_submodel_name, start_logger, log_params, get_forecast_filename,
                                                   save_forecasts)

In [None]:
#
# Specify model parameters
#
model_name = "perpp"
if not isnotebook():
    # If notebook run as a script, parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("pos_vars", nargs="*")  # gt_id and horizon
    parser.add_argument('--target_dates', '-t', default="std_contest")

    # Number of years to use in training ("all" or integer)
    parser.add_argument('--train_years', '-y', default="all")

    # Number of month-day combinations on either side of the target combination
    # to include when training
    # Set to 0 to include only target month-day combo
    # Set to "None" to include entire year
    parser.add_argument('--margin_in_days', '-m', default="None")

    # If True, use cfsv2 ensemble forecast as a feature
    parser.add_argument('--use_cfsv2', '-c', default="True")
    args, opt = parser.parse_known_args()

    # Both positional arguments are required; report a usage error instead of
    # failing below with a bare IndexError
    if len(args.pos_vars) < 2:
        parser.error("expected two positional arguments: gt_id and horizon")

    # Assign variables
    gt_id = get_id_name(args.pos_vars[0])  # "contest_precip" or "contest_tmp2m"
    horizon = get_th_name(args.pos_vars[1])  # "34w" or "56w"
    target_dates = args.target_dates

    # train_years stays the string "all" or becomes an integer number of years
    train_years = args.train_years
    if train_years != "all":
        train_years = int(train_years)

    # The literal string "None" on the command line means "no margin"
    if args.margin_in_days == "None":
        margin_in_days = None
    else:
        margin_in_days = int(args.margin_in_days)

    # Only the exact string "True" enables the cfsv2 feature
    use_best_cfsv2 = (args.use_cfsv2 == "True")

else:
    # Otherwise, specify arguments interactively
    gt_id = "us_precip"
    horizon = "56w"
    target_dates = "20171114"
    train_years = "all"
    margin_in_days = None
    use_best_cfsv2 = True

#
# Process model parameters
#

# The cfsv2 feature is always merged in further below, so running without it
# is not supported
if not use_best_cfsv2:
    raise ValueError("--use_cfsv2 argument must be True")

# Get list of target date objects
target_date_objs = pd.Series(get_target_dates(date_str=target_dates, horizon=horizon))

# Sort target_date_objs by day of week (stable, so dates sharing a weekday
# keep their original relative order)
target_date_objs = target_date_objs[target_date_objs.dt.weekday.argsort(kind='stable')]

# Identify measurement variable name
measurement_variable = get_measurement_variable(gt_id)  # 'tmp2m' or 'precip'

# Column names for gt_col, clim_col and anom_col
gt_col = measurement_variable
clim_col = measurement_variable + "_clim"
anom_col = measurement_variable + "_anom"  # 'tmp2m_anom' or 'precip_anom'

# Store delta between target date and forecast issuance date
start_delta = timedelta(days=get_start_delta(horizon, gt_id))

In [None]:
#
# Choose regression parameters
#
# Record standard settings of these parameters
base_col = "zeros"


def _perpp_feature_cols(gt_id, horizon, clim_col):
    """Return (cfsv2_col, x_cols) for a ground-truth id and forecast horizon.

    gt_ids ending in "tmp2m" or "precip" use the "subx_cfsv2" forecast column;
    gt_ids ending in "tmp2m_1.5x1.5" or "precip_1.5x1.5" use "iri_cfsv2".
    The two lagged ground-truth features are shifted by 29 and 58 days for
    the "34w" horizon and by 43 and 86 days for "56w".

    Raises:
        ValueError: if gt_id or horizon is not one of the supported values,
            so that x_cols is never silently left undefined.
    """
    grid_suffix = "_1.5x1.5"
    shifts_by_horizon = {"34w": (29, 58), "56w": (43, 86)}

    if gt_id.endswith(grid_suffix):
        cfsv2_source = "iri_cfsv2"
        base_id = gt_id[:-len(grid_suffix)]
    else:
        cfsv2_source = "subx_cfsv2"
        base_id = gt_id

    variable = next((v for v in ("tmp2m", "precip") if base_id.endswith(v)), None)
    if variable is None or horizon not in shifts_by_horizon:
        raise ValueError(
            f"unsupported gt_id/horizon combination: {gt_id!r}, {horizon!r}")

    cfsv2_col = f"{cfsv2_source}_{variable}"
    x_cols = [f"{variable}_shift{days}" for days in shifts_by_horizon[horizon]]
    x_cols += [cfsv2_col, clim_col]
    return cfsv2_col, x_cols


cfsv2_col, x_cols = _perpp_feature_cols(gt_id, horizon, clim_col)
group_by_cols = ['lat', 'lon']

# Record submodel names for perpp model
submodel_name = get_submodel_name(
    model_name, train_years=train_years, margin_in_days=margin_in_days,
    use_cfsv2=use_best_cfsv2)

printf(f"Submodel name {submodel_name}")

if not isnotebook():
    # Save output to log file
    logger = start_logger(model=model_name, submodel=submodel_name, gt_id=gt_id,
                          horizon=horizon, target_dates=target_dates)
    # Store parameter values in log. Names and values are paired explicitly
    # rather than resolved with eval(), so a misspelled name fails here
    # visibly and no string is ever evaluated as code.
    logged_params = {
        'gt_id': gt_id,
        'horizon': horizon,
        'target_dates': target_dates,
        'train_years': train_years,
        'margin_in_days': margin_in_days,
        'base_col': base_col,
        'x_cols': x_cols,
        'group_by_cols': group_by_cols,
        'use_best_cfsv2': use_best_cfsv2,
    }
    params_names = list(logged_params)
    params_values = list(logged_params.values())
    log_params(params_names, params_values)

In [None]:
#
# Load ground truth data
#
printf("Loading ground truth data")
tic()
gt = data_loaders.get_ground_truth(gt_id)[['lat','lon','start_date',gt_col]]
printf(f"elapsed: {toc()}s")

#
# Add shifted ground truth features
#
printf("Adding shifted ground truth features")
lld_data = gt
# Recover each shift length from the trailing digits of the feature names,
# e.g. "<gt_col>_shift29" -> 29
shifts = [int(re.search(r'\d+$', col).group())
          for col in x_cols if col.startswith(gt_col+"_shift")]
tic()
for shift in shifts:
    gt_shift = shift_df(gt, shift)
    lld_data = df_merge(lld_data, gt_shift, how="right")
printf(f"elapsed: {toc()}s")

#
# Drop rows with empty pred_cols
#
pred_cols = x_cols+[base_col]
# These columns are not present yet (they are added afterwards), so they
# cannot take part in the NaN filter
exclude_cols = set([clim_col, cfsv2_col, 'zeros'])
# dropna documents `subset` as a sequence of labels; pass an ordered list
# rather than an unordered set
dropna_cols = [col for col in pred_cols if col not in exclude_cols]
lld_data = lld_data.dropna(subset=dropna_cols)

# Add climatology
if clim_col in pred_cols:
    printf("Merging in climatology")
    tic()
    lld_data = clim_merge(lld_data, data_loaders.get_climatology(gt_id))
    printf(f"elapsed: {toc()}s")

# Add zeros
if 'zeros' in pred_cols:
    lld_data['zeros'] = 0

In [None]:
#
# Add cfsv2 ensemble forecast as feature
#
printf(f"Forming cfsv2 ensemble forecast...")
is_34w = (horizon == "34w")
shift = 15 if is_34w else 29
first_lead = 14 if is_34w else 28
last_lead = 29

# Forecast source and data-name suffix depend on the region and grid of gt_id
on_coarse_grid = gt_id.endswith("1.5x1.5")
prefix = "iri_cfsv2" if on_coarse_grid else "subx_cfsv2"
suffix = "-us" if gt_id.startswith("us_") else ""
if on_coarse_grid:
    suffix += "1_5"

forecast_id = prefix+"-"+gt_id.split("_")[1]+suffix
tic()
data = data_loaders.get_forecast(forecast_id, shift=shift)
toc()


def _lead_table(lead):
    """Return the forecast column for `lead` as a start_date x (lat, lon) table."""
    lead_col = f'{prefix}_{gt_col}-{lead}.5d_shift{shift}'
    indexed = data[['lat','lon','start_date',lead_col]].set_index(
        ['lat','lon','start_date'])
    return indexed.squeeze().unstack(['lat','lon'])


# Start the running sum with the first lead
printf(f"Aggregating lead {first_lead} with shift {shift}")
tic()
cfsv2 = _lead_table(first_lead).copy()
toc()

# Add every later lead, moved down by (lead - first_lead) rows
# NOTE(review): the row shift presumably assumes consecutive rows are one day
# apart -- confirm against the forecast data
for lead in range(first_lead+1,last_lead+1):
    printf(f"Aggregating lead {lead} with shift {shift+lead-first_lead}")
    tic()
    cfsv2 += _lead_table(lead).shift(lead-first_lead)
    toc()
del data

# Turn the sum over leads into an average
num_leads = last_lead - first_lead + 1
cfsv2 /= num_leads
# Drop dates with no forecasts and reshape
cfsv2 = cfsv2.dropna().unstack().rename(cfsv2_col)

# Merge cfsv2 forecast with lld_data
printf(f"Merging {cfsv2_col} with lld_data")
tic()
lld_data = pd.merge(lld_data, cfsv2, right_index=True,
                    left_on=['lat','lon','start_date'])
toc()
del cfsv2

In [None]:
# specify regression model
fit_intercept = True
model = linear_model.LinearRegression(fit_intercept=fit_intercept)

# Form predictions for each grid point (in parallel) using train / test split
# and the selected model
prediction_func = partial(fit_and_predict, model=model)
num_cores = cpu_count()

# Store rmses (one slot per target date, filled in as targets are evaluated)
rmses = pd.Series(index=target_date_objs, dtype='float64')

# Restrict data to relevant columns and rows for which predictions can be made.
# The selection is an ordered list: pandas deprecated set indexers in 1.5 and
# rejects them from 2.0, and a set would also make column order arbitrary.
wanted_cols = set(['start_date','lat','lon',gt_col,base_col]+x_cols)
relevant_cols = [col for col in lld_data.columns if col in wanted_cols]
lld_data = lld_data[relevant_cols].dropna(subset=x_cols+[base_col])

In [None]:
# For each target date: load the saved forecast if one exists, otherwise fit
# the regression and save a new forecast; then score it against ground truth.
for target_date_obj in target_date_objs:
    # A target can only be handled if rows for its date are present in lld_data
    if not lld_data.start_date.isin([target_date_obj]).any():
        printf(f"warning: some features unavailable for target={target_date_obj}; skipping")
        continue

    target_date_str = datetime.strftime(target_date_obj, '%Y%m%d')

    # Skip if forecast already produced for this target
    forecast_file = get_forecast_filename(
        model=model_name, submodel=submodel_name,
        gt_id=gt_id, horizon=horizon,
        target_date_str=target_date_str)

    if os.path.isfile(forecast_file):
        printf(f"prior forecast exists for target={target_date_obj}; loading")
        tic()
        preds = pd.read_hdf(forecast_file)

        # Add ground truth for later evaluation
        target_truth = lld_data.loc[lld_data.start_date == target_date_obj,
                                    ['lat','lon',gt_col]]
        preds = pd.merge(preds, target_truth, on=['lat','lon'])

        preds.rename(columns={gt_col:'truth'}, inplace=True)
        toc()
    else:
        printf(f'target={target_date_str}')

        # Subset data based on margin
        if margin_in_days is not None:
            tic()
            sub_data = month_day_subset(lld_data, target_date_obj, margin_in_days)
            toc()
        else:
            sub_data = lld_data

        # Find the last observable training date for this target
        last_train_date = target_date_obj - start_delta

        # Only train on train_years worth of data
        if train_years != "all":
            tic()
            sub_data = sub_data.loc[sub_data.start_date >= years_ago(last_train_date, train_years)]
            toc()

        tic()
        preds = apply_parallel(
            sub_data.groupby(group_by_cols),
            prediction_func,
            num_cores=num_cores,
            gt_col=gt_col,
            x_cols=x_cols,
            base_col=base_col,
            last_train_date=last_train_date,
            test_dates=[target_date_obj])

        # Ensure raw precipitation predictions are never less than zero.
        # Keyed on the measurement variable rather than the gt_id suffix so
        # that gt_ids such as "..._precip_1.5x1.5" are clipped as well.
        if gt_col == "precip":
            tic()
            preds['pred'] = np.maximum(preds['pred'],0)
            toc()

        preds = preds.reset_index()

        # Save prediction to file in standard format (without the truth column)
        save_forecasts(preds.drop(columns=['truth']),
            model=model_name, submodel=submodel_name,
            gt_id=gt_id, horizon=horizon,
            target_date_str=target_date_str)
        toc()

    # Evaluate and store error
    rmse = np.sqrt(np.square(preds.pred - preds.truth).mean())
    rmses.loc[target_date_obj] = rmse
    print("-rmse: {}, score: {}".format(rmse, mean_rmse_to_score(rmse)))
    # Running mean over the targets evaluated so far
    mean_rmse = rmses.mean()
    print("-mean rmse: {}, running score: {}".format(mean_rmse, mean_rmse_to_score(mean_rmse)))

# Save rmses in standard format: a two-column frame of target dates and rmses
rmses = rmses.rename_axis('start_date').reset_index(name='rmse')
save_metric(
    rmses,
    model=model_name,
    submodel=submodel_name,
    gt_id=gt_id,
    horizon=horizon,
    target_dates=target_dates,
    metric="rmse",
)