# Batch regression

Carry out a regression experiment for a fixed set of predictors and a set of target dates.

In [None]:
# Ensure notebook is being run from base repository directory
# NOTE(review): the path below is an absolute, machine-specific location;
# on any other machine the chdir fails and only the warning is printed.
import os, sys
try:
    os.chdir("/home/franklyn/forecast_rodeo_ii")
except Exception as err:
    # Best effort: report the failure and keep running from the current directory
    print(f"Warning: unable to change directory; {repr(err)}")
#sys.path.insert(0, "/home/franklyn/rodeo_ii/forecast_rodeo_ii")
from src.utils.notebook_util import isnotebook
if isnotebook():
    # Autoreload packages that are modified
    %load_ext autoreload
    %autoreload 2
else:
    # Script mode: ArgumentParser is only needed (and only imported) here;
    # it is used by the parameter cell to read command-line arguments
    from argparse import ArgumentParser

# Imports 
import numpy as np
import pandas as pd
from sklearn import *
import sys
import json
import subprocess
from datetime import datetime, timedelta
from functools import partial
from multiprocessing import cpu_count
from src.utils.general_util import tic, toc, printf
from src.utils.experiments_util import *
from subseasonal_data.utils import get_measurement_variable
from src.utils.eval_util import get_target_dates, mean_rmse_to_score
from src.utils.fit_and_predict import apply_parallel
from src.utils.models_util import *
from src.models.llr.llr_util import fit_and_predict_wgt, fit_and_predict, years_ago

import pdb

In [None]:
#
# Specify model parameters
#
model_name = "llr"
if not isnotebook():
    # If notebook run as a script, parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("pos_vars", nargs="*")  # gt_id and horizon
    parser.add_argument('--target_dates', '-t', default="std_contest")

    # Number of years to use in training ("all" or integer)
    parser.add_argument('--train_years', '-y', default="all")

    # Number of month-day combinations on either side of the target combination
    # to include when training
    # Set to 0 to include only target month-day combo
    # Set to "None" to include entire year
    parser.add_argument('--margin_in_days', '-m', default="None")

    # If True, use cfsv2 ensemble forecast as a feature
    parser.add_argument('--use_cfsv2', '-c', default="False")
    # Unrecognized options are collected in `opt` instead of raising an error
    args, opt = parser.parse_known_args()

    # Both positional arguments are required; exit with a usage message
    # instead of failing below with a bare IndexError
    if len(args.pos_vars) < 2:
        parser.error("expected two positional arguments: gt_id and horizon")

    # Assign variables
    gt_id = args.pos_vars[0]  # "contest_precip" or "contest_tmp2m"
    horizon = args.pos_vars[1]  # "34w" or "56w"
    target_dates = args.target_dates
    train_years = args.train_years
    if train_years != "all":
        train_years = int(train_years)
    if args.margin_in_days == "None":
        margin_in_days = None
    else:
        margin_in_days = int(args.margin_in_days)

    # Only the exact string "True" enables the cfsv2 ensemble feature
    use_best_cfsv2 = (args.use_cfsv2 == "True")

else:
    # Otherwise, specify arguments interactively
    gt_id = "us_tmp2m"
    horizon = "56w"
    target_dates = "std_paper"
    train_years = "all"
    margin_in_days = None
    use_best_cfsv2 = True

#
# Process model parameters
#

# Get list of target date objects
target_date_objs = pd.Series(get_target_dates(date_str=target_dates, horizon=horizon))

# Sort target_date_objs by day of week (stable sort keeps the original
# relative order of dates that share a weekday)
target_date_objs = target_date_objs[target_date_objs.dt.weekday.argsort(kind='stable')]

# Identify measurement variable name
measurement_variable = get_measurement_variable(gt_id)  # 'tmp2m' or 'precip'

# Column names for gt_col, clim_col and anom_col
gt_col = measurement_variable
clim_col = measurement_variable + "_clim"
anom_col = measurement_variable + "_anom"  # 'tmp2m_anom' or 'precip_anom'

# Store delta between target date and forecast issuance date
start_delta = timedelta(days=get_start_delta(horizon, gt_id))

In [None]:
#
# Choose regression parameters
#
# Record standard settings of these parameters
base_col = "zeros"

# Lags (in days) of the ground-truth variable used as features, per horizon
lag_shifts_by_horizon = {"34w": (29, 58), "56w": (43, 86)}

# Single-lead cfsv2 column used when the ensemble feature is disabled
# NOTE(review): the tmp2m/34w entry uses the 28.5d lead while precip/34w
# uses the 14.5d lead -- kept exactly as found; confirm this is intended.
single_lead_cfsv2_cols = {
    ("tmp2m", "34w"): 'subx_cfsv2_tmp2m-28.5d_shift15',
    ("precip", "34w"): 'subx_cfsv2_precip-14.5d_shift15',
    ("tmp2m", "56w"): 'subx_cfsv2_tmp2m-28.5d_shift29',
    ("precip", "56w"): 'subx_cfsv2_precip-28.5d_shift29',
}

# The variable is identified by the suffix of gt_id
gt_var = next((var for var in ("tmp2m", "precip") if gt_id.endswith(var)), None)

# Fail fast on unsupported settings instead of leaving x_cols / cfsv2_col
# undefined and hitting a NameError in a later cell
if (gt_var, horizon) not in single_lead_cfsv2_cols:
    raise ValueError(
        f"Unsupported combination gt_id={gt_id!r}, horizon={horizon!r}; "
        "gt_id must end with 'tmp2m' or 'precip' and horizon must be '34w' or '56w'")

if use_best_cfsv2:
    # Name of the ensemble feature column
    cfsv2_col = f'subx_cfsv2_{gt_var}'
else:
    cfsv2_col = single_lead_cfsv2_cols[(gt_var, horizon)]

# Features: two lagged measurements, the cfsv2 forecast and climatology
x_cols = [f'{gt_var}_shift{lag}' for lag in lag_shifts_by_horizon[horizon]]
x_cols += [cfsv2_col, clim_col]

group_by_cols = ['lat', 'lon']

# Record submodel names for llr model
submodel_name = get_submodel_name(
    model_name, train_years=train_years, margin_in_days=margin_in_days,
    use_cfsv2=use_best_cfsv2)

printf(f"Submodel name {submodel_name}")


In [None]:
#
# Load lat lon date data
# The climatology column is not loaded here; it is merged in at the end
#
pred_cols = x_cols + [base_col]

# Predictor columns that are not read from the combined data file:
# climatology and the constant 'zeros' column are added in this cell, and
# the cfsv2 ensemble column (when enabled) is excluded from the load as well
non_pred_cols = {clim_col, 'zeros'}
if use_best_cfsv2:
    non_pred_cols.add(cfsv2_col)

loadable_pred_cols = set(pred_cols) - non_pred_cols
cols_to_load = set(['lat', 'lon', 'start_date', gt_col] + pred_cols) - non_pred_cols
lld_data = load_combined_data(
    "lat_lon_date_data", gt_id, horizon,
    model='default',
    columns=cols_to_load)

# Constant baseline column
if 'zeros' in pred_cols:
    lld_data['zeros'] = 0

# Keep only rows in which every loaded predictor is present
lld_data = lld_data.dropna(subset=loadable_pred_cols)

if clim_col in pred_cols:
    printf("Merging in climatology")
    tic()
    lld_data = clim_merge(lld_data, get_climatology(gt_id))
    toc()

In [None]:
if use_best_cfsv2:
    #
    # Add cfsv2 ensemble forecast as feature: the average of the forecasts
    # for leads first_lead..last_lead, with each later lead shifted down by
    # its offset from first_lead before averaging
    #
    printf("Forming cfsv2 ensemble forecast...")
    if horizon == "34w":
        shift, first_lead = 15, 14
    else:
        shift, first_lead = 29, 28
    last_lead = 29
    suffix = "-us" if gt_id.startswith("us_") else ""

    tic()
    data = get_forecast("subx_cfsv2-" + gt_id.split("_")[1] + suffix, shift=shift)
    toc()

    key_cols = ['lat', 'lon', 'start_date']

    def pivot_lead(lead):
        """Return the forecast column for `lead` as a start_date x (lat, lon) table."""
        lead_col = f'subx_cfsv2_{gt_col}-{lead}.5d_shift{shift}'
        return data[key_cols + [lead_col]].set_index(
            key_cols).squeeze().unstack(['lat', 'lon'])

    printf(f"Aggregating lead {first_lead} with shift {shift}")
    tic()
    cfsv2 = pivot_lead(first_lead).copy()
    toc()

    for offset, lead in enumerate(range(first_lead + 1, last_lead + 1), start=1):
        printf(f"Aggregating lead {lead} with shift {shift+offset}")
        tic()
        # Row-wise shift -- assumes one row per consecutive start_date; TODO confirm
        cfsv2 += pivot_lead(lead).shift(offset)
        toc()

    # Turn the running sum into a mean over all aggregated leads
    num_leads = last_lead - first_lead + 1
    cfsv2 /= num_leads

    # Drop dates with no forecasts and reshape to one value per (lat, lon, start_date)
    cfsv2 = cfsv2.dropna().unstack().rename(cfsv2_col)

    # Merge cfsv2 forecast with lld_data
    printf(f"Merging {cfsv2_col} with lld_data")
    tic()
    lld_data = lld_data.merge(
        cfsv2, left_on=['lat', 'lon', 'start_date'], right_index=True)
    toc()
    del cfsv2

In [None]:
# specify regression model
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so this cell needs an older scikit-learn; it is left
# unchanged because the saved weights may depend on it -- confirm before upgrading.
fit_intercept = True
model = linear_model.LinearRegression(fit_intercept=fit_intercept, normalize=True)

# Form predictions for each grid point (in parallel) using train / test split
# and the selected model
prediction_func = partial(fit_and_predict, model=model)
prediction_func_weights = partial(fit_and_predict_wgt, model=model)
num_cores = cpu_count()

# Store rmses
rmses = pd.Series(index=target_date_objs, dtype='float64')

# Restrict data to relevant columns and rows for which predictions can be made.
# The selection is built as a list in the frame's own column order: indexing
# a DataFrame with a set is deprecated in pandas 1.5 and raises TypeError in
# pandas 2.0.
wanted_cols = set(['start_date', 'lat', 'lon', gt_col, base_col] + x_cols)
relevant_cols = [col for col in lld_data.columns if col in wanted_cols]
lld_data = lld_data[relevant_cols].dropna(subset=x_cols + [base_col])

In [None]:

def get_filename(target_date, gt_id, horizon):
    """Return the relative path of the weights file for one target date.

    The path has the form
    ``weights/<YYYYMMDD>_<gt_id>_<horizon>_normalized_weights.h5``, where
    month and day are zero-padded to two digits.

    Args:
        target_date: object exposing integer ``year``, ``month`` and ``day``.
        gt_id: ground-truth identifier embedded in the file name.
        horizon: forecast horizon embedded in the file name.
    """
    date_stamp = "{}{:02d}{:02d}".format(
        target_date.year, target_date.month, target_date.day)
    return f"weights/{date_stamp}_{gt_id}_{horizon}_normalized_weights.h5"

def save_file(df, path):
    """Write `df` to an HDF5 file at `path` under the key ``"df"``.

    The parent directory of `path` is created first if it does not exist,
    so a missing output folder (e.g. ``weights/``) no longer makes the
    write fail.

    Args:
        df: object providing ``to_hdf(path, key=...)``.
        path: destination file path.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # A bare file name has no parent directory to create
        os.makedirs(parent_dir, exist_ok=True)
    df.to_hdf(path, key="df")

# NOTE(review): only target dates equal to the final entry of
# target_date_objs are processed; every other date is skipped. Confirm that
# restricting the run to the last date is intentional.
last_date = None if target_date_objs.empty else target_date_objs.iloc[-1]

for target_date_obj in target_date_objs:
    if target_date_obj != last_date:
        continue
    printf(target_date_obj)

    # Features must exist for the target date itself
    if not lld_data.start_date.isin([target_date_obj]).any():
        printf(f"warning: some features unavailable for target={target_date_obj}; skipping")
        continue

    target_date_str = target_date_obj.strftime('%Y%m%d')

    # No skip-if-already-produced check is applied: weights are always recomputed
    printf(f'target={target_date_str}')

    # Subset data based on margin (None keeps the whole year)
    sub_data = lld_data
    if margin_in_days is not None:
        tic()
        sub_data = month_day_subset(lld_data, target_date_obj, margin_in_days)
        toc()

    # Find the last observable training date for this target
    last_train_date = target_date_obj - start_delta

    # Only train on train_years worth of data
    if train_years != "all":
        tic()
        earliest_train_date = years_ago(last_train_date, train_years)
        sub_data = sub_data.loc[sub_data.start_date >= earliest_train_date]
        toc()

    # Fit one model per grid point in parallel, then print and save the result
    tic()
    grid_point_groups = sub_data.groupby(group_by_cols)
    weights = apply_parallel(
        grid_point_groups,
        prediction_func_weights,
        num_cores=num_cores,
        gt_col=gt_col,
        x_cols=x_cols,
        base_col=base_col,
        last_train_date=last_train_date,
        test_dates=[target_date_obj])
    path = get_filename(target_date_obj, gt_id, horizon)
    print(path)
    print(weights)
    save_file(weights, path)
    toc()

    # Disabled: clipping of raw precipitation predictions at zero
    #if gt_id.endswith("precip"):
    #    tic()
    #    preds['pred'] = np.maximum(preds['pred'],0)
    #    toc()

    # Disabled: evaluate and store error
    #rmse = np.sqrt(np.square(preds.pred - preds.truth).mean())
    #rmses.loc[target_date_obj] = rmse
    #print("-rmse: {}, score: {}".format(rmse, mean_rmse_to_score(rmse)))
    #mean_rmse = rmses.mean()
    #print("-mean rmse: {}, running score: {}".format(mean_rmse, mean_rmse_to_score(mean_rmse)))


In [None]:
## SCRATCH SPACE

# # Will load cfsv2 forecasts from this model
# cfsv2_model = "tuned_cfsv2"
# # TODO: can remove loading of subx data in prior cell
# lld_data = lld_data.drop(columns=[cfsv2_col])
# x_cols = [cfsv2_model if x==cfsv2_col else x for x in x_cols]
# # Restrict data to years in which cfsv2 available
# tic()
# first_year = get_first_year("subx_cfsv2")
# printf(f"Restricting to years >= {first_year}")
# lld_data = lld_data[lld_data.start_date.dt.year >= first_year]
# toc()
# # Preallocate dataframe for storing cfsv2_model forecasts
# printf(f"Allocating space for storing {cfsv2_model} forecasts")
# tic()
# cfsv2 = lld_data[['lat','lon','start_date',gt_col]].set_index(['lat','lon','start_date']).squeeze().unstack(['lat','lon'])
# cfsv2.loc[:] = np.nan
# toc()

# # Identify selected submodel of cfsv2_model
# cfsv2_submodel = get_selected_submodel_name(
#     cfsv2_model, gt_id=gt_id, horizon=horizon)
# # Prepare function for looking up forecast filename for a given target date
# get_filename_fn = partial(get_forecast_filename, gt_id=gt_id, horizon=horizon,
#                           model=cfsv2_model, submodel=cfsv2_submodel)
# # Load cfsv2_model forecasts
# printf(f"Loading {cfsv2_model} forecasts")
# tic()
# for date_obj in cfsv2.index:
#     if date_obj.day == 1 and date_obj.month == 1:
#         toc(); printf(f"-Loading {date_obj.year} forecasts"); tic()
#     try:
#         date_str = datetime.strftime(date_obj, '%Y%m%d')
#         filename = get_filename_fn(target_date_str=date_str)
#         cfsv2.loc[date_obj] = pd.read_hdf(filename).set_index(['lat','lon']).pred
#     except FileNotFoundError as e:
#         pass #printf(f"-warning: no forecast for {date_str}; skipping")
# toc()
# # Drop dates with no forecasts and reshape
# cfsv2 = cfsv2.dropna().unstack().rename(cfsv2_model)

# # Merge cfsv2 forecast with lld_data
# printf(f"Merging {cfsv2_model} with lld_data")
# tic()
# lld_data = pd.merge(lld_data, cfsv2, left_on=['lat','lon','start_date'], 
#                     right_index=True)
# toc()

In [None]:
## SCRATCH SPACE

# if use_best_cfsv2:
#     # Load and process data
#     printf("Loading cfsv2 data and averaging leads")
    
#     # Choose data shift based on horizon and first day to be averaged
#     base_shift = (15 if horizon == "34w" else 29) + cfsv2_first_day - 1
#     tic()
#     if gt_id.startswith('us'):
#         data = get_forecast("subx_cfsv2-"+gt_id.split("_")[1]+'-us', shift=base_shift)  
#     else:
#         data = get_forecast("subx_cfsv2-"+gt_id.split("_")[1], shift=base_shift)  
#     cols = ["subx_cfsv2_"+gt_id.split("_")[1]+"-{}.5d_shift{}".format(col,base_shift) 
#             for col in range(cfsv2_first_lead, cfsv2_last_lead+1)]
#     data[cfsv2_col] = data[cols].mean(axis=1)
#     toc()    

#     printf('Pivoting dataframe to have one row per start_date')
#     tic()
#     data = data[['lat','lon','start_date', cfsv2_col]].set_index(['lat','lon','start_date']).squeeze().unstack(['lat','lon'])
#     toc()

#     printf(f"Computing rolling mean over days {cfsv2_first_day}-{cfsv2_last_day}")
#     days = cfsv2_last_day - cfsv2_first_day + 1
#     tic()
#     data = data.rolling(f"{days}d").mean().dropna(how='any')
#     toc()

#     printf("Pivoting dataframe to long format and resetting index")
#     tic()
#     data = data.unstack().rename(cfsv2_col)
#     toc()    
    
#     printf("Merging cfsv2 data into lld data")
#     tic()
#     lld_data = lld_data.merge(data, on=['lat','lon','start_date'], how='inner')
#     del data
#     toc()