# About

Create a function that can be used in the 2nd stage regression to perform a time series cross validation. 
- Using an expanding window cross validation


The 2nd stage regression predicts the medical outcomes using the predicted PM2.5 (and separately with the actual pm2.5), as well as the same fixed effects from the first stage regression. 

In [155]:
# optional. I'm getting annoying warnings that I just want to ignore:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# basics
import pandas as pd 
import numpy as np
import os 
import re
from datetime import datetime
from tqdm.notebook import tqdm
tqdm.pandas()
import requests
import urllib
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import TimeSeriesSplit

# plotting
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import plotly.express as px
import seaborn as sns

# modeling
from patsy import dmatrices
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from statsmodels.sandbox.regression.gmm import IV2SLS
import xgboost as xgb

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [156]:
# keep this as false unless you want to save out the fitted model objects and results 
save_results = False

# Set Path

Add a new elif section for your path if you want

In [157]:
# local or gdrive
path_source = 'work'

if path_source == 'gdrive':
  from google.colab import drive
  drive.mount('/content/gdrive')
  data_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'
  
elif path_source == 'local':
  data_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'

elif path_source == 'work':
  data_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'

In [158]:
# read in our modeling data
df = pd.read_csv(os.path.join(data_path, 'modeling_data/modeling_data_joined_11-9.csv'))

# read in cornelia's healthcare data
df1 = pd.read_csv(os.path.join(data_path, 'medical/hematopoietic_cancers.csv')).iloc[:,1:]
df2 = pd.read_csv(os.path.join(data_path, 'medical/pediatric_vasculitis.csv')).iloc[:,1:]
df3 = pd.read_csv(os.path.join(data_path, 'medical/type_1_diabetes.csv')).iloc[:,1:]
df4 = pd.read_csv(os.path.join(data_path, 'medical/resp_cardio.csv')).iloc[:,1:]
df5 = pd.read_csv(os.path.join(data_path, 'medical/injuries_accidents.csv')).iloc[:,1:]

# Data Clean

In [159]:
# med data:

# get all distinct patzip_year_month
all_pats = df1['patzip_year_month'].to_list() + \
  df2['patzip_year_month'].to_list() + \
  df3['patzip_year_month'].to_list() + \
  df4['patzip_year_month'].to_list() + \
  df5['patzip_year_month'].to_list() 
all_pats = list(set(all_pats))
df_med = pd.DataFrame({'patzip_year_month': all_pats})

# rename columns more intuitively
df1 = df1.rename(columns={'number_of_visits': 'number_of_visits_hem_cancers'})
df2 = df2.rename(columns={'number_of_visits': 'number_of_visits_vasc'})
df3 = df3.rename(columns={'number_of_visits': 'number_of_visits_diab'})
df4 = df4.rename(columns={'number_of_visits': 'number_of_visits_resp_cardio'})
df5 = df5.rename(columns={'number_of_visits': 'number_of_visits_injuries'})

# now join all the diagnoses on this dataset
df_med = df_med\
  .merge(df1, on='patzip_year_month', how='left')\
  .merge(df2, on='patzip_year_month', how='left')\
  .merge(df3, on='patzip_year_month', how='left')\
  .merge(df4, on='patzip_year_month', how='left')\
  .merge(df5, on='patzip_year_month', how='left')

# join data
df['year_month'] = df['year_month'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))

df['zip_year_month'] = df['school_zip'].astype(str) + '-' +\
  df['year_month'].dt.year.astype(str) + '-' +\
  df['year_month'].dt.month.astype(str)

df = pd.merge(df, df_med, left_on='zip_year_month', right_on='patzip_year_month', how='left')
df = df.drop(columns = 'Unnamed: 0')

# for missing med data, assume there were 0 cases:
med_vars = ['hematopoietic_cancers', 'number_of_visits_hem_cancers', 
  'pediatric_vasculitis', 'number_of_visits_vasc', 
  'type_1_diabetes', 'number_of_visits_diab',
  'resp_cardio', 'number_of_visits_resp_cardio',
  'injuries_accidents', 'number_of_visits_injuries'
  ]
for var in med_vars:
  df[var] = df[var].fillna(0)

# fixing month datatype
df['month'] = df['month'].astype(str)

# Create response variables, which is visits / population
df['y_hematopoietic'] = 1000 * df['number_of_visits_hem_cancers'] / df['total_pop_under19']
df['y_vasculitis'] = 1000 * df['number_of_visits_vasc'] / df['total_pop_under19']
df['y_diabetes'] = 1000 * df['number_of_visits_diab'] / df['total_pop_under19']
df['y_resp_cardio'] = 1000 * df['number_of_visits_resp_cardio'] / df['total_pop_under19']
df['y_injuries'] = 1000 * df['number_of_visits_injuries'] / df['total_pop_under19']

# Create an option for a logged version of the treatment var (log(1+x)). this makes it normally distributed 
df['pm25_log'] = np.log1p(df['pm25'])

# create year trend feature
df['year_trend'] = df['year'] - 1999

# create county_month
df['county_month'] = df.apply(lambda df: df['month'].rjust(2, '0') + '_' + df['school_county_v2'], axis=1)

# create year_month_county (in case we want to just direclty use this var for the interaction effects)
df['year_month_county'] = df.apply(lambda df: str(df['year']) + '_' + df['month'] + '_' + df['school_county_v2'], axis=1)

# no need to one hot encode anymore, b/c data is already encoded 

# filter data to appropriate data range
df = df[df.year >= 2002]

In [160]:
# train/test split 
# keep 2018 as the held out test set 
df_test = df[df.year == 2018]
df = df[df.year != 2018]

In [161]:
# sort data on date
df = df.sort_values('year_month').reset_index(drop=True)

In [162]:
# Select variables for modeling
date_var = 'year_month'
num_vars = ['school_elevation_m', 'nearby_point_source_count', 'school_wspd', 'tax_liability_per_capita', 'school_temperature', 'school_count', 'pm25_r6', 'pm25_r12']
counties = [i for i in df.columns if re.search('^school_county_v2_', i)]
months = [i for i in df.columns if re.search ('^month_', i)]
# potentially use county_month instead of the above 

xvars = num_vars + counties + months 
yvar = ['y_hematopoietic']

# Cross validation function

Note, this function does not yet do the grid search for hyperparams, but just does the CV once. Update this later

In [163]:
def time_series_cv(
  df: pd.DataFrame, 
  xvars: list, 
  yvar: str, 
  hyperparams: dict = {'max_depth': [1, 5, 10], 'subsample': [.8, 1], 'eta': [.1, .3]}, 
  search_type='grid', 
  folds=5, 
  verbose=1):

  ''' 
  Inputs:
  - df: dataframe of your training data
  - xvars: a list of all the xvars to pass to xgboost
  - yvar: string of your target variable
  - verbose: optionality for diff amounts of printouts. Can be 0, 1, 2. 0 = silent, 1 = update after each fold, 2 = update after every single hyperparam combination. 
  - hyperparams: this must be a dictionary of lists. So each key is a xgb hyperparam, then it must have a list of values to tune with. 
    See the default for an example. Can put in an arbitrary number of hyperparam options. 
  
  Output:
  - dictionary with the following keys: ['fold', 'hyperparams', 'rmse_train', 'rmse_test']. 
  '''

  # this dictionary will hold all the final results
  final_res = {'fold':[], 'hyperparams':[], 'rmse_train': [], 'rmse_test': []}

  # get only necessary fields in df
  df = df[xvars + [yvar]]

  # set up the time series split class, to do an expanding window cross fold. 
  tss = TimeSeriesSplit(n_splits=folds)
  tss_folds = tss.split(df)
  all_folds = [i for i in tss_folds]

  # get all combinations of hyperparams
  def expand_grid(hyperparams):
    keys = list(hyperparams.keys())
    hyperparams_df = pd.DataFrame(np.array(np.meshgrid(*[hyperparams[key_i] for key_i in keys])).T.reshape(-1, len(keys)))
    hyperparams_df.columns = keys 
    return hyperparams_df

  df_hyperparams = expand_grid(hyperparams)

  # function to use later
  def get_rmse(dmat_train, df_train):
    ytrue = df_train[yvar].values.flatten()
    yhat = booster.predict(dmat_train)
    rmse = np.mean(((ytrue - yhat)**2)**.5)
    return rmse 

  # loop over each expanding time series window
  for fold_count,fold in enumerate(all_folds):
    if verbose > 0:
      print('Working on fold {}/{}'.format(fold_count+1, folds))

    df_train = df.loc[fold[0]]
    df_test = df.loc[fold[1]]

    # convert to xgb types
    dmat_train = xgb.DMatrix(df_train[xvars], df_train[yvar])
    dmat_test = xgb.DMatrix(df_test[xvars], df_test[yvar])

    # within each time series cross fold, perform a grid search with all hyperparam combinations and evaluate results. 
    if search_type == 'grid':
      for param_set_i in range(df_hyperparams.shape[0]):
        hyperparams_i = {x:y for x,y in zip(df_hyperparams.columns, df_hyperparams.loc[param_set_i].to_list())}
        
        # fix datatype for some vars
        if 'max_depth' in hyperparams_i.keys():
          hyperparams_i['max_depth'] = int(hyperparams_i['max_depth'])

        # fit xgb
        booster = xgb.train(
          hyperparams_i,
          dmat_train,
          num_boost_round=100, 
          early_stopping_rounds=15,
          evals = [(dmat_train, 'train'), (dmat_test, 'test')], 
          verbose_eval=False)
        
        # save results
        rmse_train = get_rmse(dmat_train, df_train)
        rmse_test = get_rmse(dmat_test, df_test)
        final_res['fold'].append(fold_count)
        final_res['hyperparams'].append(hyperparams_i)
        final_res['rmse_train'].append(rmse_train)
        final_res['rmse_test'].append(rmse_test)

        if verbose == 2:
          print('{}: rmse train: {:.3f}, rmse test: {:.3f}'.format(hyperparams_i, rmse_train, rmse_test))

    elif search_type == 'random': 
      pass 
      # haven't done this yet
  
  # print out final best hyperparams before returning the output
  output2 = pd.DataFrame({
    'hyperparams': final_res['hyperparams'],
    'fold': final_res['fold'],
    'rmse_train': final_res['rmse_train'],
    'rmse_test': final_res['rmse_test']
  })
  output2['hyperparams'] = output2['hyperparams'].astype(str)
  output2 = output2.groupby('hyperparams')[['rmse_train', 'rmse_test']].mean().reset_index().sort_values('rmse_test')
  print('best hyperparams: {}'.format(output2.iloc[0,0]))

  
  return final_res
  

In [164]:
output = time_series_cv(df, xvars = num_vars + counties + months, yvar = 'y_hematopoietic', 
  hyperparams = {'max_depth': [1, 5], 'subsample': [.8, 1], 'eta': [.1, .3], 'lambda': [1, .8]}, 
  search_type = 'grid', 
  folds = 5, 
  verbose=1)

Working on fold 1/5
Working on fold 2/5
Working on fold 3/5
Working on fold 4/5
Working on fold 5/5
best hyperparams: {'max_depth': 1, 'subsample': 1.0, 'eta': 0.3, 'lambda': 1.0}


## Optional

Organize the results manually. But I put this in the function to spit the best result at the end anyways. 

But this shows how you can manipulate and inspect the results.

In [165]:
output2 = pd.DataFrame({
  'hyperparams': output['hyperparams'],
  'fold': output['fold'],
  'rmse_train': output['rmse_train'],
  'rmse_test': output['rmse_test']
})
output2

Unnamed: 0,hyperparams,fold,rmse_train,rmse_test
0,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.1,...",0,0.098889,0.131185
1,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.1,...",0,0.096023,0.128173
2,"{'max_depth': 5, 'subsample': 0.8, 'eta': 0.1,...",0,0.124585,0.174960
3,"{'max_depth': 5, 'subsample': 1.0, 'eta': 0.1,...",0,0.148360,0.198804
4,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.3,...",0,0.090540,0.122365
...,...,...,...,...
75,"{'max_depth': 5, 'subsample': 1.0, 'eta': 0.1,...",4,0.255362,0.403228
76,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.3,...",4,0.254656,0.389784
77,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.3,...",4,0.254884,0.387370
78,"{'max_depth': 5, 'subsample': 0.8, 'eta': 0.3,...",4,0.238637,0.399839


In [166]:
output2['hyperparams'] = output2['hyperparams'].astype(str)
output_grp = output2.groupby('hyperparams')[['rmse_train', 'rmse_test']].mean().reset_index()
output_grp

Unnamed: 0,hyperparams,rmse_train,rmse_test
0,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.1,...",0.185906,0.27215
1,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.1,...",0.185906,0.272149
2,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.3,...",0.168777,0.256342
3,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.3,...",0.168776,0.256342
4,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.1,...",0.185906,0.271774
5,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.1,...",0.185906,0.271774
6,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.3,...",0.16919,0.255671
7,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.3,...",0.16919,0.25567
8,"{'max_depth': 5, 'subsample': 0.8, 'eta': 0.1,...",0.201704,0.300596
9,"{'max_depth': 5, 'subsample': 0.8, 'eta': 0.1,...",0.202083,0.30038


In [167]:
print('best hyperparams: {}'.format(output_grp.iloc[0,0]))

best hyperparams: {'max_depth': 1, 'subsample': 0.8, 'eta': 0.1, 'lambda': 0.8}
