# About

Create a function that can be used in the 2nd stage regression to perform a time series cross validation. 
- Using an expanding window cross validation


The 2nd stage regression predicts the medical outcomes using the predicted PM2.5 (and separately with the actual pm2.5), as well as the same fixed effects from the first stage regression. 

In [86]:
# optional. I'm getting annoying warnings that I just want to ignore:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# basics
import pandas as pd 
import numpy as np
import os 
import re
from datetime import datetime
from tqdm.notebook import tqdm
tqdm.pandas()
import requests
import urllib
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import TimeSeriesSplit

# plotting
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import plotly.express as px
import seaborn as sns

# modeling
from patsy import dmatrices
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from statsmodels.sandbox.regression.gmm import IV2SLS
import xgboost as xgb

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [87]:
# keep this as false unless you want to save out the fitted model objects and results 
save_results = False

# Set Path

Add a new elif section for your path if you want

In [88]:
# local or gdrive
path_source = 'work'

if path_source == 'gdrive':
  from google.colab import drive
  drive.mount('/content/gdrive')
  data_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'
  
elif path_source == 'local':
  data_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'

elif path_source == 'work':
  data_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'

In [89]:
# read in our modeling data
df = pd.read_csv(os.path.join(data_path, 'modeling_data/modeling_data_joined_11-9.csv'))

# read in cornelia's healthcare data
df1 = pd.read_csv(os.path.join(data_path, 'medical/hematopoietic_cancers.csv')).iloc[:,1:]
df2 = pd.read_csv(os.path.join(data_path, 'medical/pediatric_vasculitis.csv')).iloc[:,1:]
df3 = pd.read_csv(os.path.join(data_path, 'medical/type_1_diabetes.csv')).iloc[:,1:]
df4 = pd.read_csv(os.path.join(data_path, 'medical/resp_cardio.csv')).iloc[:,1:]
df5 = pd.read_csv(os.path.join(data_path, 'medical/injuries_accidents.csv')).iloc[:,1:]

# Data Clean

In [90]:
# med data:

# get all distinct patzip_year_month
all_pats = df1['patzip_year_month'].to_list() + \
  df2['patzip_year_month'].to_list() + \
  df3['patzip_year_month'].to_list() + \
  df4['patzip_year_month'].to_list() + \
  df5['patzip_year_month'].to_list() 
all_pats = list(set(all_pats))
df_med = pd.DataFrame({'patzip_year_month': all_pats})

# rename columns more intuitively
df1 = df1.rename(columns={'number_of_visits': 'number_of_visits_hem_cancers'})
df2 = df2.rename(columns={'number_of_visits': 'number_of_visits_vasc'})
df3 = df3.rename(columns={'number_of_visits': 'number_of_visits_diab'})
df4 = df4.rename(columns={'number_of_visits': 'number_of_visits_resp_cardio'})
df5 = df5.rename(columns={'number_of_visits': 'number_of_visits_injuries'})

# now join all the diagnoses on this dataset
df_med = df_med\
  .merge(df1, on='patzip_year_month', how='left')\
  .merge(df2, on='patzip_year_month', how='left')\
  .merge(df3, on='patzip_year_month', how='left')\
  .merge(df4, on='patzip_year_month', how='left')\
  .merge(df5, on='patzip_year_month', how='left')

# join data
df['year_month'] = df['year_month'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))

df['zip_year_month'] = df['school_zip'].astype(str) + '-' +\
  df['year_month'].dt.year.astype(str) + '-' +\
  df['year_month'].dt.month.astype(str)

df = pd.merge(df, df_med, left_on='zip_year_month', right_on='patzip_year_month', how='left')
df = df.drop(columns = 'Unnamed: 0')

# for missing med data, assume there were 0 cases:
med_vars = ['hematopoietic_cancers', 'number_of_visits_hem_cancers', 
  'pediatric_vasculitis', 'number_of_visits_vasc', 
  'type_1_diabetes', 'number_of_visits_diab',
  'resp_cardio', 'number_of_visits_resp_cardio',
  'injuries_accidents', 'number_of_visits_injuries'
  ]
for var in med_vars:
  df[var] = df[var].fillna(0)

# fixing month datatype
df['month'] = df['month'].astype(str)

# Create response variables, which is visits / population
df['y_hematopoietic'] = 1000 * df['number_of_visits_hem_cancers'] / df['total_pop_under19']
df['y_vasculitis'] = 1000 * df['number_of_visits_vasc'] / df['total_pop_under19']
df['y_diabetes'] = 1000 * df['number_of_visits_diab'] / df['total_pop_under19']
df['y_resp_cardio'] = 1000 * df['number_of_visits_resp_cardio'] / df['total_pop_under19']
df['y_injuries'] = 1000 * df['number_of_visits_injuries'] / df['total_pop_under19']

# Create an option for a logged version of the treatment var (log(1+x)). this makes it normally distributed 
df['pm25_log'] = np.log1p(df['pm25'])

# create year trend feature
df['year_trend'] = df['year'] - 1999

# create county_month
df['county_month'] = df.apply(lambda df: df['month'].rjust(2, '0') + '_' + df['school_county_v2'], axis=1)

# create year_month_county (in case we want to just direclty use this var for the interaction effects)
df['year_month_county'] = df.apply(lambda df: str(df['year']) + '_' + df['month'] + '_' + df['school_county_v2'], axis=1)

# no need to one hot encode anymore, b/c data is already encoded 

# filter data to appropriate data range
df = df[df.year >= 2002]

In [91]:
# train/test split 
# keep 2018 as the held out test set 
df_test = df[df.year == 2018]
df = df[df.year != 2018]

In [92]:
# sort data on date
df = df.sort_values('year_month').reset_index(drop=True)

In [145]:
# Select variables for modeling
date_var = 'year_month'
num_vars = ['school_elevation_m', 'nearby_point_source_count', 'school_wspd', 'tax_liability_per_capita', 'school_temperature', 'school_count', 'pm25_r6', 'pm25_r12']
counties = [i for i in df.columns if re.search('^school_county_v2_', i)]
months = [i for i in df.columns if re.search ('^month_', i)]
# potentially use county_month instead of the above 

xvars = num_vars + counties + months 
yvar = ['y_hematopoietic']

# Cross validation function

Note, this function does not yet do the grid search for hyperparams, but just does the CV once. Update this later

In [162]:
def time_series_cv(df, xvars, yvar, hyperparams = {}, search_type='grid', folds=5, verbosity=0):
  ''' 
  verbosity = 0, 1 or 2. 0 = silent. 1 = . 2 = show results after every cross fold
  '''

  df = df[xvars + [yvar]]

  tss = TimeSeriesSplit(n_splits=folds)
  res = tss.split(df)
  all_folds = [i for i in res]

  res = {'fold':[], 'hyperparams':[], 'rmse_train': [], 'rmse_test': []}

  # add outer for loop to do a grid search on all hyperparams
  for i,fold in enumerate(all_folds):
    df_train = df.loc[fold[0]]
    df_test = df.loc[fold[1]]

    # convert to xgb types
    dmat_train = xgb.DMatrix(df_train[xvars], df_train[yvar])
    dmat_test = xgb.DMatrix(df_test[xvars], df_test[yvar])

    # fit xgb
    hyperparams = { 
      'max_depth': 10, 
      'subsample': .8,
      'eta': .3,  
      'eval_metric': 'rmse', 
      }
    booster = xgb.train(
      hyperparams,
      dmat_train,
      num_boost_round=100,
      early_stopping_rounds=15,
      evals = [(dmat_train, 'train'), (dmat_test, 'test')], 
      verbose_eval=False
      )

    def get_rmse(dmat_train, df_train):
      ytrue = df_train[yvar].values.flatten()
      yhat = booster.predict(dmat_train)
      rmse = np.mean(((ytrue - yhat)**2)**.5)
      return rmse 
    
    # save results
    res['fold'].append(i)
    res['hyperparams'].append(hyperparams)
    res['rmse_train'].append(get_rmse(dmat_train, df_train))
    res['rmse_test'].append(get_rmse(dmat_test, df_test))
  
  return res
  

In [163]:
output = time_series_cv(df, xvars = num_vars + counties + months, yvar = 'y_hematopoietic')
output

{'fold': [0, 1, 2, 3, 4],
 'hyperparams': [{'max_depth': 10,
   'subsample': 0.8,
   'eta': 0.3,
   'eval_metric': 'rmse'},
  {'max_depth': 10, 'subsample': 0.8, 'eta': 0.3, 'eval_metric': 'rmse'},
  {'max_depth': 10, 'subsample': 0.8, 'eta': 0.3, 'eval_metric': 'rmse'},
  {'max_depth': 10, 'subsample': 0.8, 'eta': 0.3, 'eval_metric': 'rmse'},
  {'max_depth': 10, 'subsample': 0.8, 'eta': 0.3, 'eval_metric': 'rmse'}],
 'rmse_train': [0.03660981847353131,
  0.07760893534128871,
  0.11936492482290355,
  0.16205563841875828,
  0.20558345013313914],
 'rmse_test': [0.1496672829559056,
  0.2058065695515374,
  0.2948592276933558,
  0.33937265398584654,
  0.4241127035076277]}