# About

Create a function that can be used in the 2nd stage regression to perform a time series cross validation. 
- Using an expanding window cross validation


The 2nd stage regression predicts the medical outcomes using the predicted PM2.5 (and separately with the actual pm2.5), as well as the same fixed effects from the first stage regression. 

In [1]:
# optional. I'm getting annoying warnings that I just want to ignore:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# basics
import pandas as pd 
import numpy as np
import os 
import re
from datetime import datetime
from tqdm.notebook import tqdm
tqdm.pandas()
import requests
import urllib
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import TimeSeriesSplit

# plotting
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import plotly.express as px
import seaborn as sns

# modeling
from patsy import dmatrices
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from statsmodels.sandbox.regression.gmm import IV2SLS
import xgboost as xgb

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [2]:
# keep this as false unless you want to save out the fitted model objects and results 
save_results = False

Select your predictor, lead, lag times, and lag style here.

In [3]:
predictor = 'central_wind_alignment_180_high'
lead_time = '6'
lag_time = '3'
lag_style = 'fwd'

# define lead time for IV: 'last_month', 'r6', 'r9', 'r12'
IV_lead = "r" + str(lead_time)
HO_lag = lag_style + str(lag_time)

if IV_lead:
    IV_lead_input = "_" + IV_lead 
else:
    # don't add underscore if empty string
    IV_lead_input = IV_lead

# define lag time for Health Outcome: '', 'fwd3', 'cent3', 'fwd6', 'cent6', 'fwd12', 'cent12'
if HO_lag:
    HO_lag_input = "_" + HO_lag 
else:
    # don't add underscore if empty string
    HO_lag_input = HO_lag

# IV options: 1 month, 6 months, 9 months, 12 months
IV_window_col = [f'pm25{IV_lead_input}']

# health outcome options (fwd or cent): 1 month, 3 months, 6 months, 12 months
health_outcome_window_col = [f'y_injuries{HO_lag_input}']

filter_cols = IV_window_col + health_outcome_window_col # columns to filter out at the beginning and end of df, before modeling

target_name_s1 = f'pm25{IV_lead_input}'
predictor_name_s1 = f'{predictor}{IV_lead_input}'

print(f"Stage 1\nTarget Name (target_name_s1) = {target_name_s1}\nPredictor Name (predictor_name_s1) = {predictor_name_s1}")

print(f"\nStage 2\nHealth Outcome Lag Input (HO_lag_input) = {HO_lag_input}")

Stage 1
Target Name (target_name_s1) = pm25_r6
Predictor Name (predictor_name_s1) = central_wind_alignment_180_high_r6

Stage 2
Health Outcome Lag Input (HO_lag_input) = _fwd3


# Set Path

Add a new elif section for your path if you want

In [4]:
# local or gdrive
path_source = 'local'

if path_source == 'gdrive':
  from google.colab import drive
  drive.mount('/content/gdrive')
  data_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'
  
elif path_source == 'local':
  data_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'

  data_path = 'G:\\.shortcut-targets-by-id\\11wLy1WKwOTcthBs1rpfEzkqax2BZG-6E\\W210_Capstone\\Data'
  fitted_models_path = 'G:\\.shortcut-targets-by-id\\11wLy1WKwOTcthBs1rpfEzkqax2BZG-6E\W210_Capstone\\fitted_models\\2022-10-23'

elif path_source == 'work':
  data_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'
  fitted_models_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/fitted_models/2022-10-23'

In [5]:
# read in our modeling data
df = pd.read_csv(os.path.join(data_path, 'modeling_data/modeling_data_joined_11-9.csv'))

# read in cornelia's healthcare data
df1 = pd.read_csv(os.path.join(data_path, 'medical/hematopoietic_cancers.csv')).iloc[:,1:]
df2 = pd.read_csv(os.path.join(data_path, 'medical/pediatric_vasculitis.csv')).iloc[:,1:]
df3 = pd.read_csv(os.path.join(data_path, 'medical/type_1_diabetes.csv')).iloc[:,1:]
df4 = pd.read_csv(os.path.join(data_path, 'medical/resp_cardio.csv')).iloc[:,1:]
df5 = pd.read_csv(os.path.join(data_path, 'medical/injuries_accidents.csv')).iloc[:,1:]

In [6]:
temp = df[(df.year <= 2017) & (df.year >= 2002)].isna().sum()
temp[temp > 0]

pm25_last_month      24
pm25_r6             146
pm25_r9             224
pm25_r12            302
pm25_r24           1065
pm25_slope6         146
pm25_slope9         224
pm25_slope12        302
pm25_slope24       1065
pm25_lag_12mo       302
dtype: int64

In [7]:
temp = df[(df.year <= 2017)].isna().sum()
temp[temp > 0]

pm25_last_month                     1387
pm25_r6                             8322
pm25_r9                            12483
pm25_r12                           16644
pm25_r24                           33288
pm25_slope6                         8322
pm25_slope9                        12483
pm25_slope12                       16644
pm25_slope24                       33288
pm25_lag_12mo                      16644
central_wind_alignment_180_high    15881
ps_pm25_tpy_top_5                  15881
school_to_ps_geod_dist_m_top_5     15881
avg_wspd_top_5                     15881
avg_u                              15881
avg_v                              15881
Izmy_v1_unnormed                   15881
Izmy_v2_nodist_unnormed            15881
Izmy_v3_normed_D_and_TPY           15881
Izmy_v4_nodist_normed_TPY          15881
Izmy_v5_all_normed                 15881
schools                            15881
dtype: int64

In [8]:
temp = df[(df.year >= 2002)].isna().sum()
temp[temp > 0]

pm25_last_month                       24
pm25_r6                              146
pm25_r9                              224
pm25_r12                             302
pm25_r24                            1065
pm25_slope6                          146
pm25_slope9                          224
pm25_slope12                         302
pm25_slope24                        1065
pm25_lag_12mo                        302
central_wind_alignment_180_high    16266
ps_pm25_tpy_top_5                  16266
school_to_ps_geod_dist_m_top_5     16266
avg_wspd_top_5                     16266
avg_u                              16266
avg_v                              16266
Izmy_v1_unnormed                   16266
Izmy_v2_nodist_unnormed            16266
Izmy_v3_normed_D_and_TPY           16266
Izmy_v4_nodist_normed_TPY          16266
Izmy_v5_all_normed                 16266
schools                            16266
dtype: int64

# Data Clean

### Make Rolling Columns for IVs

In [9]:
def roll_selected_cols(df, cols_to_roll:list = ['Izmy_v1_unnormed'\
    ,'Izmy_v2_nodist_unnormed' \
    ,'Izmy_v3_normed_D_and_TPY' \
    ,'Izmy_v4_nodist_normed_TPY' \
    ,'Izmy_v5_all_normed']
    ,rolling_periods:list = [1, 6, 9, 12]):

    """Generates rolling averages for the input variables over the input time periods.
    Inputs: df (pd dataframe): contains the data on a y-m level
            cols_to_roll (list): list of columns to generate rolling avgs--must be in df
            rolling_periods (list): list of time windows (in months) to roll over
            
    Outputs: df: Pandas dataframe containing the new columns
             all_cols: list of list containing the new columns, separated by input type"""
    
    df_int = df.copy().sort_values(['school_zip', 'year_month'])
    
    all_cols_int = []

    # Roll each variable
    for col_index in range(len(cols_to_roll)):
        new_cols = []

        col_to_roll = cols_to_roll[col_index]
        rolling_periods = [1, 6, 9, 12]

        for period in rolling_periods:
            df_int[f'{col_to_roll}_r{period}'] = df_int.groupby('school_zip')[col_to_roll]\
                .apply(lambda x: x.rolling(window=period, min_periods=period, closed='left').mean())
            
            new_cols.append(col_to_roll + "_r" + str(period))

        all_cols_int.append([col_to_roll] + new_cols)
        
    return df_int, all_cols_int

In [10]:
cols_to_roll = ['Izmy_v1_unnormed',
'Izmy_v2_nodist_unnormed',
'Izmy_v3_normed_D_and_TPY',
'Izmy_v4_nodist_normed_TPY',
'Izmy_v5_all_normed',
'central_wind_alignment_180_high']

rolling_periods = [1, 6, 9, 12]

df, all_cols = roll_selected_cols(df=df, cols_to_roll=cols_to_roll, rolling_periods=rolling_periods)

In [11]:
# rename the last month column just to be consistent and safe
df.rename(columns={'pm25_last_month': 'pm25_r1'}, inplace=True)

In [12]:
temp = df[(df.year <= 2017) & (df.year >= 2002)].isna().sum()
temp[temp > 0]

pm25_r1                                  24
pm25_r6                                 146
pm25_r9                                 224
pm25_r12                                302
pm25_r24                               1065
pm25_slope6                             146
pm25_slope9                             224
pm25_slope12                            302
pm25_slope24                           1065
pm25_lag_12mo                           302
Izmy_v1_unnormed_r1                      24
Izmy_v1_unnormed_r6                     146
Izmy_v1_unnormed_r9                     224
Izmy_v1_unnormed_r12                    302
Izmy_v2_nodist_unnormed_r1               24
Izmy_v2_nodist_unnormed_r6              146
Izmy_v2_nodist_unnormed_r9              224
Izmy_v2_nodist_unnormed_r12             302
Izmy_v3_normed_D_and_TPY_r1              24
Izmy_v3_normed_D_and_TPY_r6             146
Izmy_v3_normed_D_and_TPY_r9             224
Izmy_v3_normed_D_and_TPY_r12            302
Izmy_v4_nodist_normed_TPY_r1    

### Clean and Merge in Health Outcome Data

#### Fill in nulls conditionally on merged datasets

- the problem: for each health outcome, we want to fill in the nulls for a zipcode with 0's only if that row occurred after the first non-zero/not null visit in that zipcode for that health outcome. Keep them as nulls otherwise.

- So basically a zipcode will keep the nulls if they're on a date before the first visit seen for that health outcome, nulls will become 0 after the first visit seen for that health outcome.

In [13]:
def filter_nans(df, visits_cols = ['visits_hematopoietic_cancers', 'visits_injuries_accidents',
       'visits_type_1_diabetes', 'visits_pediatric_vasculitis',
       'visits_resp_cardio']):
    """Function to generate columns in place that replace NaNs with 0's only if that 
    row occurred after the first non-zero/not null visit in that zipcode for the specific
    health outcome. Keeps them as nulls otherwise.

    Args:
        df (DataFrame): Input dataframe
        visits_cols (list, optional): list of columns to selectively filter NaNs
    Returns:
        DataFrame with columns replaced with their NaN-filtered versions
    """

    def get_rowIndex(row):
        """Function intended for applying across df rows

        Args:
            row (int): row

        Returns:
            int: index of row
        """
      
        return row.name

    def compare_and_replace(orig_visits, groupby_index, rowIndex):
        """Function intended for applying across df rows
         Generates a new column selectively replacing NaNs with 0's
        Args:
            orig_visits (float): original column that needs to be filtered
            groupby_index (int): first index at which a non-NaN value occurs
            rowIndex (int): column containing indices of each row

        Returns:
            float or NaN
        """
        if rowIndex < groupby_index:
            orig_visits = orig_visits
        else:
            if pd.isnull(orig_visits):
                orig_visits = 0
            else:
                orig_visits = orig_visits
        return orig_visits
    
    # group df by school_zip, year_month
    df_grouped_schools = df.groupby(['school_zip', 'year_month']).tail(1)

    # generate row index
    df_grouped_schools['rowIndex'] = df_grouped_schools.apply(get_rowIndex, axis=1)
    
    # add new columns
    for i in visits_cols:
        try:
            # get the first index where a non-null value occurs in the series
            visits_series = pd.Series(df_grouped_schools[i])
            bool_not_null = visits_series.notnull()
            all_indices_not_null = np.where(bool_not_null)[0]
            groupby_index = all_indices_not_null[0]

            print('col name: {}'.format(i))
            print('groupby index: {}'.format(groupby_index))
        except:
            print("broken")
        try:
            # generate columns
            df_grouped_schools[i] = df_grouped_schools.apply(lambda row: compare_and_replace(row[i], groupby_index, row['rowIndex']), axis=1)
        except KeyError:
            print('Y')

    # drop rowIndex col
    df_grouped_schools.drop(columns=['rowIndex'], inplace=True)

    return df_grouped_schools

In [27]:
# Michelle's dummy_df for testing
dummy_df = pd.DataFrame()
dummy_df['school_zip'] = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]

dummy_df['year_month'] = ['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01', 
                            '2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01',
                            '2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01','2000-05-01', '2000-06-01', '2000-07-01', '2000-08-01']
dummy_df['visits_hematopoietic_cancers'] = [None, None, 1, None, 
                                            None, None, None, None, 
                                            None, None, None, 1, None, None, 1, 1]
dummy_df['visits_injuries_accidents'] = [1, None, 1, None, 
                                        None, None, 1, None, 
                                        1, None, None, None, 1, 1, 1, 1]

display(dummy_df)

Unnamed: 0,school_zip,year_month,visits_hematopoietic_cancers,visits_injuries_accidents
0,1,2000-01-01,,1.0
1,1,2000-02-01,,
2,1,2000-03-01,1.0,1.0
3,1,2000-04-01,,
4,2,2000-01-01,,
5,2,2000-02-01,,
6,2,2000-03-01,,1.0
7,2,2000-04-01,,
8,3,2000-01-01,,1.0
9,3,2000-02-01,,


In [15]:
dummy_df = pd.DataFrame()
dummy_df['school_zip'] = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]

dummy_df['year_month'] = ['2000-01-01', '2000-02-01', '2000-03-01', '2000-04-01']*3
dummy_df['visits_hematopoietic_cancers'] = [None, None, 1, None, 1, None, None, None, None, None, None, None]
dummy_df['visits_injuries_accidents'] = [1, None, 1, None, None, None, None, None, 1, None, None, None]

dummy_df.sort_values(['school_zip', 'year_month'], inplace=True)

display(dummy_df)

Unnamed: 0,school_zip,year_month,visits_hematopoietic_cancers,visits_injuries_accidents
0,1,2000-01-01,,1.0
1,1,2000-02-01,,
2,1,2000-03-01,1.0,1.0
3,1,2000-04-01,,
4,2,2000-01-01,1.0,
5,2,2000-02-01,,
6,2,2000-03-01,,
7,2,2000-04-01,,
8,3,2000-01-01,,1.0
9,3,2000-02-01,,


In [16]:
dummy_df_2 = filter_nans(dummy_df, ['visits_hematopoietic_cancers', 'visits_injuries_accidents'])
display(dummy_df_2)

col name: visits_hematopoietic_cancers
groupby index: 2
col name: visits_injuries_accidents
groupby index: 0


Unnamed: 0,school_zip,year_month,visits_hematopoietic_cancers,visits_injuries_accidents
0,1,2000-01-01,,1.0
1,1,2000-02-01,,0.0
2,1,2000-03-01,1.0,1.0
3,1,2000-04-01,0.0,0.0
4,2,2000-01-01,1.0,0.0
5,2,2000-02-01,0.0,0.0
6,2,2000-03-01,0.0,0.0
7,2,2000-04-01,0.0,0.0
8,3,2000-01-01,0.0,1.0
9,3,2000-02-01,0.0,0.0


In [17]:
# med data:

# get all distinct patzip_year_month
all_pats = df1['patzip_year_month'].to_list() + \
  df2['patzip_year_month'].to_list() + \
  df3['patzip_year_month'].to_list() + \
  df4['patzip_year_month'].to_list() + \
  df5['patzip_year_month'].to_list() 
all_pats = list(set(all_pats))
df_med = pd.DataFrame({'patzip_year_month': all_pats})

# rename columns more intuitively
df1 = df1.rename(columns={'number_of_visits': 'number_of_visits_hem_cancers'})
df2 = df2.rename(columns={'number_of_visits': 'number_of_visits_vasc'})
df3 = df3.rename(columns={'number_of_visits': 'number_of_visits_diab'})
df4 = df4.rename(columns={'number_of_visits': 'number_of_visits_resp_cardio'})
df5 = df5.rename(columns={'number_of_visits': 'number_of_visits_injuries'})

# now join all the diagnoses on this dataset
df_med = df_med\
  .merge(df1, on='patzip_year_month', how='left')\
  .merge(df2, on='patzip_year_month', how='left')\
  .merge(df3, on='patzip_year_month', how='left')\
  .merge(df4, on='patzip_year_month', how='left')\
  .merge(df5, on='patzip_year_month', how='left')

# join data
if isinstance(df.year_month[0], str):
  # if year month is still a string, convert it to datetime
  # don't try if already converted
    df['year_month'] = df['year_month'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))

df['zip_year_month'] = df['school_zip'].astype(str) + '-' +\
  df['year_month'].dt.year.astype(str) + '-' +\
  df['year_month'].dt.month.astype(str)

df = pd.merge(df, df_med, left_on='zip_year_month', right_on='patzip_year_month', how='left')
df = df.drop(columns = 'Unnamed: 0')

# for missing med data, assume there were 0 cases:
med_vars = ['hematopoietic_cancers', 'number_of_visits_hem_cancers', 
  'pediatric_vasculitis', 'number_of_visits_vasc', 
  'type_1_diabetes', 'number_of_visits_diab',
  'resp_cardio', 'number_of_visits_resp_cardio',
  'injuries_accidents', 'number_of_visits_injuries'
  ]


# for var in med_vars:
#   df[var] = df[var].fillna(0)


# df.sort_values(['school_zip', 'year_month'], inplace=True)

# Insert code here to populate na's for each HO with 0, only if there was a HO in this zipcode before
df = filter_nans(df, visits_cols = ['number_of_visits_hem_cancers', 'number_of_visits_vasc', 
'number_of_visits_diab', 'number_of_visits_resp_cardio', 'number_of_visits_injuries'])


# fixing month datatype
df['month'] = df['month'].astype(str)

# Create response variables, which is visits / population
df['y_hematopoietic'] = 1000 * df['number_of_visits_hem_cancers'] / df['total_pop_under19']
df['y_vasculitis'] = 1000 * df['number_of_visits_vasc'] / df['total_pop_under19']
df['y_diabetes'] = 1000 * df['number_of_visits_diab'] / df['total_pop_under19']
df['y_resp_cardio'] = 1000 * df['number_of_visits_resp_cardio'] / df['total_pop_under19']
df['y_injuries'] = 1000 * df['number_of_visits_injuries'] / df['total_pop_under19']

# Create an option for a logged version of the treatment var (log(1+x)). this makes it normally distributed 
df['pm25_log'] = np.log1p(df['pm25'])

# create year trend feature
df['year_trend'] = df['year'] - 1999

# create county_month
df['county_month'] = df.apply(lambda df: df['month'].rjust(2, '0') + '_' + df['school_county_v2'], axis=1)

# create year_month_county (in case we want to just direclty use this var for the interaction effects)
df['year_month_county'] = df.apply(lambda df: str(df['year']) + '_' + df['month'] + '_' + df['school_county_v2'], axis=1)

# no need to one hot encode anymore, b/c data is already encoded 



col name: number_of_visits_hem_cancers
groupby index: 38116
col name: number_of_visits_vasc
groupby index: 37959
col name: number_of_visits_diab
groupby index: 38024
col name: number_of_visits_resp_cardio
groupby index: 37921
col name: number_of_visits_injuries
groupby index: 37928


### Make Rolling HO Sum Columns

In [18]:
# train/test split 
# keep 2018 as the held out test set 
df_test_final = df[df.year == 2018]
df = df[df.year != 2018]

In [19]:
# get rolling n month sum

def create_rolling_sum(df, var_name:str = 'number_of_visits_hem_cancers', num_months=3, center_arg:bool = False):
  """
    Creates rolling sums for the number of visits for a given health outcome. 
    Overwrite your dataframe with the output.
    Function saves the result as a column into the dataframe with subscripts 
    - '{var_name}_fwd{number of months}' for forward sums
    - '{var_name}_cent{number of months}' for centered sums

    Function includes the current month as one of the months in num_months.

    Dataframe input MUST be sorted by ['school_zip', 'year_month'] ahead of time.

    `df = df.sort_values(['school_zip', 'year_month'])`

    Suggested: filter out tail end of dates so rolling averages are not filled with imputed values.

  Args:
      `df` (dataframe): dataframe having columns for 'school_zip', datetime 'year_month', and number of visits. Dataframe must be sorted by 
      `var_name` (str, optional): health outcome number of visits. Defaults to 'number_of_visits_hem_cancers'.
      `num_months` (int, optional): Number of months to take rolling sum over. Defaults to 3.
      `center_arg` (bool, optional): If this sum should be centered on current month. Defaults to False.

  Returns:
      `df_int`: returns dataframe with column added
  """
  df_int = df.copy().sort_values(['school_zip', 'year_month'])
  
  if center_arg:
    df_int[f'{var_name}_cent{num_months}'] = df_int.groupby('school_zip')[var_name]\
                                      .apply(lambda x:x.rolling(num_months, center=True).sum())
  else:
    df_int[f'{var_name}_fwd{num_months}'] = df_int.groupby('school_zip')[var_name]\
                                      .apply(lambda x:x.rolling(num_months).sum().shift(1-num_months))

  
  return df_int 

In [20]:
print(min(df.year))
print(max(df.year))

2000
2017


In [21]:
# quality checking the na filtering
df.sort_values(['school_zip', 'year_month'])[df.school_zip == 97635].head(20)

Unnamed: 0,year_month,school_zip,school_county_v2,school_region_name,pm25,school_elevation_m,ps_elevation_m,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,pop_under19_male,pop_under19_female,total_population,total_population_male,total_population_female,point_source_pm25_tpy,dist_school_to_ps_m,angle_to_school,ps_wspd_merge,school_wdir_wrt_0n,ps_wdir_wrt_0n,school_wind_alignment,ps_wind_alignment,avg_wind_speed,avg_wind_alignment,avg_wind_alignment_cosine,nearby_point_source_count,school_wspd,ca_agi_per_returns,total_tax_liability,tax_liability_per_capita,school_temperature,ps_temperature,school_count,pm25_r1,pm25_r6,pm25_r9,pm25_r12,pm25_r24,pm25_slope6,pm25_slope9,pm25_slope12,pm25_slope24,pm25_lag_12mo,year,month,school_county_v2_alameda,school_county_v2_alpine,school_county_v2_amador,school_county_v2_butte,school_county_v2_calaveras,school_county_v2_colusa,school_county_v2_contra_costa,school_county_v2_del_norte,school_county_v2_el_dorado,school_county_v2_fresno,school_county_v2_glenn,school_county_v2_humboldt,school_county_v2_imperial,school_county_v2_inyo,school_county_v2_kern,school_county_v2_kings,school_county_v2_lake,school_county_v2_lassen,school_county_v2_los_angeles,school_county_v2_madera,school_county_v2_marin,school_county_v2_mariposa,school_county_v2_mendocino,school_county_v2_merced,school_county_v2_modoc,school_county_v2_mono,school_county_v2_monterey,school_county_v2_napa,school_county_v2_nevada,school_county_v2_orange,school_county_v2_placer,school_county_v2_plumas,school_county_v2_riverside,school_county_v2_sacramento,school_county_v2_san_benito,school_county_v2_san_bernardino,school_county_v2_san_diego,school_county_v2_san_francisco,school_county_v2_san_joaquin,school_county_v2_san_luis_obispo,school_county_v2_san_mateo,school_county_v2_santa_barbara,school_county_v2_santa_clara,school_county_v2_santa_cruz,school_county_v2_shasta,school_county_v2_sierra,school_county_v2_siskiyou,school_county_v2_solano,school_county_v2_sonoma,school_county_v2_stanislaus,school_county_v2_sutter,school_county_v2_tehama,school_county_v2_trinity,school_county_v2_tulare,school_county_v2_tuolumne,school_county_v2_ventura,school_county_v2_yolo,school_county_v2_yuba,month_01,month_02,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,y-m,central_wind_alignment_180_high,ps_pm25_tpy_top_5,school_to_ps_geod_dist_m_top_5,avg_wspd_top_5,avg_u,avg_v,Izmy_v1_unnormed,Izmy_v2_nodist_unnormed,Izmy_v3_normed_D_and_TPY,Izmy_v4_nodist_normed_TPY,Izmy_v5_all_normed,schools,Izmy_v1_unnormed_r1,Izmy_v1_unnormed_r6,Izmy_v1_unnormed_r9,Izmy_v1_unnormed_r12,Izmy_v2_nodist_unnormed_r1,Izmy_v2_nodist_unnormed_r6,Izmy_v2_nodist_unnormed_r9,Izmy_v2_nodist_unnormed_r12,Izmy_v3_normed_D_and_TPY_r1,Izmy_v3_normed_D_and_TPY_r6,Izmy_v3_normed_D_and_TPY_r9,Izmy_v3_normed_D_and_TPY_r12,Izmy_v4_nodist_normed_TPY_r1,Izmy_v4_nodist_normed_TPY_r6,Izmy_v4_nodist_normed_TPY_r9,Izmy_v4_nodist_normed_TPY_r12,Izmy_v5_all_normed_r1,Izmy_v5_all_normed_r6,Izmy_v5_all_normed_r9,Izmy_v5_all_normed_r12,central_wind_alignment_180_high_r1,central_wind_alignment_180_high_r6,central_wind_alignment_180_high_r9,central_wind_alignment_180_high_r12,zip_year_month,patzip_year_month,hematopoietic_cancers,number_of_visits_hem_cancers,pediatric_vasculitis,number_of_visits_vasc,type_1_diabetes,number_of_visits_diab,resp_cardio,number_of_visits_resp_cardio,injuries_accidents,number_of_visits_injuries,y_hematopoietic,y_vasculitis,y_diabetes,y_resp_cardio,y_injuries,pm25_log,year_trend,county_month,year_month_county
310935,2000-01-01,97635,Modoc,Superior California,0.64375,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.084716,30.439583,-17.114467,12.161445,59.715496,0.813687,35.93847,1.809648,0.0,1.542658,59799.084409,38093910.0,109465.250814,-0.85,16.993708,1,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-1,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.49698,1,01_Modoc,2000_1_Modoc
310936,2000-02-01,97635,Modoc,Superior California,3.246875,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.107032,22.712163,-9.383051,19.888865,51.984079,1.141384,35.936472,1.809668,0.0,2.175736,59799.084409,38093910.0,109465.250814,8.014286,16.993708,1,0.64375,,,,,,,,,,2000,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2000-02,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-2,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.446183,1,02_Modoc,2000_2_Modoc
310937,2000-03-01,97635,Modoc,Superior California,2.928125,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.010013,79.405849,133.183512,36.80482,90.582483,0.389806,63.693652,1.443171,0.0,0.7696,59799.084409,38093910.0,109465.250814,8.014286,16.993708,1,3.246875,,,,,,,,,,2000,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2000-03,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-3,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.368162,1,03_Modoc,2000_3_Modoc
310938,2000-04-01,97635,Modoc,Superior California,3.178125,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.070472,62.809087,55.891012,20.208059,13.289984,0.665768,16.749021,1.957576,0.0,1.261063,59799.084409,38093910.0,109465.250814,8.014286,16.993708,1,2.928125,,,,,,,,,,2000,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2000-04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-4,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.429863,1,04_Modoc,2000_4_Modoc
310939,2000-05-01,97635,Modoc,Superior California,2.521875,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.07272,83.632101,66.278386,41.031073,23.677358,1.002663,32.354215,1.844756,0.0,1.932606,59799.084409,38093910.0,109465.250814,10.8,16.993708,1,3.178125,,,,,,,,,,2000,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2000-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-5,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.258994,1,05_Modoc,2000_5_Modoc
310940,2000-06-01,97635,Modoc,Superior California,1.759375,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.050706,97.702941,87.584068,55.101912,44.98304,0.555328,50.042476,1.64222,0.0,1.059949,59799.084409,38093910.0,109465.250814,14.933333,16.993708,1,2.521875,,,,,,,,,,2000,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2000-06,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-6,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.015004,1,06_Modoc,2000_6_Modoc
310941,2000-07-01,97635,Modoc,Superior California,3.465625,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.126993,91.838399,88.749302,49.237371,46.148274,0.655311,47.692822,1.673105,0.0,1.183629,59799.084409,38093910.0,109465.250814,19.5,16.993708,1,1.759375,2.379688,,,,0.104375,,,,,2000,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2000-07,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-7,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.496409,1,07_Modoc,2000_7_Modoc
310942,2000-08-01,97635,Modoc,Superior California,6.2375,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.111326,92.383717,82.777136,49.782689,40.176108,0.619544,44.979398,1.707361,0.0,1.127762,59799.084409,38093910.0,109465.250814,8.014286,16.993708,1,3.465625,2.85,,,,-0.087679,,,,,2000,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2000-08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-8,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.979276,1,08_Modoc,2000_8_Modoc
310943,2000-09-01,97635,Modoc,Superior California,3.0125,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.027939,128.657898,-174.687974,86.05687,142.710998,0.225482,114.383934,0.587151,0.0,0.423024,59799.084409,38093910.0,109465.250814,8.014286,16.993708,1,6.2375,3.348438,,,,0.475625,,,,,2000,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2000-09,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-9,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.389414,1,09_Modoc,2000_9_Modoc
310944,2000-10-01,97635,Modoc,Superior California,3.190625,1484.68,966.42,14.0,7.0,7.0,19.0,10.0,9.0,24.0,11.0,13.0,24.0,9.0,15.0,81.0,37.0,44.0,348.0,173.0,175.0,28.432759,163951.649076,42.601028,0.026228,69.830872,-66.829062,27.229844,109.430091,0.201628,68.329967,1.369261,0.0,0.377029,59799.084409,38093910.0,109465.250814,6.05,16.993708,1,3.0125,3.3625,2.999306,,,0.343571,0.301719,,,,2000,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2000-10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97635-2000-10,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,1.43285,1,10_Modoc,2000_10_Modoc


Make Rolling Columns for HO's

In [22]:
df = df.sort_values(['school_zip', 'year_month'])
starting_cols = list(df.columns)

num_visits_col_names = ['number_of_visits_hem_cancers', 
  'number_of_visits_vasc', 
  'number_of_visits_diab',
  'number_of_visits_resp_cardio',
  'number_of_visits_injuries'
  ]

y_col_names = ['y_hematopoietic', 
  'y_vasculitis', 
  'y_diabetes',
  'y_resp_cardio',
  'y_injuries'
  ]

# 3 months ---
n = 3 # specify number of months

for health_outcome_y_col, health_outcome_visits_col in zip(y_col_names, num_visits_col_names):
    # forward looking columns
    df = create_rolling_sum(df=df, var_name=health_outcome_visits_col, num_months=n, center_arg=False)
    df[f'{health_outcome_y_col}_fwd{n}'] = 1000 * df[f'{health_outcome_visits_col}_fwd{n}'] / df['total_pop_under19']

    # centered columns
    df = create_rolling_sum(df=df, var_name=health_outcome_visits_col, num_months=n, center_arg=True)
    df[f'{health_outcome_y_col}_cent{n}'] = 1000 * df[f'{health_outcome_visits_col}_cent{n}'] / df['total_pop_under19']


# print columns added
ending_cols = list(df.columns)
window_3months_columns = [c for c in ending_cols if c not in starting_cols]
print(f"\nColumns added for health outcomes using 3 month window:\n{window_3months_columns}")
starting_cols = list(df.columns)

# 6 months ---
n = 6 # specify number of months

for health_outcome_y_col, health_outcome_visits_col in zip(y_col_names, num_visits_col_names):
    # forward looking columns
    df = create_rolling_sum(df=df, var_name=health_outcome_visits_col, num_months=n, center_arg=False)
    df[f'{health_outcome_y_col}_fwd{n}'] = 1000 * df[f'{health_outcome_visits_col}_fwd{n}'] / df['total_pop_under19']

    # centered columns
    df = create_rolling_sum(df=df, var_name=health_outcome_visits_col, num_months=n, center_arg=True)
    df[f'{health_outcome_y_col}_cent{n}'] = 1000 * df[f'{health_outcome_visits_col}_cent{n}'] / df['total_pop_under19']


# print columns added
ending_cols = list(df.columns)
window_6months_columns = [c for c in ending_cols if c not in starting_cols]
print(f"\nColumns added for health outcomes using 6 month window:\n{window_6months_columns}")
starting_cols = list(df.columns)


# 12 months ---
n = 12

for health_outcome_y_col, health_outcome_visits_col in zip(y_col_names, num_visits_col_names):
    # forward looking columns
    df = create_rolling_sum(df=df, var_name=health_outcome_visits_col, num_months=n, center_arg=False)
    df[f'{health_outcome_y_col}_fwd{n}'] = 1000 * df[f'{health_outcome_visits_col}_fwd{n}'] / df['total_pop_under19']

    # centered columns
    df = create_rolling_sum(df=df, var_name=health_outcome_visits_col, num_months=n, center_arg=True)
    df[f'{health_outcome_y_col}_cent{n}'] = 1000 * df[f'{health_outcome_visits_col}_cent{n}'] / df['total_pop_under19']


ending_cols = list(df.columns)
window_12months_columns = [c for c in ending_cols if c not in starting_cols]
print(f"\nColumns added for health outcomes using 12 month window:\n{window_12months_columns}")
starting_cols = list(df.columns)


Columns added for health outcomes using 3 month window:
['number_of_visits_hem_cancers_fwd3', 'y_hematopoietic_fwd3', 'number_of_visits_hem_cancers_cent3', 'y_hematopoietic_cent3', 'number_of_visits_vasc_fwd3', 'y_vasculitis_fwd3', 'number_of_visits_vasc_cent3', 'y_vasculitis_cent3', 'number_of_visits_diab_fwd3', 'y_diabetes_fwd3', 'number_of_visits_diab_cent3', 'y_diabetes_cent3', 'number_of_visits_resp_cardio_fwd3', 'y_resp_cardio_fwd3', 'number_of_visits_resp_cardio_cent3', 'y_resp_cardio_cent3', 'number_of_visits_injuries_fwd3', 'y_injuries_fwd3', 'number_of_visits_injuries_cent3', 'y_injuries_cent3']

Columns added for health outcomes using 6 month window:
['number_of_visits_hem_cancers_fwd6', 'y_hematopoietic_fwd6', 'number_of_visits_hem_cancers_cent6', 'y_hematopoietic_cent6', 'number_of_visits_vasc_fwd6', 'y_vasculitis_fwd6', 'number_of_visits_vasc_cent6', 'y_vasculitis_cent6', 'number_of_visits_diab_fwd6', 'y_diabetes_fwd6', 'number_of_visits_diab_cent6', 'y_diabetes_cent6', 

Filter out pre 2002 data here, once rolling HO's are done with 2000-2002 data for features.

In [23]:
# filter data to appropriate data range
df = df[df.year >= 2002]

print(min(df.year))
print(max(df.year))

2002
2017


In [24]:
temp = df[(df.year <= 2017) & (df.year >= 2002)].isna().sum()
print(temp[temp > 0])

pm25_r1                                24
pm25_r6                               146
pm25_r9                               224
pm25_r12                              302
pm25_r24                             1065
                                    ...  
y_resp_cardio_cent12                38177
number_of_visits_injuries_fwd12     45363
y_injuries_fwd12                    45363
number_of_visits_injuries_cent12    38177
y_injuries_cent12                   38177
Length: 110, dtype: int64


### Prepare Data for Modeling

In [25]:
# sort data on date for CV splitting
df = df.sort_values('year_month').reset_index(drop=True)

In [26]:
df.head(5)

Unnamed: 0,year_month,school_zip,school_county_v2,school_region_name,pm25,school_elevation_m,ps_elevation_m,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,pop_under19_male,pop_under19_female,total_population,total_population_male,total_population_female,point_source_pm25_tpy,dist_school_to_ps_m,angle_to_school,ps_wspd_merge,school_wdir_wrt_0n,ps_wdir_wrt_0n,school_wind_alignment,ps_wind_alignment,avg_wind_speed,avg_wind_alignment,avg_wind_alignment_cosine,nearby_point_source_count,school_wspd,ca_agi_per_returns,total_tax_liability,tax_liability_per_capita,school_temperature,ps_temperature,school_count,pm25_r1,pm25_r6,pm25_r9,pm25_r12,pm25_r24,pm25_slope6,pm25_slope9,pm25_slope12,pm25_slope24,pm25_lag_12mo,year,month,school_county_v2_alameda,school_county_v2_alpine,school_county_v2_amador,school_county_v2_butte,school_county_v2_calaveras,school_county_v2_colusa,school_county_v2_contra_costa,school_county_v2_del_norte,school_county_v2_el_dorado,school_county_v2_fresno,school_county_v2_glenn,school_county_v2_humboldt,school_county_v2_imperial,school_county_v2_inyo,school_county_v2_kern,school_county_v2_kings,school_county_v2_lake,school_county_v2_lassen,school_county_v2_los_angeles,school_county_v2_madera,school_county_v2_marin,school_county_v2_mariposa,school_county_v2_mendocino,school_county_v2_merced,school_county_v2_modoc,school_county_v2_mono,school_county_v2_monterey,school_county_v2_napa,school_county_v2_nevada,school_county_v2_orange,school_county_v2_placer,school_county_v2_plumas,school_county_v2_riverside,school_county_v2_sacramento,school_county_v2_san_benito,school_county_v2_san_bernardino,school_county_v2_san_diego,school_county_v2_san_francisco,school_county_v2_san_joaquin,school_county_v2_san_luis_obispo,school_county_v2_san_mateo,school_county_v2_santa_barbara,school_county_v2_santa_clara,school_county_v2_santa_cruz,school_county_v2_shasta,school_county_v2_sierra,school_county_v2_siskiyou,school_county_v2_solano,school_county_v2_sonoma,school_county_v2_stanislaus,school_county_v2_sutter,school_county_v2_tehama,school_county_v2_trinity,school_county_v2_tulare,school_county_v2_tuolumne,school_county_v2_ventura,school_county_v2_yolo,school_county_v2_yuba,month_01,month_02,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,y-m,central_wind_alignment_180_high,ps_pm25_tpy_top_5,school_to_ps_geod_dist_m_top_5,avg_wspd_top_5,avg_u,avg_v,Izmy_v1_unnormed,Izmy_v2_nodist_unnormed,Izmy_v3_normed_D_and_TPY,Izmy_v4_nodist_normed_TPY,Izmy_v5_all_normed,schools,Izmy_v1_unnormed_r1,Izmy_v1_unnormed_r6,Izmy_v1_unnormed_r9,Izmy_v1_unnormed_r12,Izmy_v2_nodist_unnormed_r1,Izmy_v2_nodist_unnormed_r6,Izmy_v2_nodist_unnormed_r9,Izmy_v2_nodist_unnormed_r12,Izmy_v3_normed_D_and_TPY_r1,Izmy_v3_normed_D_and_TPY_r6,Izmy_v3_normed_D_and_TPY_r9,Izmy_v3_normed_D_and_TPY_r12,Izmy_v4_nodist_normed_TPY_r1,Izmy_v4_nodist_normed_TPY_r6,Izmy_v4_nodist_normed_TPY_r9,Izmy_v4_nodist_normed_TPY_r12,Izmy_v5_all_normed_r1,Izmy_v5_all_normed_r6,Izmy_v5_all_normed_r9,Izmy_v5_all_normed_r12,central_wind_alignment_180_high_r1,central_wind_alignment_180_high_r6,central_wind_alignment_180_high_r9,central_wind_alignment_180_high_r12,zip_year_month,patzip_year_month,hematopoietic_cancers,number_of_visits_hem_cancers,pediatric_vasculitis,number_of_visits_vasc,type_1_diabetes,number_of_visits_diab,resp_cardio,number_of_visits_resp_cardio,injuries_accidents,number_of_visits_injuries,y_hematopoietic,y_vasculitis,y_diabetes,y_resp_cardio,y_injuries,pm25_log,year_trend,county_month,year_month_county,number_of_visits_hem_cancers_fwd3,y_hematopoietic_fwd3,number_of_visits_hem_cancers_cent3,y_hematopoietic_cent3,number_of_visits_vasc_fwd3,y_vasculitis_fwd3,number_of_visits_vasc_cent3,y_vasculitis_cent3,number_of_visits_diab_fwd3,y_diabetes_fwd3,number_of_visits_diab_cent3,y_diabetes_cent3,number_of_visits_resp_cardio_fwd3,y_resp_cardio_fwd3,number_of_visits_resp_cardio_cent3,y_resp_cardio_cent3,number_of_visits_injuries_fwd3,y_injuries_fwd3,number_of_visits_injuries_cent3,y_injuries_cent3,number_of_visits_hem_cancers_fwd6,y_hematopoietic_fwd6,number_of_visits_hem_cancers_cent6,y_hematopoietic_cent6,number_of_visits_vasc_fwd6,y_vasculitis_fwd6,number_of_visits_vasc_cent6,y_vasculitis_cent6,number_of_visits_diab_fwd6,y_diabetes_fwd6,number_of_visits_diab_cent6,y_diabetes_cent6,number_of_visits_resp_cardio_fwd6,y_resp_cardio_fwd6,number_of_visits_resp_cardio_cent6,y_resp_cardio_cent6,number_of_visits_injuries_fwd6,y_injuries_fwd6,number_of_visits_injuries_cent6,y_injuries_cent6,number_of_visits_hem_cancers_fwd12,y_hematopoietic_fwd12,number_of_visits_hem_cancers_cent12,y_hematopoietic_cent12,number_of_visits_vasc_fwd12,y_vasculitis_fwd12,number_of_visits_vasc_cent12,y_vasculitis_cent12,number_of_visits_diab_fwd12,y_diabetes_fwd12,number_of_visits_diab_cent12,y_diabetes_cent12,number_of_visits_resp_cardio_fwd12,y_resp_cardio_fwd12,number_of_visits_resp_cardio_cent12,y_resp_cardio_cent12,number_of_visits_injuries_fwd12,y_injuries_fwd12,number_of_visits_injuries_cent12,y_injuries_cent12
0,2002-01-01,90001,Los Angeles,Los Angeles County,25.95,44.728889,43.703333,6104.0,3152.0,2952.0,6402.0,3247.0,3155.0,5548.0,2833.0,2715.0,5176.0,2644.0,2532.0,23230.0,11876.0,11354.0,55007.0,27550.0,27457.0,14.241154,3854.812685,-90.196586,0.946478,-154.458522,-154.458522,64.261936,64.261936,0.946478,64.261936,1.389537,0.0,0.946478,20914.811067,2417758.0,43.953642,13.055556,13.533333,9,24.25,23.711111,21.916667,21.534722,21.597222,2.130476,1.409722,0.674301,0.182174,28.9,2002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2002-01,98.489179,10.899675,4588.668683,1.613005,-0.156038,-0.43018,1576.746029,6524624.0,84143.666276,1938.337589,24.095761,9.0,1606.042783,950.380771,956.31766,1135.161651,6647722.0,3938219.0,3963170.0,4699318.0,85811.700922,50320.041423,50549.510465,60161.4137,1977.688381,1159.086322,1164.120554,1384.706252,24.573337,14.408932,14.474805,17.22764,100.145705,81.877448,80.910924,86.218027,90001-2002-1,,,,,,,,,,,,,,,,,3.293983,3,01_Los Angeles,2002_1_Los Angeles,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2002-01-01,95032,Santa Clara,San Francisco Bay Area,14.452941,109.63,319.928,1331.0,692.0,639.0,1696.0,857.0,839.0,1668.0,878.0,790.0,1329.0,692.0,637.0,6024.0,3119.0,2905.0,24559.0,11805.0,12754.0,4.530227,3910.947744,-7.191587,0.532895,155.848108,155.848108,152.348099,152.348099,0.532895,152.348099,0.118512,0.0,0.532895,111585.326852,76527879.0,3116.082862,8.06,6.76,5,9.323529,8.459804,8.04183,9.051961,9.070833,0.934958,0.442255,-0.281201,0.056974,20.417646,2002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2002-01,94.642239,6.942746,8797.753061,1.730511,0.072981,-0.710522,485.572354,4010015.0,24352.658575,1132.407231,6.973789,5.0,705.333851,391.254328,371.527674,424.577171,5288854.0,2857631.0,2770540.0,3196890.0,35285.58344,19187.117727,18088.600925,20845.47253,1499.007556,798.012208,768.869351,892.686346,10.105253,5.494569,5.180018,5.969593,93.468277,73.88684,71.603108,76.054355,95032-2002-1,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,2.737799,3,01_Santa Clara,2002_1_Santa Clara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2002-01-01,95030,Santa Clara,San Francisco Bay Area,10.452941,109.066,345.62,607.0,313.0,294.0,822.0,410.0,412.0,925.0,469.0,456.0,713.0,364.0,349.0,3067.0,1556.0,1511.0,13189.0,6380.0,6809.0,2.175908,4673.044299,-12.693103,0.532895,155.848108,155.848108,168.541211,168.541211,0.532895,168.541211,0.02192,0.0,0.532895,162450.242658,69275160.0,5252.495261,8.08,6.6,5,7.241177,6.97549,6.699346,7.340196,7.527941,0.641176,0.287059,-0.179515,0.028363,15.429412,2002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2002-01,95.594938,10.123391,9420.384162,1.758564,0.072981,-0.710522,695.346324,6879286.0,36413.411473,2042.141099,10.427835,5.0,889.398965,658.527066,700.295949,721.31807,8338712.0,6392849.0,6923723.0,7066090.0,46046.590518,34435.959188,36774.607985,37762.770948,2465.746907,1909.932992,2074.196269,2110.30093,13.187194,9.861528,10.531408,10.814468,94.413291,82.223435,81.100181,83.808464,95030-2002-1,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,2.438247,3,01_Santa Clara,2002_1_Santa Clara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2002-01-01,95023,San Benito,Central Coast,11.059477,114.488182,105.148182,4205.0,2186.0,2019.0,4426.0,2295.0,2131.0,4283.0,2236.0,2047.0,3836.0,2013.0,1823.0,16750.0,8730.0,8020.0,47772.0,24118.0,23654.0,46.247058,18713.14646,133.870754,0.725243,146.389028,147.364543,24.309225,25.28474,0.692416,24.796983,1.833716,0.0,0.659588,50048.317917,27042656.0,566.077535,8.137879,9.018182,22,5.178432,6.048693,5.518736,5.216285,5.974918,0.610775,0.390915,0.282593,-0.03332,4.311765,2002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2002-01,99.499927,20.212486,24098.907534,1.405112,0.0531,-0.066502,504.144795,10445670.0,27555.457643,3200.245917,7.890637,22.0,606.822759,643.073062,720.174648,690.136032,12736170.0,13511290.0,15101130.0,14472730.0,33119.272392,35065.855965,39278.150294,37644.235844,3896.008702,4128.929148,4615.708094,4424.227677,9.484581,10.041747,11.248242,10.780325,92.365976,118.553533,122.835261,117.363322,95023-2002-1,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,2.489851,3,01_San Benito,2002_1_San Benito,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2002-01-01,95020,Santa Clara,San Francisco Bay Area,12.170423,65.35625,57.51,4464.0,2220.0,2244.0,4655.0,2358.0,2297.0,4423.0,2209.0,2214.0,4098.0,2128.0,1970.0,17640.0,8915.0,8725.0,51428.0,25740.0,25688.0,5.239086,4205.539888,-74.684682,0.532895,155.848108,155.848108,129.467209,129.467209,0.532895,129.467209,0.394559,0.0,0.532895,58030.428768,39520097.0,768.454869,9.769643,9.7,16,6.153521,6.736854,6.139437,5.922418,6.595716,0.735573,0.459437,0.270802,-0.027795,5.874648,2002,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2002-01,91.535615,21.32508,17349.120006,1.485578,0.129142,-0.682608,907.765736,10751460.0,49574.600327,3310.046526,14.195592,16.0,1265.965223,830.267728,848.027963,921.749565,14706190.0,10694740.0,11204990.0,11776570.0,69007.459607,45314.853377,46336.241409,50350.947978,4517.248715,3279.080322,3436.209077,3613.691123,19.761813,12.9763,13.268981,14.418763,97.89888,89.77778,88.901001,90.610545,95020-2002-1,,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,2.577974,3,01_Santa Clara,2002_1_Santa Clara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Select variables for modeling
date_var = 'year_month' 

# stage 1 variables
instruments_cols = ['Izmy_v1_unnormed',
'Izmy_v2_nodist_unnormed',
'Izmy_v3_normed_D_and_TPY',
'Izmy_v4_nodist_normed_TPY',
'Izmy_v5_all_normed',
'central_wind_alignment_180_high']

stage_1_IVs = [s + IV_lead_input for s in instruments_cols]
stage_1_target = [target_name_s1]

# stage 2 variables
stage_2_HO_targets = [s + HO_lag_input for s in y_col_names]

num_vars = ['school_elevation_m', 'nearby_point_source_count', 'school_wspd', \
            'tax_liability_per_capita', 'school_temperature', 'school_count', 'pm25_r6', 'pm25_r12']
counties = [i for i in df.columns if re.search('^school_county_v2_', i)]
months = [i for i in df.columns if re.search ('^month_', i)]
# potentially use county_month instead of the above 

xvars = num_vars + counties + months 
yvar = ['y_hematopoietic']

# Stage 1 XGBoost


We want to predict PM2.5 using out chosen instrument and lead time. and add the column as pm

# Cross validation function



In [9]:
def time_series_cv(
  df: pd.DataFrame, 
  xvars: list, 
  yvar: str, 
  hyperparams: dict = {'max_depth': [1, 5, 10], 'subsample': [.8, 1], 'eta': [.1, .3]}, 
  search_type='grid', 
  folds=5, 
  verbose=1):

  ''' 
  Inputs:
  - df: dataframe of your training data
  - xvars: a list of all the xvars to pass to xgboost
  - yvar: string of your target variable
  - verbose: optionality for diff amounts of printouts. Can be 0, 1, 2. 0 = silent, 1 = update after each fold, 2 = update after every single hyperparam combination. 
  - hyperparams: this must be a dictionary of lists. So each key is a xgb hyperparam, then it must have a list of values to tune with. 
    See the default for an example. Can put in an arbitrary number of hyperparam options. 
  
  Output:
  - dictionary with the following keys: ['fold', 'hyperparams', 'rmse_train', 'rmse_test']. 
  '''

  # this dictionary will hold all the final results
  final_res = {'fold':[], 'hyperparams':[], 'rmse_train': [], 'rmse_test': []}

  # get only necessary fields in df
  df = df[xvars + [yvar]]

  # set up the time series split class, to do an expanding window cross fold. 
  tss = TimeSeriesSplit(n_splits=folds)
  tss_folds = tss.split(df)
  all_folds = [i for i in tss_folds]

  # get all combinations of hyperparams
  def expand_grid(hyperparams):
    keys = list(hyperparams.keys())
    hyperparams_df = pd.DataFrame(np.array(np.meshgrid(*[hyperparams[key_i] for key_i in keys])).T.reshape(-1, len(keys)))
    hyperparams_df.columns = keys 
    return hyperparams_df

  df_hyperparams = expand_grid(hyperparams)

  # function to use later
  def get_rmse(dmat_train, df_train):
    ytrue = df_train[yvar].values.flatten()
    yhat = booster.predict(dmat_train)
    rmse = np.mean(((ytrue - yhat)**2)**.5)
    return rmse 

  # loop over each expanding time series window
  for fold_count,fold in enumerate(all_folds):
    if verbose > 0:
      print('Working on fold {}/{}'.format(fold_count+1, folds))

    df_train = df.loc[fold[0]]
    df_test = df.loc[fold[1]]

    # convert to xgb types
    dmat_train = xgb.DMatrix(df_train[xvars], df_train[yvar])
    dmat_test = xgb.DMatrix(df_test[xvars], df_test[yvar])

    # within each time series cross fold, perform a grid search with all hyperparam combinations and evaluate results. 
    if search_type == 'grid':
      for param_set_i in range(df_hyperparams.shape[0]):
        hyperparams_i = {x:y for x,y in zip(df_hyperparams.columns, df_hyperparams.loc[param_set_i].to_list())}
        
        # fix datatype for some vars
        if 'max_depth' in hyperparams_i.keys():
          hyperparams_i['max_depth'] = int(hyperparams_i['max_depth'])

        # fit xgb
        booster = xgb.train(
          hyperparams_i,
          dmat_train,
          num_boost_round=100, 
          early_stopping_rounds=15,
          evals = [(dmat_train, 'train'), (dmat_test, 'test')], 
          verbose_eval=False)
        
        # save results
        rmse_train = get_rmse(dmat_train, df_train)
        rmse_test = get_rmse(dmat_test, df_test)
        final_res['fold'].append(fold_count)
        final_res['hyperparams'].append(hyperparams_i)
        final_res['rmse_train'].append(rmse_train)
        final_res['rmse_test'].append(rmse_test)

        if verbose == 2:
          print('{}: rmse train: {:.3f}, rmse test: {:.3f}'.format(hyperparams_i, rmse_train, rmse_test))

    elif search_type == 'random': 
      pass 
      # haven't done this yet
  
  # print out final best hyperparams before returning the output
  output2 = pd.DataFrame({
    'hyperparams': final_res['hyperparams'],
    'fold': final_res['fold'],
    'rmse_train': final_res['rmse_train'],
    'rmse_test': final_res['rmse_test']
  })
  output2['hyperparams'] = output2['hyperparams'].astype(str)
  output2 = output2.groupby('hyperparams')[['rmse_train', 'rmse_test']].mean().reset_index().sort_values('rmse_test')
  print('best hyperparams: {}'.format(output2.iloc[0,0]))

  
  return final_res
  

In [10]:
output = time_series_cv(df, xvars = num_vars + counties + months, yvar = 'y_hematopoietic', 
  hyperparams = {'max_depth': [1, 5], 'subsample': [.8, 1], 'eta': [.1, .3], 'lambda': [1, .8]}, 
  search_type = 'grid', 
  folds = 5, 
  verbose=1)

Working on fold 1/5
Working on fold 2/5
Working on fold 3/5
Working on fold 4/5
Working on fold 5/5
best hyperparams: {'max_depth': 1, 'subsample': 1.0, 'eta': 0.3, 'lambda': 1.0}


## Optional

Organize the results manually. But I put this in the function to spit the best result at the end anyways. 

But this shows how you can manipulate and inspect the results.

In [14]:
output2 = pd.DataFrame({
  'hyperparams': output['hyperparams'],
  'fold': output['fold'],
  'rmse_train': output['rmse_train'],
  'rmse_test': output['rmse_test']
})
output2

Unnamed: 0,hyperparams,fold,rmse_train,rmse_test
0,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.1,...",0,0.098889,0.131185
1,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.1,...",0,0.096023,0.128173
2,"{'max_depth': 5, 'subsample': 0.8, 'eta': 0.1,...",0,0.124585,0.174960
3,"{'max_depth': 5, 'subsample': 1.0, 'eta': 0.1,...",0,0.148360,0.198804
4,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.3,...",0,0.090540,0.122365
...,...,...,...,...
75,"{'max_depth': 5, 'subsample': 1.0, 'eta': 0.1,...",4,0.255362,0.403228
76,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.3,...",4,0.254656,0.389784
77,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.3,...",4,0.254884,0.387370
78,"{'max_depth': 5, 'subsample': 0.8, 'eta': 0.3,...",4,0.238637,0.399839


In [15]:
output2['hyperparams'] = output2['hyperparams'].astype(str)
output_grp = output2.groupby('hyperparams')[['rmse_train', 'rmse_test']].mean().reset_index().sort_values('rmse_test')
output_grp

Unnamed: 0,hyperparams,rmse_train,rmse_test
7,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.3,...",0.16919,0.25567
6,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.3,...",0.16919,0.255671
3,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.3,...",0.168776,0.256342
2,"{'max_depth': 1, 'subsample': 0.8, 'eta': 0.3,...",0.168777,0.256342
11,"{'max_depth': 5, 'subsample': 0.8, 'eta': 0.3,...",0.152497,0.267273
10,"{'max_depth': 5, 'subsample': 0.8, 'eta': 0.3,...",0.151557,0.2676
15,"{'max_depth': 5, 'subsample': 1.0, 'eta': 0.3,...",0.154371,0.269291
14,"{'max_depth': 5, 'subsample': 1.0, 'eta': 0.3,...",0.154071,0.270412
5,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.1,...",0.185906,0.271774
4,"{'max_depth': 1, 'subsample': 1.0, 'eta': 0.1,...",0.185906,0.271774


In [16]:
print('best hyperparams: {}'.format(output_grp.iloc[0,0]))

best hyperparams: {'max_depth': 1, 'subsample': 1.0, 'eta': 0.3, 'lambda': 1.0}
