# Modeling Data Clean:

This script does the following:
- Loads our main joined dataset 
- Creates features
  - Compute rolling averages of prior pm2.5 levels
  - One-hot encode categorical features: school_region, month
  - Get year from date
- Saves the dataset
- This final dataset should be used in modeling. We should be able to load Cornelia's data and join it to this final dataset, so that Cornelia has a script she can run on her end. 
  

In [4]:
# optional. I'm getting annoying warnings that I just want to ignore:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# basics
import pandas as pd 
import numpy as np
import os 
import re
from datetime import datetime
from tqdm.notebook import tqdm
tqdm.pandas()
import requests
import urllib

# plotting
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import plotly.express as px
import seaborn as sns

# modeling
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from statsmodels.sandbox.regression.gmm import IV2SLS

# show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
# turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook
pd.options.mode.chained_assignment = None

In [5]:
# where the shared Data folder lives: 'gdrive' (Google Colab), 'local' or 'work'
path_source = 'local'

if path_source == 'gdrive':
  # google.colab only exists inside Colab, so the import stays inside this branch
  from google.colab import drive
  drive.mount('/content/gdrive')
  data_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/Data'
  #env_path = '/content/gdrive/MyDrive/.env'

elif path_source == 'local':
  data_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'
  #env_path = '/content/gdrive/MyDrive/.env'

elif path_source == 'work':
  data_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'

else:
  # fail right here instead of with a NameError on data_path in a later cell
  raise ValueError(
    f"unknown path_source {path_source!r}; expected 'gdrive', 'local' or 'work'")

# Read in Data

In [6]:
# load the full joined dataset (a parquet directory under the shared Data folder)
joined_data_dir = os.path.join(data_path, 'joined_data/joined_data_with_temperatures_10-16-22/')
df = pd.read_parquet(joined_data_dir)

# Clean Main Data

In [7]:
print(f'starting row count: {df.shape[0]}')

# clean up column names. Make them all lower cased, and replace spaces with "_"
# Raw string + explicit regex=True: without it pandas >= 2.0 treats the pattern
# as a literal string and the column names would be left untouched.
df.columns = df.columns.str.replace(r"\.*\s+", "_", regex=True).str.lower()

# filter out na pm2.5 values (133,261 of them)
df = df[df['pm25'].notna()]

# 1,584 records are missing population in the year 2000. B/c the 2000 data had less zip codes.
# Filter them out since we aren't using 2000 anyways. 
df = df[df['population_10_14'].notna()]

# 3 zips have 0 population, just filter those out
df = df[df['total_population'] != 0]
# do the same for populations under 19
df = df[df['total_pop_under19'] != 0]

# fix some datatypes
# ('angle_to_school' used to be listed twice; each column only needs converting once)
num_vars = ['angle_to_school', 'ps_elevation_m', 'pm25', 'point_source_pm25_tpy', 
            'dist_school_to_ps_m', 'avg_wind_alignment_cosine',
            'total_population', 'total_population_male', 'total_population_female', 
            'population_0_4', 'population_0_4_male', 'population_0_4_female',
            'population_5_9', 'population_5_9_male', 'population_5_9_female',
            'population_10_14', 'population_10_14_male', 'population_10_14_female',
            'population_15_19', 'population_15_19_male', 'population_15_19_female',
            'total_pop_under19', 'point_source_lat', 'point_source_lon', 
            'school_elevation_m', 'pop_under19_male', 'pop_under19_female', 'ps_wind_lat', 'ps_wind_lon', 'ps_wspd_merge', 
            'school_wdir_wrt_0n', 'ps_wdir_wrt_0n', 'school_wind_alignment', 'ps_wind_alignment', 'avg_wind_speed', 
            'avg_wind_alignment', 'nearby_point_source_count', 'school_wspd',
            'ca_agi_per_returns', 'total_tax_liability']

print('Converting numeric vars')
for var in tqdm(num_vars):
  df[var] = df[var].astype(float)

print('converting int vars')
int_vars = ['school_zip']
for var in int_vars:
  df[var] = df[var].astype(int)

# convert to date time (vectorised; same '%Y-%m-%d' format as before):
df['year_month'] = pd.to_datetime(df['year_month'], format='%Y-%m-%d')

print('Mean imputing a few vars')
mean_impute_vars = ['ca_agi_per_returns', 'total_tax_liability']
for var in tqdm(mean_impute_vars):
  df[var] = df[var].fillna(df[var].mean())

# get tax per capita. Rows with total_population == 0 were dropped above, so this
# cannot divide by zero; a NaN population would still give a NaN here.
df['tax_liability_per_capita'] = df['total_tax_liability'] / df['total_population']

# don't need these fields:
df = df.drop(columns=['index'])

# change "san diego - imperial" to just "san diego"
# the "-" symbol was causing issues down the line with modeling
df['school_region_name'] = df['school_region_name'].str.replace(
  'San Diego - Imperial', 'San Diego', regex=False)

print(f'ending row count: {df.shape[0]}')

starting row count: 2471552
Converting numeric vars


  0%|          | 0/41 [00:00<?, ?it/s]

converting int vars
Mean imputing a few vars


  0%|          | 0/2 [00:00<?, ?it/s]

ending row count: 2336707


# Group the Data

In [8]:
# pre-grouping checks: count the distinct zip-months ...
zip_month_keys = ['year_month', 'school_zip']
n = len(df[zip_month_keys].drop_duplicates())
print(f'count of distinct year-mo-zips: {n}')

# ... and count again with the other zip-level fields (including the population
# counts) added. If both counts agree, those fields are constant within a zip-month.
all_join_keys = zip_month_keys + [
  'school_county_v2', 'school_region_name',
  'pop_under19_male', 'pop_under19_female', 'total_pop_under19', 'pm25',
  'ca_agi_per_returns', 'total_tax_liability',
]
n = len(df[all_join_keys].drop_duplicates())
print(f'count of distinct for all join vars: {n}')

count of distinct year-mo-zips: 312345
count of distinct for all join vars: 312345


In [9]:
# Collapse the data to one row per group of grp_vars.
# maybe not all of these should use 'mean', but doing it this way for now.
# lat/lon are left out on purpose (probs dont need these):
#   'point_source_lat', 'point_source_lon', 'school_lat', 'school_lon', 'ps_wind_lat', 'ps_wind_lon'

# elevation
elevation_vars = ['school_elevation_m', 'ps_elevation_m']

# pop: every age band with its male / female split, then the totals
population_vars = [
  f'population_{band}{sex}'
  for band in ('0_4', '5_9', '10_14', '15_19')
  for sex in ('', '_male', '_female')
] + [
  'total_pop_under19', 'pop_under19_male', 'pop_under19_female',
  'total_population', 'total_population_male', 'total_population_female',
]

# pm2.5 vars
point_source_vars = ['point_source_pm25_tpy']

# distance/wind/angles
wind_vars = [
  'dist_school_to_ps_m', 'angle_to_school', 'ps_wspd_merge',
  'school_wdir_wrt_0n', 'ps_wdir_wrt_0n',
  'school_wind_alignment', 'ps_wind_alignment', 'avg_wind_speed',
  'avg_wind_alignment', 'avg_wind_alignment_cosine',
  'nearby_point_source_count', 'school_wspd',
]

# tax
tax_vars = ['ca_agi_per_returns', 'total_tax_liability', 'tax_liability_per_capita']

# temp
temperature_vars = ['school_temperature', 'ps_temperature']

mean_vars = (elevation_vars + population_vars + point_source_vars
             + wind_vars + tax_vars + temperature_vars)
count_vars = ['cdscode']

# named aggregations: output column -> (input column, aggregation)
mean_dict = {name: (name, 'mean') for name in mean_vars}
count_dict = {name: (name, 'count') for name in count_vars}
agg_dict = dict(mean_dict)
agg_dict.update(count_dict)

grp_vars = ['year_month', 'school_zip', 'school_county_v2', 'school_region_name', 'pm25']

# the count of cdscode values per group is exposed as school_count
df_grp = (
  df.groupby(grp_vars)
    .agg(**agg_dict)
    .reset_index()
    .rename(columns={'cdscode': 'school_count'})
)

print(f'Num rows of grouped df: {df_grp.shape[0]}')

Num rows of grouped df: 312345


# Create rolling avg vars
- rolling avg of pm2.5 for prior n months
- get the slope of the best-fit regression line of pm2.5 levels over the prior n months

In [10]:
# frame for the rolling features: just the keys and pm25, ordered by date within
# each zip so that row-based windows walk backwards in time
rolling_cols = ['year_month', 'school_zip', 'pm25']
df_avgs = df_grp.loc[:, rolling_cols].sort_values(by=['school_zip', 'year_month'])

# get rolling n month avg
def create_rolling_avg(df, num_months=6):
  """Add the mean pm25 of the previous ``num_months`` rows of each school_zip.

  The result is stored in a new column named ``pm25_r{num_months}``. The window
  covers the ``num_months`` rows directly before the current row within the same
  ``school_zip`` (``closed='left'`` leaves the current row out); rows with fewer
  than ``num_months`` earlier rows get NaN. Windows are counted in rows, not
  calendar months, so ``df`` has to be sorted by date within each zip already.

  Args:
    df: DataFrame with ``school_zip`` and ``pm25`` columns.
    num_months: window length in rows.

  Returns:
    The same ``df`` object, with the new column added in place.
  """
  def prior_window_mean(pm25):
    return pm25.rolling(window=num_months, min_periods=num_months, closed='left').mean()

  # transform() returns a Series aligned to df's own index. groupby.apply() puts
  # the group key in front of the index on pandas 2.0+, which breaks the assignment.
  df[f'pm25_r{num_months}'] = df.groupby('school_zip')['pm25'].transform(prior_window_mean)

  return df


# trailing pm2.5 averages over the prior 1, 6, 9, 12 and 24 rows (months) per zip
for window_months in (1, 6, 9, 12, 24):
  df_avgs = create_rolling_avg(df_avgs, window_months)

# count num obs over past n months (don't need this, but keeping it commented just in case)
# df_avgs['pm25_6mo_count'] = df_avgs.groupby('school_zip')['pm25'].apply(lambda x: x.rolling(6, 1, closed='left').apply(lambda x: len(x)))
# df_avgs['pm25_9mo_count'] = df_avgs.groupby('school_zip')['pm25'].apply(lambda x: x.rolling(9, 1, closed='left').apply(lambda x: len(x)))
# df_avgs['pm25_12mo_count'] = df_avgs.groupby('school_zip')['pm25'].apply(lambda x: x.rolling(12, 1, closed='left').apply(lambda x: len(x)))

# the 1-row "average" is just the previous month's pm25, so name it that way
last_month_name = {'pm25_r1': 'pm25_last_month'}
df_avgs = df_avgs.rename(columns=last_month_name)

Find trend
- This feature will be highly correlated with the `pm25_last_month` feature. Discuss with Cornelia whether multicollinearity is an issue here. 

In [11]:
def get_slope_pm25_per_month(df, num_months=6):
    """Add the pm25 trend over the previous ``num_months`` rows of each school_zip.

    The trend is the ordinary-least-squares slope of pm25 against the row
    position 0..num_months-1 inside the window, i.e. the change in pm25 per row.
    It is stored in a new column named ``pm25_slope{num_months}``. The window
    covers the ``num_months`` rows directly before the current row within the
    same ``school_zip`` (``closed='left'`` leaves the current row out); rows
    without a full window get NaN. Windows are counted in rows, so ``df`` has to
    be sorted by date within each zip already.

    Args:
        df: DataFrame with ``school_zip`` and ``pm25`` columns.
        num_months: window length in rows. A slope needs at least two points,
            so ``num_months=1`` yields NaN everywhere.

    Returns:
        The same ``df`` object, with the new column added in place.
    """
    # min_periods == window, so calc_slope only ever sees full windows: x is
    # always 0..num_months-1 and can be centred once instead of once per row.
    x_centered = np.arange(num_months, dtype=float) - (num_months - 1) / 2.0
    x_sq_sum = float(x_centered @ x_centered)

    def calc_slope(y):
        # closed-form OLS slope: sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x)^2)
        if x_sq_sum == 0.0:
            return np.nan
        return float(x_centered @ (y - y.mean())) / x_sq_sum

    def prior_window_slope(pm25):
        # raw=True passes each window as a plain ndarray, which is all calc_slope needs
        return pm25.rolling(window=num_months, min_periods=num_months, closed='left')\
            .apply(calc_slope, raw=True)

    # transform() keeps df's own index. groupby.apply() puts the group key in
    # front of the index on pandas 2.0+, which breaks the assignment.
    df[f'pm25_slope{num_months}'] = df.groupby('school_zip')['pm25'].transform(prior_window_slope)

    return df

In [12]:
# pm2.5 trend over the prior 6, 9, 12 and 24 rows (months) per zip
for slope_window in (6, 9, 12, 24):
  df_avgs = get_slope_pm25_per_month(df_avgs, slope_window)

In [13]:
# pm2.5 value from 12 rows (months) earlier within the same zip
pm25_by_zip = df_avgs.groupby('school_zip')['pm25']
df_avgs['pm25_lag_12mo'] = pm25_by_zip.shift(periods=12)

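Only 5 month-to-month gaps longer than one month show up across all zip codes (the data here is grouped by school zip, not by school), so not too much to worry about. Note that the rolling and lag features above are row-based, so they span these gaps.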

In [14]:
# time between consecutive observations within each zip
# (year_month needs to be date/num type for the difference to work)
# groupby(...).diff() keeps df_avgs' own index; the apply(lambda x: x.diff())
# form gets the group key prepended to its index on pandas 2.0+.
df_avgs['num_days'] = df_avgs.groupby('school_zip')['year_month'].diff()
df_avgs.value_counts('num_days')

num_days
31 days      180818
30 days      104104
28 days       19214
29 days        6816
92 days           1
458 days          1
761 days          1
1188 days         1
1553 days         1
dtype: int64

Join the rolling averages back to main dataset

In [15]:
# pm25 is already a column of df_grp and num_days was only for the gap check
# above, so drop both before attaching the new features to the grouped data
helper_cols = ['pm25', 'num_days']
df_avgs = df_avgs.drop(helper_cols, axis=1)
# df_avgs = df_avgs.drop(columns=['pm25'])
df_grp = df_grp.merge(df_avgs, how='left', on=['school_zip', 'year_month'])

## One-hot encode categorical features
- School region
- Month

In [16]:
year_month = df_grp['year_month']

# calendar year as a number
df_grp['year'] = year_month.dt.year

# month as a zero-padded two-character string ('01'..'12') so it can be one-hot encoded
df_grp['month'] = year_month.dt.month.map('{:02d}'.format)

In [17]:
encode_cols = ['school_region_name', 'month']

# year_month and school_zip ride along untouched as join keys; only encode_cols
# are expanded into dummy columns
df_one_hot = pd.get_dummies(df_grp[['year_month', 'school_zip'] + encode_cols], columns=encode_cols)
# Raw string + explicit regex=True: without it pandas >= 2.0 treats the pattern
# as a literal string and spaces in the region names would stay in the column names.
df_one_hot.columns = df_one_hot.columns.str.replace(r"\.*\s+", "_", regex=True).str.lower()

In [18]:
encode_cols = ['school_region_name', 'month']

# year_month and school_zip ride along untouched as join keys; only encode_cols
# are expanded into dummy columns
df_one_hot = pd.get_dummies(df_grp[['year_month', 'school_zip'] + encode_cols], columns=encode_cols)
# Raw string + explicit regex=True: without it pandas >= 2.0 treats the pattern
# as a literal string and spaces in the region names would stay in the column names.
df_one_hot.columns = df_one_hot.columns.str.replace(r"\.*\s+", "_", regex=True).str.lower()

# the original categorical columns are kept next to their dummies
#df_grp = pd.merge(df_grp.drop(columns=encode_cols), df_one_hot, on=['year_month', 'school_zip'], how='left')
df_grp = pd.merge(df_grp, df_one_hot, on=['year_month', 'school_zip'], how='left')

In [19]:
# final row count after the merges
len(df_grp.index)

312345

# Export Results

In [20]:
# write the modeling dataset (without the row index) to the shared Data folder
modeling_csv_path = os.path.join(data_path, 'modeling_data/modeling_data_2022-10-16.csv')
df_grp.to_csv(modeling_csv_path, index=False)