## This notebook has two major sections. The first loads in the large, combined dataset and outputs monthly files.

## The second loads in these monthly files and performs instrument calculations and aggregations.

# Begin Splitting

In [2]:
import findspark
findspark.init()

In [3]:
import pyspark
import pandas as pd 
import numpy as np
import os 
import datetime
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from math import radians, cos, sin, asin, sqrt
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number, round, substring, count, when, isnan, min, max, avg, stddev_samp, abs, sum, count
from pyspark.ml.feature import MinMaxScaler, StandardScaler, VectorAssembler
from pyspark.ml import Pipeline

from datetime import date, timedelta

import datetime

from pyspark_dist_explore import hist
import matplotlib.pyplot as plt

from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

pd.set_option('display.max_columns', None)

In [4]:
spark = SparkSession.builder.master("local[*]").config("spark.executor.memory", "80g").config("spark.driver.memory", "80g").config("spark.driver.maxResultSize","0").getOrCreate()

In [5]:
gdrive_path = 'I:\\.shortcut-targets-by-id\\11wLy1WKwOTcthBs1rpfEzkqax2BZG-6E\\W210_Capstone\\Data\\'
local_path = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\'

In [65]:
wind_grid_points = spark.read.option("header",True).csv(os.path.join(local_path,'all_wind_measurements.csv'))

In [66]:
wind_grid_points = wind_grid_points.withColumnRenamed('lat','wind_lat').withColumnRenamed('lon','wind_lon')

In [67]:
used_grid_points = spark.read.option("header",True).csv(os.path.join(local_path,'all_used_wind_sites.csv'))

In [68]:
used_grid_points.limit(5).show()

+----------+---------+-----------+
|grid_index| wind_lat|   wind_lon|
+----------+---------+-----------+
|       443|37.779999|-122.160004|
|       722|34.529999|-120.410004|
|       631|38.279999|-120.910004|
|       589|39.279999|-121.160004|
|      1105|33.779999|-117.910004|
+----------+---------+-----------+



In [69]:
used_grid_points.printSchema()

root
 |-- grid_index: string (nullable = true)
 |-- wind_lat: string (nullable = true)
 |-- wind_lon: string (nullable = true)



In [70]:
used_grid_points = used_grid_points.withColumn("wind_lat",used_grid_points.wind_lat.cast('double'))
used_grid_points = used_grid_points.withColumn("wind_lon",used_grid_points.wind_lon.cast('double'))

In [71]:
wind_grid_points = wind_grid_points.withColumn("wind_lat",wind_grid_points.wind_lat.cast('double'))
wind_grid_points = wind_grid_points.withColumn("wind_lon",wind_grid_points.wind_lon.cast('double'))
wind_grid_points = wind_grid_points.withColumn("u",wind_grid_points.u.cast('double'))
wind_grid_points = wind_grid_points.withColumn("v",wind_grid_points.v.cast('double'))
wind_grid_points = wind_grid_points.withColumn("wdir",wind_grid_points.wdir.cast('double'))
wind_grid_points = wind_grid_points.withColumn("wspd",wind_grid_points.wspd.cast('double'))
wind_grid_points = wind_grid_points.drop('_c0')

In [72]:
wind_grid_points = wind_grid_points.withColumn("wind_lat",round(col('wind_lat'),6))
wind_grid_points = wind_grid_points.withColumn("wind_lon",round(col('wind_lon'),6))
wind_grid_points = wind_grid_points.withColumn("u",round(col('u'),6))
wind_grid_points = wind_grid_points.withColumn("v",round(col('v'),6))
wind_grid_points = wind_grid_points.withColumn("wdir",round(col('wdir'),6))
wind_grid_points = wind_grid_points.withColumn("wspd",round(col('wspd'),6))

In [73]:
wind_grid_points.limit(5).show()

+---------+-----------+--------+---------+----------+--------+-------------------+
| wind_lat|   wind_lon|       u|        v|      wdir|    wspd|           Datetime|
+---------+-----------+--------+---------+----------+--------+-------------------+
|42.279999|-124.410004|1.316132| -4.17089|287.513185|4.373617|2001-01-01 00:00:00|
|42.029999|-124.410004|1.720276|-4.124691|292.639445|4.469052|2001-01-01 00:00:00|
|41.779999|-124.410004|2.337209|-4.626282| 296.80302|5.183149|2001-01-01 00:00:00|
|41.529999|-124.410004|2.451185|-5.043875|295.918485|5.607939|2001-01-01 00:00:00|
|41.279999|-124.410004| 2.09671|-5.050475|292.545832|5.468408|2001-01-01 00:00:00|
+---------+-----------+--------+---------+----------+--------+-------------------+



In [74]:
wind_grid_points.printSchema()

root
 |-- wind_lat: double (nullable = true)
 |-- wind_lon: double (nullable = true)
 |-- u: double (nullable = true)
 |-- v: double (nullable = true)
 |-- wdir: double (nullable = true)
 |-- wspd: double (nullable = true)
 |-- Datetime: string (nullable = true)



## Use inner join to filter out unused points

In [75]:
used_wind_observations = wind_grid_points.join(used_grid_points, ['wind_lat','wind_lon'], how='inner')

In [76]:
used_wind_observations.count()

71825712

### Pre-compute min-max for scaled version of instrument below

In [77]:
min_wspd = used_wind_observations.select(min('wspd')).collect()
max_wspd = used_wind_observations.select(max('wspd')).collect()

print("Max wspd is ",max_wspd,"; min wspd is",min_wspd,".",sep="")

Max wspd is [Row(max(wspd)=19.395623)]; min wspd is[Row(min(wspd)=0.000415)].


In [78]:
min_wspd = min_wspd[0][0]
max_wspd = max_wspd[0][0]

In [79]:
print("Max wspd is ",max_wspd,"; min wspd is ",min_wspd,".",sep="")

Max wspd is 19.395623; min wspd is 0.000415.


   ## Save off subset

In [26]:
used_wind_observations.write.parquet(os.path.join(local_path,'wind_subset'))

## Start splitting out --

In [57]:
# create by-month data structure

month_bins_pd = pd.date_range(start='2000-12-01',end='2017-12-01',freq='m')

month_bins = []

for month in month_bins_pd:
    month_bins.append(datetime.datetime.strftime(month+timedelta(days=1), "%Y-%m"))

print(month_bins)

['2001-01', '2001-02', '2001-03', '2001-04', '2001-05', '2001-06', '2001-07', '2001-08', '2001-09', '2001-10', '2001-11', '2001-12', '2002-01', '2002-02', '2002-03', '2002-04', '2002-05', '2002-06', '2002-07', '2002-08', '2002-09', '2002-10', '2002-11', '2002-12', '2003-01', '2003-02', '2003-03', '2003-04', '2003-05', '2003-06', '2003-07', '2003-08', '2003-09', '2003-10', '2003-11', '2003-12', '2004-01', '2004-02', '2004-03', '2004-04', '2004-05', '2004-06', '2004-07', '2004-08', '2004-09', '2004-10', '2004-11', '2004-12', '2005-01', '2005-02', '2005-03', '2005-04', '2005-05', '2005-06', '2005-07', '2005-08', '2005-09', '2005-10', '2005-11', '2005-12', '2006-01', '2006-02', '2006-03', '2006-04', '2006-05', '2006-06', '2006-07', '2006-08', '2006-09', '2006-10', '2006-11', '2006-12', '2007-01', '2007-02', '2007-03', '2007-04', '2007-05', '2007-06', '2007-07', '2007-08', '2007-09', '2007-10', '2007-11', '2007-12', '2008-01', '2008-02', '2008-03', '2008-04', '2008-05', '2008-06', '2008-07'

In [60]:
# create compare column

used_wind_observations = used_wind_observations.withColumn('y-m', substring('Datetime', 1,7))

In [61]:
used_wind_observations.limit(25).show()

+---------+-----------+---------+---------+----------+--------+-------------------+----------+-------+
| wind_lat|   wind_lon|        u|        v|      wdir|    wspd|           Datetime|grid_index|    y-m|
+---------+-----------+---------+---------+----------+--------+-------------------+----------+-------+
|42.029999|-124.160004| 0.170099| -1.91193|275.084065|1.919482|2001-01-01 00:00:00|        44|2001-01|
|41.779999|-124.160004| 0.610318|-2.619918|283.113362|2.690066|2001-01-01 00:00:00|       123|2001-01|
|41.529999|-124.160004| 0.897872|-3.260106|285.398186|3.381489|2001-01-01 00:00:00|       124|2001-01|
|41.279999|-124.160004| 0.849772|-3.431103| 283.91038|3.534768|2001-01-01 00:00:00|       125|2001-01|
|41.029999|-124.160004| 0.826245|-3.327305|283.945749|3.428358|2001-01-01 00:00:00|       126|2001-01|
|40.779999|-124.160004| 0.781805|-3.174908|283.833566|3.269749|2001-01-01 00:00:00|       127|2001-01|
|40.529999|-124.160004| 0.707041|-2.862313|283.875283|2.948346|2001-01-01

In [75]:
# Check for nulls

used_wind_observations.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in used_wind_observations.columns]).show()

+--------+--------+---+---+----+----+--------+----------+---+
|wind_lat|wind_lon|  u|  v|wdir|wspd|Datetime|grid_index|y-m|
+--------+--------+---+---+----+----+--------+----------+---+
|       0|       0|  0|  0|   0|   0|       0|         0|  0|
+--------+--------+---+---+----+----+--------+----------+---+



In [84]:
# Check for large values

used_wind_observations.select([count(when((col(c).cast('integer') > 360), c)).alias(c) for c in used_wind_observations.columns]).show()

+--------+--------+---+---+----+----+--------+----------+---+
|wind_lat|wind_lon|  u|  v|wdir|wspd|Datetime|grid_index|y-m|
+--------+--------+---+---+----+----+--------+----------+---+
|       0|       0|  0|  0|   0|   0|       0|  62139672|  0|
+--------+--------+---+---+----+----+--------+----------+---+



In [91]:
# Check for zeros (are all of these valid??)

used_wind_observations.select([count(when((col(c).cast('float') == 0), c)).alias(c) for c in used_wind_observations.columns]).show()

+--------+--------+---+---+----+----+--------+----------+---+
|wind_lat|wind_lon|  u|  v|wdir|wspd|Datetime|grid_index|y-m|
+--------+--------+---+---+----+----+--------+----------+---+
|       0|       0|  0|  0|   0|   0|       0|         0|  0|
+--------+--------+---+---+----+----+--------+----------+---+



In [92]:
print("U columns with 0 values:",used_wind_observations.filter(col('u') == 0).count())
print("V columns with 0 values:",used_wind_observations.filter(col('v') == 0).count())

U columns with 0 values: 0
V columns with 0 values: 0


In [90]:
# Check for u and v being 0

used_wind_observations.filter((col('u') + col('v')) == 0).count()

14

In [64]:
# create and write out dataframe for each month

for ym in month_bins:
    
    print("Now working on",ym,"subset.")
    
    file_name = os.path.join(local_path,'wind_subset_by_month\\', ym)
    
    temp_df = used_wind_observations.filter(col("y-m") == ym)
    
    print("Total observations for ",ym,": ",temp_df.count(),".",sep="")
    print("Now writing")
    
    temp_df.write.parquet(file_name)

Now working on 2001-01 subset.
Total observations for 2001-01: 358608.
Now writing
Now working on 2001-02 subset.
Total observations for 2001-02: 323904.
Now writing
Now working on 2001-03 subset.
Total observations for 2001-03: 358608.
Now writing
Now working on 2001-04 subset.
Total observations for 2001-04: 347040.
Now writing
Now working on 2001-05 subset.
Total observations for 2001-05: 358608.
Now writing
Now working on 2001-06 subset.
Total observations for 2001-06: 347040.
Now writing
Now working on 2001-07 subset.
Total observations for 2001-07: 358608.
Now writing
Now working on 2001-08 subset.
Total observations for 2001-08: 358608.
Now writing
Now working on 2001-09 subset.
Total observations for 2001-09: 347040.
Now writing
Now working on 2001-10 subset.
Total observations for 2001-10: 358608.
Now writing
Now working on 2001-11 subset.
Total observations for 2001-11: 347040.
Now writing
Now working on 2001-12 subset.
Total observations for 2001-12: 358608.
Now writing
Now 

Now working on 2009-04 subset.
Total observations for 2009-04: 347040.
Now writing
Now working on 2009-05 subset.
Total observations for 2009-05: 358608.
Now writing
Now working on 2009-06 subset.
Total observations for 2009-06: 347040.
Now writing
Now working on 2009-07 subset.
Total observations for 2009-07: 358608.
Now writing
Now working on 2009-08 subset.
Total observations for 2009-08: 358608.
Now writing
Now working on 2009-09 subset.
Total observations for 2009-09: 347040.
Now writing
Now working on 2009-10 subset.
Total observations for 2009-10: 358608.
Now writing
Now working on 2009-11 subset.
Total observations for 2009-11: 347040.
Now writing
Now working on 2009-12 subset.
Total observations for 2009-12: 358608.
Now writing
Now working on 2010-01 subset.
Total observations for 2010-01: 358608.
Now writing
Now working on 2010-02 subset.
Total observations for 2010-02: 323904.
Now writing
Now working on 2010-03 subset.
Total observations for 2010-03: 358608.
Now writing
Now 

Now working on 2017-07 subset.
Total observations for 2017-07: 358608.
Now writing
Now working on 2017-08 subset.
Total observations for 2017-08: 358608.
Now writing
Now working on 2017-09 subset.
Total observations for 2017-09: 347040.
Now writing
Now working on 2017-10 subset.
Total observations for 2017-10: 358608.
Now writing
Now working on 2017-11 subset.
Total observations for 2017-11: 347040.
Now writing
Now working on 2017-12 subset.
Total observations for 2017-12: 358608.
Now writing


# End Splitting

___________________


# Begin Calculations/Aggregation Step-through
## If you want to just run this, go down to Begin Calculation/Aggregation Loop

## Load in lookup tables and data sources

## Quick overview

### Pre-compute scalars for distance and TPY norming:
#### Load in `school_year_to_point_lookup_top_5_filtered`, select point_source_index, point_source_pm25_tpy,school_to_ps_geod_dist_m
#### Calculate avg, stddev_samp, max, min (AFTER standard scaling)
#### Save as scalars (outside of loop)

### Make list to hold Pandas dataframes of aggregated instruments

## Start loop

### initialize empty Pandas dataframe

### Load in month of wind data
### Temporarily store backup of wind readings for self-joining (remove lat/lon/y-m) (temp_wind_readings_df)
### Temporarily store simple averages of wdir/wspd per zip code (to Pandas?) (temp_df_avgs_by_zip)

--- wind_temp_df

## Perform joins:

### First join: inner: wind points to schools from pre-computed lookup (school_lookup)

---compute zip code avgs, save off

--add column for current year
join year lookup for ps
drop column for current year

### Second itty-bitty join: measurement year to ps_lookup year to avoid duplicates

### third join: left: schools to top five point sources from pre-computed lookup (school_to_ps_lookup)
--join on CDSCode and lookup_year

### fourth join: left: point sources to associated wind grid points from pre-computed lookup (ps_lookup)
### Fifth join: left: point source wind grid indices to wind measurements at the same time marker (wind_temp_df)
join on grid_index and Datetime
renamed u, v, wspd, wdir_wrt_0N

### Compute Θd for each row (wind_alignment)
#### **Be sure to subtract the raw value from 180 so that high values indicate good alignment**

### Add columns for normed TPY and Dps (ps_pm25_tpy_normed, school_to_ps_geod_dist_m_normed)

((X - Xmin) / (Xmax - Xmin))

min_wspd
max_wspd
ps_TPY_mean
ps_dist_mean
ps_TPY_sd
ps_dist_sd
ps_TPY_min
ps_dist_min
ps_TPY_max
ps_dist_max


### Add columns for normed Θd and wspd for v5 (wind_alignment_normed, wspd_normed)

### Compute each instrument for each row:
#### Izmd_v1_unnormed
#### Izmd_v2_nodist_unnormed
#### Izmd_v3_normed_D_and_TPY
#### Izmd_v4_nodist_normed_TPY
#### Izmd_v5_all_normed

### Save off completely un-aggregated version (wind_subset_by_month_joined_unaggregated / yyyy-mm)

### Aggregate to CDSCode level, summing each instrument

### Aggregate at school zip code, averaging each instrument 

### Rejoin with simple avgs

### Save off version aggregated at school level (aggregated_inst_by_month / yyyy-mm)

### Convert aggregated version to Pandas df and append to list

## End loop

### Append list of aggregated instruments into single dataframe and save off

## Below, we will walk through a single example month to check the code.
## After that, we will define the loop to run through all months and run it.

In [7]:
# load files

local_dir = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\'

school_lookup = spark.read.option("header",True).csv(os.path.join(local_dir, 'wind_grid_to_school_lookup_filtered.csv'))
ps_year_lookup = spark.read.option("header",True).csv(os.path.join(local_dir, 'year_lookup.csv'))
school_to_ps_lookup = spark.read.option("header",True).csv(os.path.join(local_dir, 'school_year_to_point_lookup_top_5_filtered.csv'))
ps_lookup = spark.read.option("header",True).csv(os.path.join(local_dir, 'wind_grid_to_ps_point_lookup_filtered.csv'))
school_filter = spark.read.option("header",True).csv(os.path.join(local_dir,'school_ym_filter.csv'))

In [164]:
school_lookup.limit(5).show()

+-----------------+--------------+----------+----------+-----------+--------------------------+
|school_grid_index|       CDSCode|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|
+-----------------+--------------+----------+----------+-----------+--------------------------+
|              789|10101080119628|     93706| 36.730273|-119.807915|               10656.24466|
|              866|10621096005839|     93631| 36.554793|-119.504582|               8904.455363|
|              827|10621176109920|     93720| 36.875241|-119.759962|               13828.73923|
|              827|10621176116313|     93611|   36.8173|-119.674236|                4329.91988|
|              753|10621251030477|     93234| 36.208894|-120.098567|               9630.587612|
+-----------------+--------------+----------+----------+-----------+--------------------------+



In [165]:
school_to_ps_lookup.limit(5).show()

+--------------+--------+-----------------+---------------------+----------------+------------------------+---------------+----------------+
|       CDSCode|ps_index|point_source_year|point_source_pm25_tpy|point_source_zip|school_to_ps_geod_dist_m|angle_to_school|ps_distance_rank|
+--------------+--------+-----------------+---------------------+----------------+------------------------+---------------+----------------+
|10621171030071|     111|             2002|           2.30478261|           93612|             710.4600631|   -53.65448057|               1|
|10621171030071|     624|             2002|               10.089|           93727|              6602.12932|    -1.83659538|               2|
|10621171030071|     787|             2002|              29.2146|           93711|             11296.27967|    100.0364033|               3|
|10621171030071|      95|             2002|          2.230020551|           93721|             14026.48193|    32.45898722|               4|
|106211710300

In [166]:
ps_lookup.limit(5).show()

+-------------+--------+----------------------+
|ps_grid_index|ps_index|wind_to_ps_geod_dist_m|
+-------------+--------+----------------------+
|          443|    1019|           9658.554153|
|          722|    1097|           8926.231706|
|          631|     110|           11822.85566|
|          589|    1103|           13572.37684|
|         1105|     124|           15949.34107|
+-------------+--------+----------------------+



In [456]:
# TPY/dist stats part 1

cols_to_drop = ['point_source_index','CDSCode', 'point_source_year', 'point_source_zip', 'angle_to_school', 'ps_distance_rank']

ps_agg = school_to_ps_lookup.drop(*cols_to_drop).distinct().cache()

ps_agg.limit(5).show()

+--------+---------+-----------+-------+-----------+------+------------------------+
|ps_index|   ps_lat|     ps_lon|ps_year|ps_pm25_tpy|ps_zip|school_to_ps_geod_dist_m|
+--------+---------+-----------+-------+-----------+------+------------------------+
|    2885| 36.90345| -119.75703|   2008| 9.64825383| 93730|             6462.600748|
|    2674|33.790827|-118.229607|   2008|4.684987571| 90744|             3323.244242|
|    2713|33.957819|-118.191938|   2008|   5.425192| 90280|             2446.774931|
|    4933| 36.98572| -120.11198|   2014|  2.4149127| 93637|              38425.2545|
|    2152| 38.72779| -121.31947|   2008|  1.8022843| 95747|             12671.59479|
+--------+---------+-----------+-------+-----------+------+------------------------+



In [113]:
# TPY/dist stats part 2
# compute mean/sd scalars

ps_stats = ps_agg.select(avg('point_source_pm25_tpy'), avg('school_to_ps_geod_dist_m'), 
                         stddev_samp('point_source_pm25_tpy'), stddev_samp('school_to_ps_geod_dist_m')).collect()

ps_stats

ps_stats = ps_stats[0]

print(ps_stats)

Row(avg(point_source_pm25_tpy)=13.225952014379654, avg(school_to_ps_geod_dist_m)=10453.018640947166, stddev_samp(point_source_pm25_tpy)=50.486910084637685, stddev_samp(school_to_ps_geod_dist_m)=13818.527648504192)


In [114]:
# TPY/dist stats part 3
# save out scalars for mean/sd

ps_TPY_mean = ps_stats[0]
ps_dist_mean = ps_stats[1]
ps_TPY_sd = ps_stats[2]
ps_dist_sd = ps_stats[3]

print("avg tpy:", ps_TPY_mean)

avg tpy: 13.225952014379654


In [116]:
# TPY/dist stats part 4
# use scalars to scale values

ps_agg = ps_agg.withColumn('TPY_norm', (col('point_source_pm25_tpy') - ps_TPY_mean)/ps_TPY_sd).withColumn('dist_norm', (col('school_to_ps_geod_dist_m') - ps_dist_mean)/ps_dist_sd)

ps_stats_mm = ps_agg.select(min('TPY_norm'), min('dist_norm'), max('TPY_norm'), max('dist_norm')).collect()

ps_stats_mm = ps_stats_mm[0]

print(ps_stats_mm)

Row(min(TPY_norm)=-0.2464301038332982, min(dist_norm)=-0.7541348842439959, max(TPY_norm)=61.78923457101895, max(dist_norm)=12.084924082134723)


In [118]:
# TPY/dist stats part 3
# save out scalars for min/max

ps_TPY_min = ps_stats_mm[0]
ps_dist_min = ps_stats_mm[1]
ps_TPY_max = ps_stats_mm[2]
ps_dist_max = ps_stats_mm[3]

print("min_norm_tpy:", ps_TPY_min)

min_norm_tpy: -0.2464301038332982


## Data structures (testing)

In [6]:
# testing only
parquet_file = '2001-01'

zmy_agg_list = []
school_my_agg_list = []
df_avgs_list = []

In [7]:
in_dir = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\wind_subset_by_month\\'

out_dir_unagged = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\raw_my_spark_dfs'
out_dir_zmy = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\zmy_agged_dfs\\'
out_dir_school_my = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\school_my_agged_dfs\\'

for parquet_file in os.listdir(in_dir):
    
    if parquet_file == '2001-01':
    
        # for holding augmented df at the zip code level
        temp_zmy_df = pd.DataFrame()
        
        # for holding augmented df at the school level
        temp_school_my_df = pd.DataFrame()

        # for holding summmary statistics
        temp_df_avgs = pd.DataFrame()

        # read in one month
        temp_meas_df = spark.read.parquet(os.path.join(in_dir, parquet_file))

In [8]:
temp_meas_df.limit(5).show()

+---------+-----------+--------+---------+----------+--------+-------------------+----------+-------+
| wind_lat|   wind_lon|       u|        v|      wdir|    wspd|           Datetime|grid_index|    y-m|
+---------+-----------+--------+---------+----------+--------+-------------------+----------+-------+
|42.029999|-124.160004|0.170099| -1.91193|275.084065|1.919482|2001-01-01 00:00:00|        44|2001-01|
|41.779999|-124.160004|0.610318|-2.619918|283.113362|2.690066|2001-01-01 00:00:00|       123|2001-01|
|41.529999|-124.160004|0.897872|-3.260106|285.398186|3.381489|2001-01-01 00:00:00|       124|2001-01|
|41.279999|-124.160004|0.849772|-3.431103| 283.91038|3.534768|2001-01-01 00:00:00|       125|2001-01|
|41.029999|-124.160004|0.826245|-3.327305|283.945749|3.428358|2001-01-01 00:00:00|       126|2001-01|
+---------+-----------+--------+---------+----------+--------+-------------------+----------+-------+



In [5]:
max_wspd = 19.395623
min_wspd = 0.000415

In [8]:
# compute wind dir wrt 0N

temp_meas_df = temp_meas_df.withColumn('wdir_wrt_0N',(180*F.atan2(col('u'), col('v'))/(3.141592653589793238462)).cast('double'))

AnalysisException: Column 'u' does not exist. Did you mean one of the following? [y-m, wspd, school_u, Datetime, school_v, wdir_wrt_0N, school_wind_lat, school_wind_lon, school_grid_index];
'Project [school_wind_lat#689, school_wind_lon#700, school_u#711, school_v#722, wspd#628, Datetime#629, school_grid_index#733, y-m#631, cast(((ATAN2('u, 'v) * 180) / 3.141592653589793) as double) AS wdir_wrt_0N#1619]
+- Project [school_wind_lat#689, school_wind_lon#700, school_u#711, school_v#722, wspd#628, Datetime#629, school_grid_index#733, y-m#631, wdir_wrt_0N#641]
   +- Project [school_wind_lat#689, school_wind_lon#700, school_u#711, school_v#722, wdir#627, wspd#628, Datetime#629, grid_index#630 AS school_grid_index#733, y-m#631, wdir_wrt_0N#641]
      +- Project [school_wind_lat#689, school_wind_lon#700, school_u#711, v#626 AS school_v#722, wdir#627, wspd#628, Datetime#629, grid_index#630, y-m#631, wdir_wrt_0N#641]
         +- Project [school_wind_lat#689, school_wind_lon#700, u#625 AS school_u#711, v#626, wdir#627, wspd#628, Datetime#629, grid_index#630, y-m#631, wdir_wrt_0N#641]
            +- Project [school_wind_lat#689, wind_lon#624 AS school_wind_lon#700, u#625, v#626, wdir#627, wspd#628, Datetime#629, grid_index#630, y-m#631, wdir_wrt_0N#641]
               +- Project [wind_lat#623 AS school_wind_lat#689, wind_lon#624, u#625, v#626, wdir#627, wspd#628, Datetime#629, grid_index#630, y-m#631, wdir_wrt_0N#641]
                  +- Project [wind_lat#623, wind_lon#624, u#625, v#626, wdir#627, wspd#628, Datetime#629, grid_index#630, y-m#631, cast(((ATAN2(u#625, v#626) * cast(180 as double)) / 3.141592653589793) as double) AS wdir_wrt_0N#641]
                     +- Relation [wind_lat#623,wind_lon#624,u#625,v#626,wdir#627,wspd#628,Datetime#629,grid_index#630,y-m#631] parquet


In [14]:
# drop lat/lon, wdir, and y-m and store temp df to re-join for ps wind readings (dropped can be recovered if needed)
# this assumes wdir calc is correct--that is verified below but these were run out of order

wind_temp_df = temp_meas_df.drop('wind_lat','wind_lon','wdir','y-m')

wind_temp_df.cache()

wind_temp_df.limit(10).show()

+---------+---------+--------+-------------------+----------+-------------------+
|        u|        v|    wspd|           Datetime|grid_index|        wdir_wrt_0N|
+---------+---------+--------+-------------------+----------+-------------------+
| 0.170099| -1.91193|1.919482|2001-01-01 00:00:00|        44| 174.91594219384388|
| 0.610318|-2.619918|2.690066|2001-01-01 00:00:00|       123| 166.88664073725573|
| 0.897872|-3.260106|3.381489|2001-01-01 00:00:00|       124| 164.60181000653935|
| 0.849772|-3.431103|3.534768|2001-01-01 00:00:00|       125| 166.08961753606766|
| 0.826245|-3.327305|3.428358|2001-01-01 00:00:00|       126| 166.05424716779632|
| 0.781805|-3.174908|3.269749|2001-01-01 00:00:00|       127| 166.16642852745528|
| 0.707041|-2.862313|2.948346|2001-01-01 00:00:00|       128|  166.1247092524355|
| 1.005051|-3.470103|3.612719|2001-01-01 00:00:00|       129| 163.84732058815214|
|-0.543557|-1.178743|1.298033|2001-01-01 00:00:00|       161|-155.24402584649116|
|-0.377821|-1.73

In [12]:
wind_temp_df.select(min('wspd')).collect()

[Row(min(wspd)=0.001715)]

In [15]:
# rename for explicitness of measurements

temp_meas_df = (temp_meas_df
                .withColumnRenamed('wind_lat','school_wind_lat')
                .withColumnRenamed('wind_lon','school_wind_lon')
                .withColumnRenamed('u','school_u')
                .withColumnRenamed('v','school_v')
                .withColumnRenamed('grid_index','school_grid_index')
               ).drop('wdir') # wdir is wrt 0° E and is confusing; y-m not needed

In [26]:
spot_check_df = temp_meas_df.filter(col('wdir_wrt_0N').cast('double') < 90)
spot_check_df = spot_check_df.filter(col('wdir_wrt_0N').cast('double') > -90)

In [27]:
# spot check calculations (run out of order but this checks the wind dir calc above)
spot_check_df.limit(50).show()

+---------------+---------------+---------+--------+--------+-------------------+-----------------+-------+-------------------+
|school_wind_lat|school_wind_lon| school_u|school_v|    wspd|           Datetime|school_grid_index|    y-m|        wdir_wrt_0N|
+---------------+---------------+---------+--------+--------+-------------------+-----------------+-------+-------------------+
|      38.529999|    -123.160004| 1.102296|0.010436|1.102346|2001-01-01 00:00:00|              288|2001-01|  89.45756775785236|
|      41.529999|    -122.910004| 0.147095|0.096835|0.176108|2001-01-01 00:00:00|              314|2001-01|  56.64244828556451|
|      38.779999|    -122.910004|-0.087131|0.147834|  0.1716|2001-01-01 00:00:00|              325|2001-01|-30.514418744787303|
|      38.529999|    -122.910004| 0.246955|0.670424|0.714462|2001-01-01 00:00:00|              326|2001-01| 20.221599925347913|
|      38.279999|    -122.910004| 0.448765|0.647025|0.787421|2001-01-01 00:00:00|              327|2001-

In [28]:
temp_meas_df.count()

358608

In [29]:
combined_df = temp_meas_df.join(school_lookup, ['school_grid_index'], how='inner')

In [30]:
combined_df.limit(5).show()

combined_df.count()

+-----------------+---------------+---------------+--------+---------+--------+-------------------+-------+------------------+-------------+----------+----------+-----------+--------------------------+
|school_grid_index|school_wind_lat|school_wind_lon|school_u| school_v|    wspd|           Datetime|    y-m|       wdir_wrt_0N|      CDSCode|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|
+-----------------+---------------+---------------+--------+---------+--------+-------------------+-------+------------------+-------------+----------+----------+-----------+--------------------------+
|               44|      42.029999|    -124.160004|0.170099| -1.91193|1.919482|2001-01-01 00:00:00|2001-01|174.91594219384388|8618206005458|     95567| 41.927578| -124.15199|               11395.56698|
|              123|      41.779999|    -124.160004|0.610318|-2.619918|2.690066|2001-01-01 00:00:00|2001-01|166.88664073725573|8618206005391|     95531| 41.755659|-124.206615|               472

9892968

In [31]:
# filter out schools which were closed, etc--as discovered elsewhere in pipeline
school_filter = school_filter.drop('_c0').withColumnRenamed('cdscode','CDSCode')

# drop join column y-m; it is not needed
combined_df = combined_df.join(school_filter, ['CDSCode','y-m'], how='inner').drop('y-m')

In [32]:
combined_df.limit(5).show()

combined_df.count()

+--------------+-----------------+---------------+---------------+---------+---------+--------+-------------------+-------------------+----------+----------+-----------+--------------------------+
|       CDSCode|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|    wspd|           Datetime|        wdir_wrt_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|
+--------------+-----------------+---------------+---------------+---------+---------+--------+-------------------+-------------------+----------+----------+-----------+--------------------------+
|10621176005904|              827|      36.779999|    -119.660004| 1.478731|-0.845149|1.703209|2001-01-01 00:00:00| 119.74955034130205|     93612| 36.814847|-119.703312|               5467.587759|
|10621176005904|              827|      36.779999|    -119.660004| 1.615188|-1.308341|2.078602|2001-01-01 01:00:00| 129.00830191243978|     93612| 36.814847|-119.703312|               5467.587759|
|10621176005904

6704184

In [499]:
# spot check for nulls

combined_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in combined_df.columns]).show()

+-------+-----------------+---------------+---------------+--------+--------+----+--------+-----------+----------+----------+----------+--------------------------+
|CDSCode|school_grid_index|school_wind_lat|school_wind_lon|school_u|school_v|wspd|Datetime|wdir_wrt_0N|school_zip|school_lat|school_lon|wind_to_school_geod_dist_m|
+-------+-----------------+---------------+---------------+--------+--------+----+--------+-----------+----------+----------+----------+--------------------------+
|      0|                0|              0|              0|       0|       0|   0|       0|          0|         0|         0|         0|                         0|
+-------+-----------------+---------------+---------------+--------+--------+----+--------+-----------+----------+----------+----------+--------------------------+



In [33]:
# compute zip code averages for wdir, wspd

zip_avgs = (combined_df.groupBy('school_zip')
            .avg('wspd','wdir_wrt_0N', 'school_u','school_v')
            .withColumnRenamed("school_zip","zip_code")
            .withColumnRenamed("avg(wspd)","avg_wspd_at_school")
            .withColumnRenamed("avg(wdir_wrt_0N)","avg_wdir_0N")
            .withColumnRenamed("avg(school_u)","avg_u")
            .withColumnRenamed("avg(school_v)","avg_v")
            .toPandas()
           )

zip_avgs['y-m'] = parquet_file

display(zip_avgs)

df_avgs_list.append(temp_df_avgs)

Unnamed: 0,zip_code,avg_wspd_at_school,avg_wdir_0N,avg_u,avg_v,y-m
0,91910,2.616727,-2.110364,0.156748,-0.378105,2017-12
1,92027,1.787543,-26.541009,-0.249706,0.063381,2017-12
2,93450,1.896387,-42.428058,-0.387293,-0.170559,2017-12
3,93013,1.940763,-28.054886,0.038671,-0.458093,2017-12
4,92879,1.615685,-44.518868,-0.568318,0.005619,2017-12
...,...,...,...,...,...,...
1358,95006,3.705027,22.822876,0.355557,-0.879218,2017-12
1359,95720,2.678182,-57.539202,-1.659426,0.309281,2017-12
1360,96059,2.604784,-65.429912,-1.661736,0.082171,2017-12
1361,94956,4.064473,10.049878,-0.040667,-0.277412,2017-12


In [34]:
combined_df = (combined_df
               .withColumnRenamed('wspd','school_wspd')
               .withColumnRenamed('wdir_wrt_0N','school_wdir_0N') 
              )

combined_df.limit(5).show()

+--------------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+-------------------+----------+----------+-----------+--------------------------+
|       CDSCode|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|school_wspd|           Datetime|     school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|
+--------------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+-------------------+----------+----------+-----------+--------------------------+
|10621176005904|              827|      36.779999|    -119.660004| 1.478731|-0.845149|   1.703209|2001-01-01 00:00:00| 119.74955034130205|     93612| 36.814847|-119.703312|               5467.587759|
|10621176005904|              827|      36.779999|    -119.660004| 1.615188|-1.308341|   2.078602|2001-01-01 01:00:00| 129.00830191243978|     93612| 36.814847|-119.703312|               5467.587759|


In [35]:
combined_df.count()

6704184

In [503]:
combined_df.printSchema()

root
 |-- CDSCode: string (nullable = true)
 |-- school_grid_index: string (nullable = true)
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_u: double (nullable = true)
 |-- school_v: double (nullable = true)
 |-- school_wspd: double (nullable = true)
 |-- Datetime: string (nullable = true)
 |-- school_wdir_0N: double (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- wind_to_school_geod_dist_m: string (nullable = true)



In [484]:
combined_df.limit(5).show()

+--------------+-------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+-------------------+----------+----------+-----------+--------------------------+
|       CDSCode|    y-m|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|school_wspd|           Datetime|     school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|
+--------------+-------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+-------------------+----------+----------+-----------+--------------------------+
|10621176005904|2001-01|              827|      36.779999|    -119.660004| 1.478731|-0.845149|   1.703209|2001-01-01 00:00:00| 119.74955034130205|     93612| 36.814847|-119.703312|               5467.587759|
|10621176005904|2001-01|              827|      36.779999|    -119.660004| 1.615188|-1.308341|   2.078602|2001-01-01 01:00:00| 129.00830191243978|     93612| 36.814847|

In [36]:
# need to lookup by CDSCode and year, so substring for year

combined_df = combined_df.withColumn("year", substring(col('Datetime'),1,4))

combined_df.limit(5).show()

+--------------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+-------------------+----------+----------+-----------+--------------------------+----+
|       CDSCode|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|school_wspd|           Datetime|     school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|year|
+--------------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+-------------------+----------+----------+-----------+--------------------------+----+
|10621176005904|              827|      36.779999|    -119.660004| 1.478731|-0.845149|   1.703209|2001-01-01 00:00:00| 119.74955034130205|     93612| 36.814847|-119.703312|               5467.587759|2001|
|10621176005904|              827|      36.779999|    -119.660004| 1.615188|-1.308341|   2.078602|2001-01-01 01:00:00| 129.00830191243978|     93612| 36.814847|-119.703312|        

In [37]:
# join in ps <-> year lookup

combined_df = combined_df.join(ps_year_lookup, ['year'], how='left').drop('year')

combined_df.count()

6704184

In [506]:
combined_df.limit(5).show()

+--------------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+-------------------+----------+----------+-----------+--------------------------+-------+
|       CDSCode|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|school_wspd|           Datetime|     school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|ps_year|
+--------------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+-------------------+----------+----------+-----------+--------------------------+-------+
|10621176005904|              827|      36.779999|    -119.660004| 1.478731|-0.845149|   1.703209|2001-01-01 00:00:00| 119.74955034130205|     93612| 36.814847|-119.703312|               5467.587759|   2002|
|10621176005904|              827|      36.779999|    -119.660004| 1.615188|-1.308341|   2.078602|2001-01-01 01:00:00| 129.00830191243978|     93612| 36.814847|-119.703

In [38]:
combined_df = combined_df.join(school_to_ps_lookup, ['CDSCode','ps_year'], how='left')

In [508]:
combined_df.limit(5).show()

+--------------+-------+-----------------+---------------+---------------+--------+---------+-----------+-------------------+------------------+----------+----------+-----------+--------------------------+--------+------+------+-----------+------+------------------------+---------------+----------------+
|       CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon|school_u| school_v|school_wspd|           Datetime|    school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|ps_index|ps_lat|ps_lon|ps_pm25_tpy|ps_zip|school_to_ps_geod_dist_m|angle_to_school|ps_distance_rank|
+--------------+-------+-----------------+---------------+---------------+--------+---------+-----------+-------------------+------------------+----------+----------+-----------+--------------------------+--------+------+------+-----------+------+------------------------+---------------+----------------+
|10619946005730|   2002|              790|      36.529999|    -119.910004| 0.72168

In [509]:
combined_df.count()

6704184

In [39]:
combined_df = combined_df.join(ps_lookup, ['ps_index'], how='left')

# length should not have changed

combined_df.count()

33520920

In [40]:
combined_df = combined_df.withColumnRenamed('geod_dist_m', 'school_to_ps_geod_dist_m')

In [512]:
combined_df.limit(5).show()

+--------+--------------+-------+-----------------+---------------+---------------+--------+---------+-----------+-------------------+------------------+----------+----------+-----------+--------------------------+------+------+-----------+------+------------------------+---------------+----------------+-------------+----------------------+
|ps_index|       CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon|school_u| school_v|school_wspd|           Datetime|    school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|ps_lat|ps_lon|ps_pm25_tpy|ps_zip|school_to_ps_geod_dist_m|angle_to_school|ps_distance_rank|ps_grid_index|wind_to_ps_geod_dist_m|
+--------+--------------+-------+-----------------+---------------+---------------+--------+---------+-----------+-------------------+------------------+----------+----------+-----------+--------------------------+------+------+-----------+------+------------------------+---------------+----------------+---------

In [41]:
combined_df = combined_df.withColumnRenamed("ps_grid_index","grid_index")

In [42]:
# join in saved-off wind measurements

combined_df = combined_df.join(wind_temp_df, ['grid_index',"Datetime"], how='left')

In [43]:
combined_df = (combined_df
                .withColumnRenamed('u','ps_u')
                .withColumnRenamed('v','ps_v')
                .withColumnRenamed('wspd','ps_wspd')
                .withColumnRenamed('wdir_wrt_0N','ps_wdir_0N')
               )

In [386]:
combined_df.count()

49464840

In [387]:
combined_df.limit(5).show()

+----------+-------------------+--------+-------------+-------+-----------------+---------------+---------------+--------+---------+-----------+------------------+----------+----------+-----------+--------------------------+---------+-----------+-----------+------+-----------+---------------+----------------+----------------------+--------+---------+--------+------------------+
|grid_index|           Datetime|ps_index|      CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon|school_u| school_v|school_wspd|    school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|   ps_lat|     ps_lon|ps_pm25_tpy|ps_zip|geod_dist_m|angle_to_school|ps_distance_rank|wind_to_ps_geod_dist_m|    ps_u|     ps_v| ps_wspd|        ps_wdir_0N|
+----------+-------------------+--------+-------------+-------+-----------------+---------------+---------------+--------+---------+-----------+------------------+----------+----------+-----------+--------------------------+---------+----

In [388]:
combined_df.printSchema()

root
 |-- grid_index: string (nullable = true)
 |-- Datetime: string (nullable = true)
 |-- ps_index: string (nullable = true)
 |-- CDSCode: string (nullable = true)
 |-- ps_year: string (nullable = true)
 |-- school_grid_index: string (nullable = true)
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_u: double (nullable = true)
 |-- school_v: double (nullable = true)
 |-- school_wspd: double (nullable = true)
 |-- school_wdir_0N: double (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- wind_to_school_geod_dist_m: string (nullable = true)
 |-- ps_lat: string (nullable = true)
 |-- ps_lon: string (nullable = true)
 |-- ps_pm25_tpy: string (nullable = true)
 |-- ps_zip: string (nullable = true)
 |-- geod_dist_m: string (nullable = true)
 |-- angle_to_school: string (nullable = true)
 |-- ps_distance_rank: string (nullable = true)
 |--

In [389]:
combined_df = combined_df.withColumn("angle_to_school",col("angle_to_school").cast("double"))

In [390]:
combined_df.printSchema()

root
 |-- grid_index: string (nullable = true)
 |-- Datetime: string (nullable = true)
 |-- ps_index: string (nullable = true)
 |-- CDSCode: string (nullable = true)
 |-- ps_year: string (nullable = true)
 |-- school_grid_index: string (nullable = true)
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_u: double (nullable = true)
 |-- school_v: double (nullable = true)
 |-- school_wspd: double (nullable = true)
 |-- school_wdir_0N: double (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- wind_to_school_geod_dist_m: string (nullable = true)
 |-- ps_lat: string (nullable = true)
 |-- ps_lon: string (nullable = true)
 |-- ps_pm25_tpy: string (nullable = true)
 |-- ps_zip: string (nullable = true)
 |-- geod_dist_m: string (nullable = true)
 |-- angle_to_school: double (nullable = true)
 |-- ps_distance_rank: string (nullable = true)
 |--

In [391]:
# spot check for nulls

combined_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in combined_df.columns]).show()

+----------+--------+--------+-------+-------+-----------------+---------------+---------------+--------+--------+-----------+--------------+----------+----------+----------+--------------------------+------+------+-----------+------+-----------+---------------+----------------+----------------------+----+----+-------+----------+
|grid_index|Datetime|ps_index|CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon|school_u|school_v|school_wspd|school_wdir_0N|school_zip|school_lat|school_lon|wind_to_school_geod_dist_m|ps_lat|ps_lon|ps_pm25_tpy|ps_zip|geod_dist_m|angle_to_school|ps_distance_rank|wind_to_ps_geod_dist_m|ps_u|ps_v|ps_wspd|ps_wdir_0N|
+----------+--------+--------+-------+-------+-----------------+---------------+---------------+--------+--------+-----------+--------------+----------+----------+----------+--------------------------+------+------+-----------+------+-----------+---------------+----------------+----------------------+----+----+-------+----------+
|   

In [44]:
# pyspark version of below

            
combined_df = (combined_df \
              .withColumn("school_angle_diff",(col('angle_to_school') - col('school_wdir_0N')).cast('double'))
              .withColumn("ps_angle_diff",(col('angle_to_school') - col('ps_wdir_0N')).cast('double'))
              )

combined_df = (combined_df \
              .withColumn("school_wind_alignment",
                         when(col("school_angle_diff") < -180, col("school_angle_diff") + 360) \
                        .when(col("school_angle_diff") > 180, col("school_angle_diff") - 360) \
                        .otherwise(col("school_angle_diff")).cast('double')) \
              .withColumn("ps_wind_alignment",
                         when(col("ps_angle_diff") < -180, col("ps_angle_diff") + 360) \
                        .when(col("ps_angle_diff") > 180, col("ps_angle_diff") - 360) \
                        .otherwise(col("ps_angle_diff")).cast('double'))
              ).drop("school_angle_diff").drop("ps_angle_diff")

In [392]:
# first angle is wind angle, second angle is heading to school (both wrt 0N)

def calculateDifferenceBetweenAngles(firstAngle, secondAngle):
    difference = secondAngle - firstAngle
    if (difference <= -180): 
        difference += 360
        return difference
    elif (difference >= 180): 
        difference -= 360
        return difference
    return difference

udf_calculateDifferenceBetweenAngles = F.udf(calculateDifferenceBetweenAngles)

In [393]:
combined_df = (combined_df
               .withColumn("school_wind_alignment",
                           udf_calculateDifferenceBetweenAngles(
                           col('school_wdir_0N'), col('angle_to_school')).cast('double'))
               .withColumn("ps_wind_alignment",
                           udf_calculateDifferenceBetweenAngles(
                           col('ps_wdir_0N'), col('angle_to_school')).cast('double'))
              )

In [394]:
combined_df.limit(5).show()

+----------+-------------------+--------+--------------+-------+-----------------+---------------+---------------+---------+---------+-----------+------------------+----------+----------+-----------+--------------------------+---------+-----------+-----------+------+-----------+---------------+----------------+----------------------+--------+---------+--------+-----------------+---------------------+-------------------+
|grid_index|           Datetime|ps_index|       CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|school_wspd|    school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|   ps_lat|     ps_lon|ps_pm25_tpy|ps_zip|geod_dist_m|angle_to_school|ps_distance_rank|wind_to_ps_geod_dist_m|    ps_u|     ps_v| ps_wspd|       ps_wdir_0N|school_wind_alignment|  ps_wind_alignment|
+----------+-------------------+--------+--------------+-------+-----------------+---------------+---------------+---------+---------+-----------+------

In [395]:
# spot check for nulls

combined_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in combined_df.columns]).show()

+----------+--------+--------+-------+-------+-----------------+---------------+---------------+--------+--------+-----------+--------------+----------+----------+----------+--------------------------+------+------+-----------+------+-----------+---------------+----------------+----------------------+----+----+-------+----------+---------------------+-----------------+
|grid_index|Datetime|ps_index|CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon|school_u|school_v|school_wspd|school_wdir_0N|school_zip|school_lat|school_lon|wind_to_school_geod_dist_m|ps_lat|ps_lon|ps_pm25_tpy|ps_zip|geod_dist_m|angle_to_school|ps_distance_rank|wind_to_ps_geod_dist_m|ps_u|ps_v|ps_wspd|ps_wdir_0N|school_wind_alignment|ps_wind_alignment|
+----------+--------+--------+-------+-------+-----------------+---------------+---------------+--------+--------+-----------+--------------+----------+----------+----------+--------------------------+------+------+-----------+------+-----------+----------

In [45]:
combined_df = combined_df.withColumn("central_wind_alignment_180_high", 
                                     (180 - abs(((col('school_wind_alignment') + col('ps_wind_alignment'))/2))).cast('double'))

In [46]:
combined_df.limit(20).show()

+----------+-------------------+--------+--------------+-------+-----------------+---------------+---------------+---------+--------+-----------+------------------+----------+----------+-----------+--------------------------+---------+-----------+-----------+------+------------------------+---------------+----------------+----------------------+--------+---------+--------+-----------------+---------------------+-------------------+-------------------------------+
|grid_index|           Datetime|ps_index|       CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon| school_u|school_v|school_wspd|    school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|   ps_lat|     ps_lon|ps_pm25_tpy|ps_zip|school_to_ps_geod_dist_m|angle_to_school|ps_distance_rank|wind_to_ps_geod_dist_m|    ps_u|     ps_v| ps_wspd|       ps_wdir_0N|school_wind_alignment|  ps_wind_alignment|central_wind_alignment_180_high|
+----------+-------------------+--------+--------------+-------+

In [400]:
# compute normed TPY and dist using scalars from above

combined_df = (combined_df
               .withColumn('ps_pm25_tpy_normed', 
                           ((((col('ps_pm25_tpy') - ps_TPY_mean) / ps_TPY_sd) - ps_TPY_min) / (ps_TPY_max - ps_TPY_min)).cast('double'))
               .withColumn('school_to_ps_geod_dist_m_normed',((((col('school_to_ps_geod_dist_m') - ps_dist_mean) / ps_dist_sd) - ps_dist_min) / (ps_dist_max - ps_dist_min)).cast('double'))
              )

In [407]:
# compute normed wspd and wind alignment for Instrument v5

combined_df = (combined_df
               .withColumn('avg_wspd',((col('school_wspd') + col('ps_wspd'))/2).cast('double'))
              )

combined_df = (combined_df
               .withColumn('central_wind_alignment_180_high_normed',
                           (col('central_wind_alignment_180_high')/180).cast('double'))
               .withColumn('avg_wspd_normed',
                           ((col('avg_wspd') - min_wspd) / (max_wspd - min_wspd)).cast('double'))
              )

In [409]:
combined_df.limit(10).show()

+----------+-------------------+--------+--------------+-------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+----------+----------+-----------+--------------------------+---------+-----------+-----------+------+------------------------+---------------+----------------+----------------------+--------+---------+--------+-----------------+---------------------+-------------------+-------------------------------+--------------------+-------------------------------+--------------------------------------+--------------------+-------------------+
|grid_index|           Datetime|ps_index|       CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|school_wspd|     school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|   ps_lat|     ps_lon|ps_pm25_tpy|ps_zip|school_to_ps_geod_dist_m|angle_to_school|ps_distance_rank|wind_to_ps_geod_dist_m|    ps_u|     ps_v| ps_wspd|       ps_wdir_0N|sc

In [410]:
combined_df = (combined_df
               .withColumn('Izmd_v1_unnormed',(col('central_wind_alignment_180_high') * col('ps_pm25_tpy') * (col('avg_wspd') / col('school_to_ps_geod_dist_m'))).cast('double'))
               .withColumn('Izmd_v2_nodist_unnormed',(col('central_wind_alignment_180_high') * col('ps_pm25_tpy') * col('avg_wspd')).cast('double'))
               .withColumn('Izmd_v3_normed_D_and_TPY',(col('central_wind_alignment_180_high') * col('ps_pm25_tpy_normed') * (col('avg_wspd') / col('school_to_ps_geod_dist_m_normed'))).cast('double'))
               .withColumn('Izmd_v4_nodist_normed_TPY',(col('central_wind_alignment_180_high') * col('ps_pm25_tpy_normed') * col('avg_wspd')).cast('double'))
               .withColumn('Izmd_v5_all_normed',(col('central_wind_alignment_180_high_normed') * col('ps_pm25_tpy_normed') * (col('avg_wspd_normed') / col('school_to_ps_geod_dist_m_normed'))).cast('double'))
              )

In [411]:
combined_df.limit(10).show()

+----------+-------------------+--------+--------------+-------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+----------+----------+-----------+--------------------------+---------+-----------+-----------+------+------------------------+---------------+----------------+----------------------+--------+---------+--------+-----------------+---------------------+-------------------+-------------------------------+--------------------+-------------------------------+--------------------------------------+--------------------+-------------------+--------------------+-----------------------+------------------------+-------------------------+--------------------+
|grid_index|           Datetime|ps_index|       CDSCode|ps_year|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|school_wspd|     school_wdir_0N|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|   ps_lat|     ps_lon|ps_pm25_tpy|ps_zip|school_to_ps

In [412]:
combined_df.columns

['grid_index',
 'Datetime',
 'ps_index',
 'CDSCode',
 'ps_year',
 'school_grid_index',
 'school_wind_lat',
 'school_wind_lon',
 'school_u',
 'school_v',
 'school_wspd',
 'school_wdir_0N',
 'school_zip',
 'school_lat',
 'school_lon',
 'wind_to_school_geod_dist_m',
 'ps_lat',
 'ps_lon',
 'ps_pm25_tpy',
 'ps_zip',
 'school_to_ps_geod_dist_m',
 'angle_to_school',
 'ps_distance_rank',
 'wind_to_ps_geod_dist_m',
 'ps_u',
 'ps_v',
 'ps_wspd',
 'ps_wdir_0N',
 'school_wind_alignment',
 'ps_wind_alignment',
 'central_wind_alignment_180_high',
 'ps_pm25_tpy_normed',
 'school_to_ps_geod_dist_m_normed',
 'central_wind_alignment_180_high_normed',
 'avg_wspd_normed',
 'avg_wspd',
 'Izmd_v1_unnormed',
 'Izmd_v2_nodist_unnormed',
 'Izmd_v3_normed_D_and_TPY',
 'Izmd_v4_nodist_normed_TPY',
 'Izmd_v5_all_normed']

In [415]:
# start selecting and aggregating down

cols_to_select = ['CDSCode',
                'school_zip',
                'ps_distance_rank',
                'ps_pm25_tpy_normed',
                'ps_pm25_tpy',
                'angle_to_school',
                'school_to_ps_geod_dist_m_normed',
                'school_to_ps_geod_dist_m',
                 'central_wind_alignment_180_high',
                 'central_wind_alignment_180_high_normed',
                 'avg_wspd_normed',
                 'avg_wspd',
                 'Izmd_v1_unnormed',
                 'Izmd_v2_nodist_unnormed',
                 'Izmd_v3_normed_D_and_TPY',
                 'Izmd_v4_nodist_normed_TPY',
                 'Izmd_v5_all_normed']

combined_df = combined_df.select(*cols_to_select)

combined_df.limit(5).show()

+--------------+----------+----------------+--------------------+-----------+---------------+-------------------------------+------------------------+-------------------------------+--------------------------------------+--------------------+------------------+--------------------+-----------------------+------------------------+-------------------------+--------------------+
|       CDSCode|school_zip|ps_distance_rank|  ps_pm25_tpy_normed|ps_pm25_tpy|angle_to_school|school_to_ps_geod_dist_m_normed|school_to_ps_geod_dist_m|central_wind_alignment_180_high|central_wind_alignment_180_high_normed|     avg_wspd_normed|          avg_wspd|    Izmd_v1_unnormed|Izmd_v2_nodist_unnormed|Izmd_v3_normed_D_and_TPY|Izmd_v4_nodist_normed_TPY|  Izmd_v5_all_normed|
+--------------+----------+----------------+--------------------+-----------+---------------+-------------------------------+------------------------+-------------------------------+--------------------------------------+--------------------+

In [417]:
### AGGREGATION 1/3: reduce to m-y-school-ps(-zip) level ###

group_by_cols = ['CDSCode',
                'school_zip',
                'ps_distance_rank',
                'ps_pm25_tpy_normed',
                'ps_pm25_tpy',
                'angle_to_school',
                'school_to_ps_geod_dist_m_normed',
                'school_to_ps_geod_dist_m']

combined_df = combined_df.groupBy(*group_by_cols) \
                .agg(
                avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                avg('avg_wspd').alias('avg_wspd'), \
                sum('Izmd_v1_unnormed').alias('Izmd_v1_unnormed'), \
                sum('Izmd_v2_nodist_unnormed').alias('Izmd_v2_nodist_unnormed'), \
                sum('Izmd_v3_normed_D_and_TPY').alias('Izmd_v3_normed_D_and_TPY'), \
                sum('Izmd_v4_nodist_normed_TPY').alias('Izmd_v4_nodist_normed_TPY'), \
                sum('Izmd_v5_all_normed').alias('Izmd_v5_all_normed') \
                    )

combined_df.limit(10).show()

+--------------+----------+----------------+--------------------+-----------+---------------+-------------------------------+------------------------+-------------------------------+--------------------------------------+-------------------+------------------+------------------+-----------------------+------------------------+-------------------------+------------------+
|       CDSCode|school_zip|ps_distance_rank|  ps_pm25_tpy_normed|ps_pm25_tpy|angle_to_school|school_to_ps_geod_dist_m_normed|school_to_ps_geod_dist_m|central_wind_alignment_180_high|central_wind_alignment_180_high_normed|    avg_wspd_normed|          avg_wspd|  Izmd_v1_unnormed|Izmd_v2_nodist_unnormed|Izmd_v3_normed_D_and_TPY|Izmd_v4_nodist_normed_TPY|Izmd_v5_all_normed|
+--------------+----------+----------------+--------------------+-----------+---------------+-------------------------------+------------------------+-------------------------------+--------------------------------------+-------------------+-----------

In [418]:
combined_df.count()

66485

In [419]:
### AGGREGATION 2/3: reduce to m-y-school(-zip) level ###

combined_df = combined_df.groupBy("CDSCode", "school_zip") \
                .agg(
                avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                avg('ps_pm25_tpy_normed').alias('ps_pm25_tpy_normed'), \
                avg('school_to_ps_geod_dist_m_normed').alias('school_to_ps_geod_dist_m_normed'), \
                avg('ps_pm25_tpy').alias('ps_pm25_tpy'), \
                avg('school_to_ps_geod_dist_m').alias('school_to_ps_geod_dist_m'), \
                avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                avg('avg_wspd').alias('avg_wspd'), \
                sum('Izmd_v1_unnormed').alias('Izmd_v1_unnormed'), \
                sum('Izmd_v2_nodist_unnormed').alias('Izmd_v2_nodist_unnormed'), \
                sum('Izmd_v3_normed_D_and_TPY').alias('Izmd_v3_normed_D_and_TPY'), \
                sum('Izmd_v4_nodist_normed_TPY').alias('Izmd_v4_nodist_normed_TPY'), \
                sum('Izmd_v5_all_normed').alias('Izmd_v5_all_normed') \
                    )

combined_df.limit(10).show()

+--------------+----------+-------------------------------+--------------------+-------------------------------+------------------+------------------------+--------------------------------------+-------------------+------------------+------------------+-----------------------+------------------------+-------------------------+------------------+
|       CDSCode|school_zip|central_wind_alignment_180_high|  ps_pm25_tpy_normed|school_to_ps_geod_dist_m_normed|       ps_pm25_tpy|school_to_ps_geod_dist_m|central_wind_alignment_180_high_normed|    avg_wspd_normed|          avg_wspd|  Izmd_v1_unnormed|Izmd_v2_nodist_unnormed|Izmd_v3_normed_D_and_TPY|Izmd_v4_nodist_normed_TPY|Izmd_v5_all_normed|
+--------------+----------+-------------------------------+--------------------+-------------------------------+------------------+------------------------+--------------------------------------+-------------------+------------------+------------------+-----------------------+------------------------+--

In [420]:
combined_df.count()

13297

In [421]:
### AGGREGATION 3/3: reduce to m-y-zip level ###

combined_df = combined_df.groupBy("school_zip") \
                .agg(
                avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                avg('ps_pm25_tpy_normed').alias('ps_pm25_tpy_normed'), \
                avg('school_to_ps_geod_dist_m_normed').alias('school_to_ps_geod_dist_m_normed'), \
                avg('ps_pm25_tpy').alias('ps_pm25_tpy'), \
                avg('school_to_ps_geod_dist_m').alias('school_to_ps_geod_dist_m'), \
                avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                avg('avg_wspd').alias('avg_wspd'), \
                avg('Izmd_v1_unnormed').alias('Izmd_v1_unnormed'), \
                avg('Izmd_v2_nodist_unnormed').alias('Izmd_v2_nodist_unnormed'), \
                avg('Izmd_v3_normed_D_and_TPY').alias('Izmd_v3_normed_D_and_TPY'), \
                avg('Izmd_v4_nodist_normed_TPY').alias('Izmd_v4_nodist_normed_TPY'), \
                avg('Izmd_v5_all_normed').alias('Izmd_v5_all_normed'), \
                count('CDSCode')
                )

In [422]:
combined_df.limit(10).show()

+----------+-------------------------------+--------------------+-------------------------------+------------------+------------------------+--------------------------------------+-------------------+------------------+------------------+-----------------------+------------------------+-------------------------+------------------+
|school_zip|central_wind_alignment_180_high|  ps_pm25_tpy_normed|school_to_ps_geod_dist_m_normed|       ps_pm25_tpy|school_to_ps_geod_dist_m|central_wind_alignment_180_high_normed|    avg_wspd_normed|          avg_wspd|  Izmd_v1_unnormed|Izmd_v2_nodist_unnormed|Izmd_v3_normed_D_and_TPY|Izmd_v4_nodist_normed_TPY|Izmd_v5_all_normed|
+----------+-------------------------------+--------------------+-------------------------------+------------------+------------------------+--------------------------------------+-------------------+------------------+------------------+-----------------------+------------------------+-------------------------+------------------+
|

In [423]:
combined_df.count()

1539

In [430]:
display(pd_combined_df)

Unnamed: 0,school_zip,central_wind_alignment_180_high,ps_pm25_tpy_normed,school_to_ps_geod_dist_m_normed,ps_pm25_tpy,school_to_ps_geod_dist_m,central_wind_alignment_180_high_normed,avg_wspd_normed,avg_wspd,Izmd_v1_unnormed,Izmd_v2_nodist_unnormed,Izmd_v3_normed_D_and_TPY,Izmd_v4_nodist_normed_TPY,Izmd_v5_all_normed,y-m
0,93545,83.838107,0.001836,0.285797,6.534494,50737.137513,0.465767,0.073056,1.417343,56.619465,3.092897e+06,2725.352792,881.296346,0.780409,2017-12
1,90022,76.816557,0.001958,0.024536,6.918356,4385.052865,0.426759,0.088330,1.713602,747.958785,3.161758e+06,37474.416083,886.255079,10.731574,2017-12
2,95134,86.754583,0.001530,0.014368,5.576702,2581.073635,0.481970,0.106103,2.058303,1914.158122,3.717731e+06,94362.605192,1018.616360,27.023731,2017-12
3,91910,84.753907,0.011993,0.027924,38.345966,4986.259941,0.470855,0.135873,2.635697,8183.436869,3.248345e+07,459356.513832,10158.477004,131.558119,2017-12
4,95519,106.226910,0.009749,0.069164,31.318293,12302.859938,0.590150,0.183478,3.559006,4756.258363,4.565250e+07,264556.840783,14215.170805,75.771189,2017-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,95595,76.609741,0.007131,0.313867,23.119979,55717.276230,0.425610,0.122226,2.371016,257.009172,1.453019e+07,14059.435628,4478.538944,4.026420,2017-12
1535,95697,95.570250,0.001927,0.034893,6.821147,6222.677918,0.530946,0.126618,2.456203,816.911272,5.200833e+06,40635.870048,1459.008794,11.637578,2017-12
1536,90068,105.093625,0.001554,0.050988,5.651148,9078.079417,0.583853,0.099029,1.921112,503.231207,4.150721e+06,24776.972339,1138.069072,7.095565,2017-12
1537,92285,114.008252,0.017967,0.248821,57.058044,44177.017038,0.633379,0.099679,1.933707,1091.287026,4.455261e+07,61042.581036,14020.187832,17.481067,2017-12


In [424]:
pd_combined_df = combined_df.toPandas()

pd_combined_df['y-m'] = parquet_file

zmy_agg_list.append(combined_df)

NameError: name 'zmy_agg_list' is not defined

In [432]:
pd_combined_df.shape

(1539, 15)

In [434]:
# test final join

df_merged = pd.merge(pd_combined_df, zip_avgs, left_on=["school_zip","y-m"], right_on=["zip_code", "y-m"], how="left")

In [436]:
display(df_merged)

Unnamed: 0,school_zip,central_wind_alignment_180_high,ps_pm25_tpy_normed,school_to_ps_geod_dist_m_normed,ps_pm25_tpy,school_to_ps_geod_dist_m,central_wind_alignment_180_high_normed,avg_wspd_normed,avg_wspd_x,Izmd_v1_unnormed,Izmd_v2_nodist_unnormed,Izmd_v3_normed_D_and_TPY,Izmd_v4_nodist_normed_TPY,Izmd_v5_all_normed,y-m,zip_code,avg_wspd_y,avg_wdir_0N,avg_u,avg_v
0,93545,83.838107,0.001836,0.285797,6.534494,50737.137513,0.465767,0.073056,1.417343,56.619465,3.092897e+06,2725.352792,881.296346,0.780409,2017-12,93545,1.316469,77.580772,0.943365,-0.255368
1,90022,76.816557,0.001958,0.024536,6.918356,4385.052865,0.426759,0.088330,1.713602,747.958785,3.161758e+06,37474.416083,886.255079,10.731574,2017-12,90022,1.713602,-72.940519,-0.489077,-0.455836
2,95134,86.754583,0.001530,0.014368,5.576702,2581.073635,0.481970,0.106103,2.058303,1914.158122,3.717731e+06,94362.605192,1018.616360,27.023731,2017-12,95134,2.068261,-16.739172,-0.410638,0.179738
3,91910,84.753907,0.011993,0.027924,38.345966,4986.259941,0.470855,0.135873,2.635697,8183.436869,3.248345e+07,459356.513832,10158.477004,131.558119,2017-12,91910,2.654365,0.210515,0.202613,-0.397664
4,95519,106.226910,0.009749,0.069164,31.318293,12302.859938,0.590150,0.183478,3.559006,4756.258363,4.565250e+07,264556.840783,14215.170805,75.771189,2017-12,95519,3.834603,-31.618201,-1.502319,0.392380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1534,95595,76.609741,0.007131,0.313867,23.119979,55717.276230,0.425610,0.122226,2.371016,257.009172,1.453019e+07,14059.435628,4478.538944,4.026420,2017-12,95595,1.746149,-62.096406,-1.213069,0.062697
1535,95697,95.570250,0.001927,0.034893,6.821147,6222.677918,0.530946,0.126618,2.456203,816.911272,5.200833e+06,40635.870048,1459.008794,11.637578,2017-12,95697,2.438686,54.012624,0.218650,-0.464267
1536,90068,105.093625,0.001554,0.050988,5.651148,9078.079417,0.583853,0.099029,1.921112,503.231207,4.150721e+06,24776.972339,1138.069072,7.095565,2017-12,90068,2.000370,-43.854822,-0.257081,-0.751116
1537,92285,114.008252,0.017967,0.248821,57.058044,44177.017038,0.633379,0.099679,1.933707,1091.287026,4.455261e+07,61042.581036,14020.187832,17.481067,2017-12,92285,2.152793,66.609685,1.068993,-0.526304


## Basic Instrument form (distance is present in some versions only)

## $$I_{zmy} = \sum_{ps=1}^{3} \sum_{d_{m}=1}^{D_{m}}\theta_{downstream_{zd_{m}}} \times TPY_{ps} \times \frac{S_{zd_{m}}}{D_{ps}}$$

V1: as written, no normalizing - Our original IV </br>
V2: no dividing by distance, no normalizing - Cornelia wants this</br>
V3: as written, normalizing - normalize TPY, Dps (z-score, min max)</br>
V4: no dividing by distance, normalizing - normalize TPY, Dps (z-score, min max)</br>
V5: as written, all quantities normed


# End Calculations/Aggregation Step-through

___________________


# Begin Calculation/Aggregation Loop

In [6]:
# Setup directories/variables

dest = 'remote' # 'local'

local_dir = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\'
remote_dir = 'I:\\.shortcut-targets-by-id\\11wLy1WKwOTcthBs1rpfEzkqax2BZG-6E\\W210_Capstone\\Data\\'

if dest == 'remote':
    file_path = remote_dir
else:
    file_path = local_dir

in_dir = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\wind_subset_by_month\\'
out_dir_zmy_raw_avgs = os.path.join(file_path,'spark_outputs/naive_zmy_avgs/')
out_dir_unagged = os.path.join(file_path,'spark_outputs/raw_my_spark_dfs_top1/')
out_dir_zmy = os.path.join(file_path,'spark_outputs/zmy_agged_dfs/')

In [136]:
# testing     
"""Process and Aggregate Monthly Data
Before running this, set input/output directories above.

Inputs: early_stopping (int): for testing, set to max iterations to perform
Outputs: combined Pandas dataframe with all aggregated y-m data
"""
### Opening Section: Data Load and Preprocessing ###

early_stopping = 1

# read in files
school_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/wind_grid_to_school_lookup_filtered.csv'))
ps_year_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/year_lookup.csv'))
school_to_ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/school_year_to_point_lookup_top10_filtered.csv'))
ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/wind_grid_to_ps_point_lookup_top10_filtered.csv'))
school_filter = spark.read.option("header",True).csv(os.path.join(file_path,'lookup tables/school_ym_filter.csv'))

# Calculate and save out statistics for first normalizing (standard scaler)
cols_to_drop = ['ps_index','CDSCode', 'ps_year', 'ps_zip', 'angle_to_school', 'ps_distance_rank']
ps_agg = school_to_ps_lookup.drop(*cols_to_drop).distinct().cache()

ps_stats = ps_agg.select(avg('ps_pm25_tpy'), avg('school_to_ps_geod_dist_m'), 
                         stddev_samp('ps_pm25_tpy'), stddev_samp('school_to_ps_geod_dist_m')).collect()

ps_stats = ps_stats[0]

ps_TPY_mean = ps_stats[0]
ps_dist_mean = ps_stats[1]
ps_TPY_sd = ps_stats[2]
ps_dist_sd = ps_stats[3]

# Calculate and save statistics for second normalizing (min-max)
ps_agg = ps_agg.withColumn('TPY_norm', (col('ps_pm25_tpy') - ps_TPY_mean)/ps_TPY_sd).withColumn('dist_norm', (col('school_to_ps_geod_dist_m') - ps_dist_mean)/ps_dist_sd)
ps_stats_mm = ps_agg.select(min('TPY_norm'), min('dist_norm'), max('TPY_norm'), max('dist_norm')).collect()
ps_stats_mm = ps_stats_mm[0]

ps_TPY_min = ps_stats_mm[0]
ps_dist_min = ps_stats_mm[1]
ps_TPY_max = ps_stats_mm[2]
ps_dist_max = ps_stats_mm[3]

# lists to contain pandas dataframes
zmy_agg_list = []
df_avgs_list = []

counter = 0

# loop through files
for parquet_file in os.listdir(in_dir):

    if (early_stopping == 0 or counter < early_stopping):

        print("Now processing",parquet_file)

        # for holding augmented df at the zip code level
        temp_zmy_df = pd.DataFrame()

        # for holding augmented df at the school level
        temp_school_my_df = pd.DataFrame()

        # read in one month
        temp_meas_df = spark.read.parquet(os.path.join(in_dir, parquet_file))

        temp_meas_df = (temp_meas_df
                        .withColumn('wdir_wrt_0N',(180*F.atan2(col('u'), col('v'))
                                                   /(3.141592653589793238462)).cast('double')
                                   )
                        )

        # drop lat/lon, wdir, and y-m and store temp df to re-join for ps wind readings (dropped can be recovered if needed)
        wind_temp_df = temp_meas_df.drop('wind_lat','wind_lon','wdir','y-m')
        wind_temp_df.cache()

        # rename for explicitness of measurements
        temp_meas_df = (temp_meas_df
                        .withColumnRenamed('wind_lat','school_wind_lat')
                        .withColumnRenamed('wind_lon','school_wind_lon')
                        .withColumnRenamed('u','school_u')
                        .withColumnRenamed('v','school_v')
                        .withColumnRenamed('grid_index','school_grid_index')
                       ).drop('wdir') # wdir is wrt 0° E and is confusing
        
        counter += 1

Now processing 2001-01
Now processing 2001-02
Now processing 2001-03
Now processing 2001-04
Now processing 2001-05
Now processing 2001-06
Now processing 2001-07
Now processing 2001-08
Now processing 2001-09
Now processing 2001-10
Now processing 2001-11
Now processing 2001-12
Now processing 2002-01
Now processing 2002-02
Now processing 2002-03
Now processing 2002-04
Now processing 2002-05
Now processing 2002-06
Now processing 2002-07
Now processing 2002-08
Now processing 2002-09
Now processing 2002-10
Now processing 2002-11
Now processing 2002-12
Now processing 2003-01
Now processing 2003-02
Now processing 2003-03
Now processing 2003-04
Now processing 2003-05
Now processing 2003-06
Now processing 2003-07
Now processing 2003-08
Now processing 2003-09
Now processing 2003-10
Now processing 2003-11
Now processing 2003-12
Now processing 2004-01
Now processing 2004-02
Now processing 2004-03
Now processing 2004-04
Now processing 2004-05
Now processing 2004-06
Now processing 2004-07
Now process

In [143]:
combined_df = temp_meas_df.join(school_lookup, ['school_grid_index'], how='inner')

In [146]:
combined_df.printSchema()

root
 |-- school_grid_index: string (nullable = true)
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_u: double (nullable = true)
 |-- school_v: double (nullable = true)
 |-- wspd: double (nullable = true)
 |-- Datetime: string (nullable = true)
 |-- y-m: string (nullable = true)
 |-- wdir_wrt_0N: double (nullable = true)
 |-- CDSCode: string (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- wind_to_school_geod_dist_m: string (nullable = true)



In [144]:
temp_meas_df.printSchema()

root
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_u: double (nullable = true)
 |-- school_v: double (nullable = true)
 |-- wspd: double (nullable = true)
 |-- Datetime: string (nullable = true)
 |-- school_grid_index: string (nullable = true)
 |-- y-m: string (nullable = true)
 |-- wdir_wrt_0N: double (nullable = true)



In [145]:
school_lookup.printSchema()

root
 |-- school_grid_index: string (nullable = true)
 |-- CDSCode: string (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- wind_to_school_geod_dist_m: string (nullable = true)



In [155]:
school_filter.printSchema()

root
 |-- y-m: string (nullable = true)
 |-- CDSCode: string (nullable = true)



In [154]:
school_filter = school_filter.drop('_c0').withColumnRenamed('cdscode','CDSCode')

In [147]:
combined_df.show()

+-----------------+---------------+---------------+---------+---------+--------+-------------------+-------+-------------------+-----------+----------+----------+-----------+--------------------------+
|school_grid_index|school_wind_lat|school_wind_lon| school_u| school_v|    wspd|           Datetime|    y-m|        wdir_wrt_0N|    CDSCode|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|
+-----------------+---------------+---------------+---------+---------+--------+-------------------+-------+-------------------+-----------+----------+----------+-----------+--------------------------+
|              275|      41.779999|    -123.160004|-1.681037|-0.647547|1.801445|2017-12-10 03:00:00|2017-12|-111.06702753751394|4.77046E+13|     96086| 41.842518|-123.193087|                7468.31985|
|              275|      41.779999|    -123.160004|-1.681037|-0.647547|1.801445|2017-12-10 03:00:00|2017-12|-111.06702753751394|4.77046E+13|     96086| 41.838551| -123.18893|               693

In [142]:
school_lookup.show()

+-----------------+-----------+----------+----------+-----------+--------------------------+
|school_grid_index|    CDSCode|school_zip|school_lat| school_lon|wind_to_school_geod_dist_m|
+-----------------+-----------+----------+----------+-----------+--------------------------+
|              789|1.01011E+13|     93706| 36.730273|-119.807915|               10656.24466|
|              866|1.06211E+13|     93631| 36.554793|-119.504582|               8904.455363|
|              827|1.06212E+13|     93720| 36.875241|-119.759962|               13828.73923|
|              827|1.06212E+13|     93611|   36.8173|-119.674236|                4329.91988|
|              753|1.06213E+13|     93234| 36.208894|-120.098567|               9630.587612|
|              789|1.06217E+13|     93703| 36.766774| -119.79984|               9943.805591|
|              789|1.06217E+13|     93704| 36.799444|-119.811523|               9051.034654|
|              789|1.06217E+13|     93704| 36.808151|-119.807492|     

In [137]:
temp_meas_df.show()

+---------------+---------------+---------+---------+--------+-------------------+-----------------+-------+-------------------+
|school_wind_lat|school_wind_lon| school_u| school_v|    wspd|           Datetime|school_grid_index|    y-m|        wdir_wrt_0N|
+---------------+---------------+---------+---------+--------+-------------------+-----------------+-------+-------------------+
|      41.779999|    -123.160004|-1.681037|-0.647547|1.801445|2017-12-10 03:00:00|              275|2017-12|-111.06702753751394|
|      40.779999|    -123.160004|-0.465244|-0.689127|0.831473|2017-12-10 03:00:00|              279|2017-12| -145.9758937657448|
|      40.529999|    -123.160004|-0.096408|-0.579673|0.587635|2017-12-10 03:00:00|              280|2017-12|-170.55731365363118|
|      39.779999|    -123.160004|-1.034598|-0.695242|1.246497|2017-12-10 03:00:00|              283|2017-12|-123.90080103304723|
|      39.279999|    -123.160004| -0.98825|-0.635317|1.174847|2017-12-10 03:00:00|              2

In [22]:
school_to_ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/school_year_to_point_lookup_top20_5tpy_filtered.csv'))
cols_to_drop = ['_c0','ps_index','CDSCode', 'ps_year', 'ps_zip', 'angle_to_school', 'ps_distance_rank']
ps_agg = school_to_ps_lookup.drop(*cols_to_drop).distinct().cache()
school_to_ps_lookup.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- CDSCode: string (nullable = true)
 |-- point_source_index: string (nullable = true)
 |-- point_source_lat: string (nullable = true)
 |-- point_source_lon: string (nullable = true)
 |-- point_source_year: string (nullable = true)
 |-- point_source_pm25_tpy: string (nullable = true)
 |-- point_source_zip: string (nullable = true)
 |-- geod_dist_m: string (nullable = true)
 |-- angle_to_school: string (nullable = true)
 |-- ps_distance_rank: string (nullable = true)



In [23]:
school_to_ps_lookup = (school_to_ps_lookup
                      .withColumnRenamed("point_source_index","ps_index")
                      .withColumnRenamed("point_source_lat","ps_lat")
                      .withColumnRenamed("point_source_lon","ps_lon")
                      .withColumnRenamed("point_source_year","ps_year")
                      .withColumnRenamed("point_source_pm25_tpy","ps_pm25_tpy")
                      .withColumnRenamed("point_source_zip","ps_zip")
                      .withColumnRenamed("geod_dist_m","school_to_ps_geod_dist_m")
                      .drop('_c0')
                      )

In [24]:
school_to_ps_lookup.printSchema()

root
 |-- CDSCode: string (nullable = true)
 |-- ps_index: string (nullable = true)
 |-- ps_lat: string (nullable = true)
 |-- ps_lon: string (nullable = true)
 |-- ps_year: string (nullable = true)
 |-- ps_pm25_tpy: string (nullable = true)
 |-- ps_zip: string (nullable = true)
 |-- school_to_ps_geod_dist_m: string (nullable = true)
 |-- angle_to_school: string (nullable = true)
 |-- ps_distance_rank: string (nullable = true)



In [174]:
def aggregate_zmy(early_stopping: int = 0):
    """Process and Aggregate Monthly Data
    Before running this, set input/output directories above.

    Inputs: early_stopping (int): for testing, set to max iterations to perform
    Outputs: combined Pandas dataframe with all aggregated y-m data
    """
    ### Opening Section: Data Load and Preprocessing ###

    # read in files
    school_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/wind_grid_to_school_lookup_filtered.csv'))
    ps_year_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/year_lookup.csv'))
    school_to_ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/school_year_to_point_lookup_top20_5tpy_filtered.csv'))
    ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/wind_grid_to_ps_point_lookup_top10_filtered.csv'))
    school_filter = spark.read.option("header",True).csv(os.path.join(file_path,'lookup tables/school_ym_filter.csv'))
    
    school_to_ps_lookup = (school_to_ps_lookup
                          .withColumnRenamed("point_source_index","ps_index")
                          .withColumnRenamed("point_source_lat","ps_lat")
                          .withColumnRenamed("point_source_lon","ps_lon")
                          .withColumnRenamed("point_source_year","ps_year")
                          .withColumnRenamed("point_source_pm25_tpy","ps_pm25_tpy")
                          .withColumnRenamed("point_source_zip","ps_zip")
                          .withColumnRenamed("geod_dist_m","school_to_ps_geod_dist_m")
                          .drop('_c0')
                          )
    
    # Calculate and save out statistics for first normalizing (standard scaler)
    cols_to_drop = ['ps_index','CDSCode', 'ps_year', 'ps_zip', 'angle_to_school', 'ps_distance_rank']
    ps_agg = school_to_ps_lookup.drop(*cols_to_drop).distinct()

    ps_stats = ps_agg.select(avg('ps_pm25_tpy'), avg('school_to_ps_geod_dist_m'), 
                             stddev_samp('ps_pm25_tpy'), stddev_samp('school_to_ps_geod_dist_m')).collect()

    ps_stats = ps_stats[0]

    ps_TPY_mean = ps_stats[0]
    ps_dist_mean = ps_stats[1]
    ps_TPY_sd = ps_stats[2]
    ps_dist_sd = ps_stats[3]

    # Calculate and save statistics for second normalizing (min-max)
    ps_agg = ps_agg.withColumn('TPY_norm', (col('ps_pm25_tpy') - ps_TPY_mean)/ps_TPY_sd).withColumn('dist_norm', (col('school_to_ps_geod_dist_m') - ps_dist_mean)/ps_dist_sd)
    ps_stats_mm = ps_agg.select(min('TPY_norm'), min('dist_norm'), max('TPY_norm'), max('dist_norm')).collect()
    ps_stats_mm = ps_stats_mm[0]

    ps_TPY_min = ps_stats_mm[0]
    ps_dist_min = ps_stats_mm[1]
    ps_TPY_max = ps_stats_mm[2]
    ps_dist_max = ps_stats_mm[3]

    # lists to contain pandas dataframes
    zmy_agg_list = []
    df_avgs_list = []
    
    counter = 0

    # loop through files
    for parquet_file in os.listdir(in_dir):
    
        
        if (early_stopping == 0 or counter < early_stopping):
            
            print("Now processing",parquet_file)

            # for holding augmented df at the zip code level
            temp_zmy_df = pd.DataFrame()

            # for holding augmented df at the school level
            temp_school_my_df = pd.DataFrame()

            # read in one month
            temp_meas_df = spark.read.parquet(os.path.join(in_dir, parquet_file))

            temp_meas_df = (temp_meas_df
                            .withColumn('wdir_wrt_0N',(180*F.atan2(col('u'), col('v'))
                                                       /(3.141592653589793238462)).cast('double')
                                       )
                            )

            # drop lat/lon, wdir, and y-m and store temp df to re-join for ps wind readings (dropped can be recovered if needed)
            wind_temp_df = temp_meas_df.drop('wind_lat','wind_lon','wdir','y-m')
            wind_temp_df.cache()

            # rename for explicitness of measurements
            temp_meas_df = (temp_meas_df
                            .withColumnRenamed('wind_lat','school_wind_lat')
                            .withColumnRenamed('wind_lon','school_wind_lon')
                            .withColumnRenamed('u','school_u')
                            .withColumnRenamed('v','school_v')
                            .withColumnRenamed('grid_index','school_grid_index')
                           ).drop('wdir') # wdir is wrt 0° E and is confusing

            combined_df = temp_meas_df.join(school_lookup, ['school_grid_index'], how='inner')

            # filter out schools which were closed, etc--as discovered elsewhere in pipeline
            school_filter = school_filter.drop('_c0').withColumnRenamed('cdscode','CDSCode')

            # for testing
            # combined_df.limit(5).show()
            
            # drop join column y-m; it is not needed
            combined_df = combined_df.join(school_filter, ['CDSCode','y-m'], how='inner').drop('y-m')
            
            # for testing
            # combined_df.limit(5).show()
            
            # compute zip code averages for wdir, wspd
            zip_avgs = (combined_df.groupBy('school_zip')
                        .avg('wspd','wdir_wrt_0N', 'school_u','school_v')
                        .withColumnRenamed("school_zip","zip_code")
                        .withColumnRenamed("avg(wspd)","avg_wspd_at_school")
                        .withColumnRenamed("avg(wdir_wrt_0N)","avg_wdir_0N")
                        .withColumnRenamed("avg(school_u)","avg_u")
                        .withColumnRenamed("avg(school_v)","avg_v")
                        .toPandas()
                       )

            zip_avgs['y-m'] = parquet_file

            df_avgs_list.append(zip_avgs)

            combined_df = (combined_df
                           .withColumnRenamed('wspd','school_wspd')
                           .withColumnRenamed('wdir_wrt_0N','school_wdir_0N') 
                          )
            # for testing
            # combined_df.limit(5).show()

            ### Middle Section: joins ###

            # need to lookup by CDSCode and year, so substring for year
            combined_df = combined_df.withColumn("year", substring(col('Datetime'),1,4))

            # join in ps <-> year lookup
            combined_df = combined_df.join(ps_year_lookup, ['year'], how='left').drop('year')

            # use lookup year to join in PSs to each school
            combined_df = combined_df.join(school_to_ps_lookup, ['CDSCode','ps_year'], how='left')

            # join lookup table to get nearest wind grid index for each PS
            combined_df = combined_df.join(ps_lookup, ['ps_index'], how='left')

            # rename for clarity
            combined_df = combined_df.withColumnRenamed('geod_dist_m', 'school_to_ps_geod_dist_m')

            # rename of ease/cleanliness of join           
            combined_df = combined_df.withColumnRenamed("ps_grid_index","grid_index")
            
            # filter for top n only if desired
            combined_df = combined_df.filter(col('ps_distance_rank') == '1')

            # join wind measurements at PS that we had saved off
            combined_df = combined_df.join(wind_temp_df, ['grid_index',"Datetime"], how='left')

            # rename for clarity
            combined_df = (combined_df
                            .withColumnRenamed('u','ps_u')
                            .withColumnRenamed('v','ps_v')
                            .withColumnRenamed('wspd','ps_wspd')
                            .withColumnRenamed('wdir_wrt_0N','ps_wdir_0N')
                           )
        
            combined_df = combined_df.withColumn("angle_to_school",col("angle_to_school").cast("double"))

            combined_df.cache()
            
            # for testing
            # combined_df.limit(5).show()
            
            ### Second-to-last Section: Computations ###

            # function to compute better difference between alignments, factoring the zero-crossing
            # first angle is wind angle, second angle is heading to school (both wrt 0N)
            
            combined_df = (combined_df \
                          .withColumn("school_angle_diff",(col('angle_to_school') - col('school_wdir_0N')).cast('double'))
                          .withColumn("ps_angle_diff",(col('angle_to_school') - col('ps_wdir_0N')).cast('double'))
                          )

            combined_df.cache()
            
            combined_df = (combined_df \
                          .withColumn("school_wind_alignment",
                                     when(col("school_angle_diff") < -180, col("school_angle_diff") + 360) \
                                    .when(col("school_angle_diff") > 180, col("school_angle_diff") - 360) \
                                    .otherwise(col("school_angle_diff")).cast('double')) \
                          .withColumn("ps_wind_alignment",
                                     when(col("ps_angle_diff") < -180, col("ps_angle_diff") + 360) \
                                    .when(col("ps_angle_diff") > 180, col("ps_angle_diff") - 360) \
                                    .otherwise(col("ps_angle_diff")).cast('double'))
                          ).drop("school_angle_diff").drop("ps_angle_diff")

            combined_df.cache()
            
            combined_df = combined_df.withColumn("central_wind_alignment_180_high", 
                                                 (180 - abs(((col('school_wind_alignment') + col('ps_wind_alignment'))/2))).cast('double'))

            combined_df.cache()
            
            # compute normed TPY and dist using scalars from above
            combined_df = (combined_df
                           .withColumn('ps_pm25_tpy_normed', 
                                       ((((col('ps_pm25_tpy') - ps_TPY_mean) / ps_TPY_sd) - ps_TPY_min) / (ps_TPY_max - ps_TPY_min)).cast('double'))
                           .withColumn('school_to_ps_geod_dist_m_normed',((((col('school_to_ps_geod_dist_m') - ps_dist_mean) / ps_dist_sd) - ps_dist_min) / (ps_dist_max - ps_dist_min)).cast('double'))
                          )
            
            combined_df.cache()

            # compute normed wspd and wind alignment for Instrument v5

            combined_df = (combined_df
                           .withColumn('avg_wspd',((col('school_wspd') + col('ps_wspd'))/2).cast('double'))
                          )
            
            combined_df.cache()

            combined_df = (combined_df
                           .withColumn('central_wind_alignment_180_high_normed',
                                       (col('central_wind_alignment_180_high')/180).cast('double'))
                           .withColumn('avg_wspd_normed',
                                       ((col('avg_wspd') - min_wspd) / (max_wspd - min_wspd)).cast('double'))
                          )
            
            combined_df.cache()

            combined_df = (combined_df
                           .withColumn('Izmy_v1_unnormed',(col('central_wind_alignment_180_high') * col('ps_pm25_tpy') * (col('avg_wspd') / col('school_to_ps_geod_dist_m'))).cast('double'))
                           .withColumn('Izmy_v2_nodist_unnormed',(col('central_wind_alignment_180_high') * col('ps_pm25_tpy') * col('avg_wspd')).cast('double'))
                           .withColumn('Izmy_v3_normed_D_and_TPY',(col('central_wind_alignment_180_high') * col('ps_pm25_tpy_normed') * (col('avg_wspd') / col('school_to_ps_geod_dist_m_normed'))).cast('double'))
                           .withColumn('Izmy_v4_nodist_normed_TPY',(col('central_wind_alignment_180_high') * col('ps_pm25_tpy_normed') * col('avg_wspd')).cast('double'))
                           .withColumn('Izmy_v5_all_normed',(col('central_wind_alignment_180_high_normed') * col('ps_pm25_tpy_normed') * (col('avg_wspd_normed') / col('school_to_ps_geod_dist_m_normed'))).cast('double'))
                          )

            # for testing
            # combined_df.limit(5).show()
            
            ### Final Section: Aggregations and Saving Out ###

            # write out raw (unaggregated) df
            # file_name = os.path.join(out_dir_unagged, parquet_file)
            # combined_df.write.parquet(file_name)

            # start selecting and aggregating down

            cols_to_select = ['CDSCode',
                            'school_zip',
                            'ps_distance_rank',
                            'ps_pm25_tpy_normed',
                            'ps_pm25_tpy',
                            'angle_to_school',
                            'school_to_ps_geod_dist_m_normed',
                            'school_to_ps_geod_dist_m',
                             'central_wind_alignment_180_high',
                             'central_wind_alignment_180_high_normed',
                             'avg_wspd_normed',
                             'avg_wspd',
                             'Izmy_v1_unnormed',
                             'Izmy_v2_nodist_unnormed',
                             'Izmy_v3_normed_D_and_TPY',
                             'Izmy_v4_nodist_normed_TPY',
                             'Izmy_v5_all_normed']

            combined_df = combined_df.select(*cols_to_select)

            ### AGGREGATION 1/3: reduce to m-y-school-ps(-zip) level ###

            group_by_cols = ['CDSCode',
                            'school_zip',
                            'ps_distance_rank',
                            'ps_pm25_tpy_normed',
                            'ps_pm25_tpy',
                            'angle_to_school',
                            'school_to_ps_geod_dist_m_normed',
                            'school_to_ps_geod_dist_m']

            combined_df = combined_df.groupBy(*group_by_cols) \
                            .agg(
                            avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                            avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                            avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                            avg('avg_wspd').alias('avg_wspd'), \
                            sum('Izmy_v1_unnormed').alias('Izmy_v1_unnormed'), \
                            sum('Izmy_v2_nodist_unnormed').alias('Izmy_v2_nodist_unnormed'), \
                            sum('Izmy_v3_normed_D_and_TPY').alias('Izmy_v3_normed_D_and_TPY'), \
                            sum('Izmy_v4_nodist_normed_TPY').alias('Izmy_v4_nodist_normed_TPY'), \
                            sum('Izmy_v5_all_normed').alias('Izmy_v5_all_normed') \
                                )

            ### AGGREGATION 2/3: reduce to m-y-school(-zip) level ###

            combined_df = combined_df.groupBy("CDSCode", "school_zip") \
                            .agg(
                            avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                            avg('ps_pm25_tpy_normed').alias('ps_pm25_tpy_normed'), \
                            avg('school_to_ps_geod_dist_m_normed').alias('school_to_ps_geod_dist_m_normed'), \
                            avg('ps_pm25_tpy').alias('ps_pm25_tpy'), \
                            avg('school_to_ps_geod_dist_m').alias('school_to_ps_geod_dist_m'), \
                            avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                            avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                            avg('avg_wspd').alias('avg_wspd'), \
                            sum('Izmy_v1_unnormed').alias('Izmy_v1_unnormed'), \
                            sum('Izmy_v2_nodist_unnormed').alias('Izmy_v2_nodist_unnormed'), \
                            sum('Izmy_v3_normed_D_and_TPY').alias('Izmy_v3_normed_D_and_TPY'), \
                            sum('Izmy_v4_nodist_normed_TPY').alias('Izmy_v4_nodist_normed_TPY'), \
                            sum('Izmy_v5_all_normed').alias('Izmy_v5_all_normed') \
                                )

            ### AGGREGATION 3/3: reduce to m-y-zip level ###

            combined_df = combined_df.groupBy("school_zip") \
                            .agg(
                            avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                            avg('ps_pm25_tpy_normed').alias('ps_pm25_tpy_normed'), \
                            avg('school_to_ps_geod_dist_m_normed').alias('school_to_ps_geod_dist_m_normed'), \
                            avg('ps_pm25_tpy').alias('ps_pm25_tpy_top_1'), \
                            avg('school_to_ps_geod_dist_m').alias('school_to_ps_geod_dist_m_top_1'), \
                            avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                            avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                            avg('avg_wspd').alias('avg_wspd_top_1'), \
                            avg('Izmy_v1_unnormed').alias('Izmy_v1_unnormed'), \
                            avg('Izmy_v2_nodist_unnormed').alias('Izmy_v2_nodist_unnormed'), \
                            avg('Izmy_v3_normed_D_and_TPY').alias('Izmy_v3_normed_D_and_TPY'), \
                            avg('Izmy_v4_nodist_normed_TPY').alias('Izmy_v4_nodist_normed_TPY'), \
                            avg('Izmy_v5_all_normed').alias('Izmy_v5_all_normed'), \
                            count('CDSCode').alias('schools')).toPandas()

            # combined_df['y-m'] = parquet_file
            
            # display(combined_df)

            zmy_agg_list.append(combined_df)

            # counter for early stopping
            counter += 1
    df_avgs = pd.concat(df_avgs_list)
    # df_avgs.to_csv(os.path.join(out_dir_zmy_raw_avgs,'df_zmy_avgs.csv'))
                            
    df = pd.concat(zmy_agg_list)
    
    df_merged = pd.merge(df, df_avgs, left_on=["school_zip","y-m"], right_on=["zip_code", "y-m"], how="left").drop(columns = 'zip_code')

    return df_merged

In [175]:
df_Izmy = aggregate_zmy()

Now processing 2001-01
Now processing 2001-02
Now processing 2001-03
Now processing 2001-04
Now processing 2001-05
Now processing 2001-06
Now processing 2001-07
Now processing 2001-08
Now processing 2001-09
Now processing 2001-10
Now processing 2001-11
Now processing 2001-12
Now processing 2002-01
Now processing 2002-02
Now processing 2002-03
Now processing 2002-04
Now processing 2002-05
Now processing 2002-06
Now processing 2002-07
Now processing 2002-08
Now processing 2002-09
Now processing 2002-10
Now processing 2002-11
Now processing 2002-12
Now processing 2003-01
Now processing 2003-02
Now processing 2003-03
Now processing 2003-04
Now processing 2003-05
Now processing 2003-06
Now processing 2003-07
Now processing 2003-08
Now processing 2003-09
Now processing 2003-10
Now processing 2003-11
Now processing 2003-12
Now processing 2004-01
Now processing 2004-02
Now processing 2004-03
Now processing 2004-04
Now processing 2004-05
Now processing 2004-06
Now processing 2004-07
Now process

Py4JJavaError: An error occurred while calling o279098.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 31678.0 failed 1 times, most recent failure: Lost task 0.0 in stage 31678.0 (TID 177046) (Lyons-Desktop-CPPC.hscs.virginia.edu executor driver): TaskResultLost (result lost from block manager)
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectIterator(SparkPlan.scala:431)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:137)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:191)
	at java.util.concurrent.FutureTask.run(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)


In [9]:
# Setup directories/variables

max_wspd = 19.395623
min_wspd = 0.000415

dest = 'remote' # or, optionally, 'local'

local_dir = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\'
remote_dir = 'I:\\.shortcut-targets-by-id\\11wLy1WKwOTcthBs1rpfEzkqax2BZG-6E\\W210_Capstone\\Data\\'

if dest == 'remote':
    file_path = remote_dir
else:
    file_path = local_dir

in_dir = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\data\\wind_subset_by_month\\'
out_dir_zmy_raw_avgs = os.path.join(file_path,'spark_outputs/naive_zmy_avgs/')
out_dir_unagged = os.path.join(file_path,'spark_outputs/raw_my_spark_dfs_top1/')
out_dir_zmy = os.path.join(file_path,'spark_outputs/zmy_agged_dfs/')

In [26]:
used_site_lookup = spark.read.option("header",True).csv(os.path.join(local_dir, 'all_used_wind_sites.csv'))

# used_site_lookup = used_site_lookup.drop('wind_lat').drop('wind_lon')

In [None]:
temp_meas_df.count()

In [27]:
# pulling wind data only for graphing

# read in one month
temp_meas_df = spark.read.parquet(os.path.join(in_dir, '2015-04'))

temp_meas_df = (temp_meas_df
                .withColumn('wdir_wrt_0N',(180*F.atan2(col('u'), col('v'))
                                           /(3.141592653589793238462)).cast('double')
                           )
                )

# temp_meas_df = temp_meas_df.toPandas()

# temp_meas_df.to_csv(os.path.join(local_dir,'wind_plotting_data_full_grid_2015-04.csv'))

In [28]:
temp_meas_df.count()

347040

In [29]:
temp_meas_df = temp_meas_df.join(used_site_lookup, ['grid_index', 'wind_lat','wind_lon'], how='inner')

In [30]:
temp_meas_df.count()

347040

In [31]:
temp_meas_df.limit(5).show()

+----------+---------+-----------+--------+---------+----------+--------+-------------------+-------+------------------+
|grid_index| wind_lat|   wind_lon|       u|        v|      wdir|    wspd|           Datetime|    y-m|       wdir_wrt_0N|
+----------+---------+-----------+--------+---------+----------+--------+-------------------+-------+------------------+
|        44|42.029999|-124.160004| 2.21919|-0.628957|344.176342|2.306597|2015-04-01 00:00:00|2015-04|105.82365114590112|
|       123|41.779999|-124.160004| 2.57809|-1.276175|333.664235| 2.87666|2015-04-01 00:00:00|2015-04|116.33577504217752|
|       124|41.529999|-124.160004|2.486899|-2.000245|321.189825|3.191496|2015-04-01 00:00:00|2015-04|128.81016973605261|
|       125|41.279999|-124.160004|2.106133|-2.545203|309.607458|3.303613|2015-04-01 00:00:00|2015-04|140.39254379681486|
|       126|41.029999|-124.160004|1.539785| -3.06158|296.699567|3.426983|2015-04-01 00:00:00|2015-04| 153.3004379814922|
+----------+---------+----------

In [43]:
one_day_df.count()

11568

In [44]:
one_day_df = one_day_df.toPandas()
one_day_df.to_csv(os.path.join(local_dir,'wind_plotting_one_day.csv'))

In [42]:
one_day_df = temp_meas_df.filter(F.substring('Datetime', 1, 10) == '2015-04-01')

In [None]:
# compute wind averages only




In [10]:
# code was failing near final stage on a subsequent run--hijacking the process so I can grab/fix outputs

# read in files
school_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/wind_grid_to_school_lookup_filtered.csv'))
ps_year_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/year_lookup.csv'))
school_to_ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'schools/school_year_to_point_lookup_top20_20tpy_filtered.csv'))
ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/wind_grid_to_ps_point_lookup_top10_filtered.csv'))
school_filter = spark.read.option("header",True).csv(os.path.join(file_path,'lookup tables/school_ym_filter.csv'))
nearby_ps = spark.read.option("header",True).csv(os.path.join(file_path,'schools/pointsources_within_5km_by_school_by_year.csv'))

nearby_ps = nearby_ps.withColumnRenamed('point_source_year','ps_year').drop('_c0')

school_to_ps_lookup = (school_to_ps_lookup
                      .withColumnRenamed("point_source_index","ps_index")
                      .withColumnRenamed("point_source_lat","ps_lat")
                      .withColumnRenamed("point_source_lon","ps_lon")
                      .withColumnRenamed("point_source_year","ps_year")
                      .withColumnRenamed("point_source_pm25_tpy","ps_pm25_tpy")
                      .withColumnRenamed("point_source_zip","ps_zip")
                      .withColumnRenamed("geod_dist_m","school_to_ps_geod_dist_m")
                      .drop('_c0')
                      )

school_lookup.cache()
ps_year_lookup.cache()
school_to_ps_lookup.cache()
ps_lookup.cache()
school_filter.cache()

# Calculate and save out statistics for first normalizing (standard scaler)
cols_to_drop = ['ps_index','CDSCode', 'ps_year', 'ps_zip', 'angle_to_school', 'ps_distance_rank']
ps_agg = school_to_ps_lookup.drop(*cols_to_drop).distinct()

ps_stats = ps_agg.select(avg('ps_pm25_tpy'), avg('school_to_ps_geod_dist_m'), 
                         stddev_samp('ps_pm25_tpy'), stddev_samp('school_to_ps_geod_dist_m')).collect()

ps_stats = ps_stats[0]

ps_TPY_mean = ps_stats[0]
ps_dist_mean = ps_stats[1]
ps_TPY_sd = ps_stats[2]
ps_dist_sd = ps_stats[3]

# Calculate and save statistics for second normalizing (min-max)
ps_agg = ps_agg.withColumn('TPY_norm', (col('ps_pm25_tpy') - ps_TPY_mean)/ps_TPY_sd).withColumn('dist_norm', (col('school_to_ps_geod_dist_m') - ps_dist_mean)/ps_dist_sd)
ps_stats_mm = ps_agg.select(min('TPY_norm'), min('dist_norm'), max('TPY_norm'), max('dist_norm')).collect()
ps_stats_mm = ps_stats_mm[0]

ps_TPY_min = ps_stats_mm[0]
ps_dist_min = ps_stats_mm[1]
ps_TPY_max = ps_stats_mm[2]
ps_dist_max = ps_stats_mm[3]

# lists to contain pandas dataframes
zmy_agg_list = []
df_avgs_list = []

counter = 0

# loop through files
for parquet_file in os.listdir(in_dir):

    print("Now processing",parquet_file)

    # for holding augmented df at the zip code level
    temp_zmy_df = pd.DataFrame()

    # for holding augmented df at the school level
    temp_school_my_df = pd.DataFrame()

    # read in one month
    temp_meas_df = spark.read.parquet(os.path.join(in_dir, parquet_file))

    temp_meas_df = (temp_meas_df
                    .withColumn('wdir_wrt_0N',(180*F.atan2(col('u'), col('v'))
                                               /(3.141592653589793238462)).cast('double')
                               )
                    )

    # drop lat/lon, wdir, and y-m and store temp df to re-join for ps wind readings (dropped can be recovered if needed)
    wind_temp_df = temp_meas_df.drop('wind_lat','wind_lon','wdir','y-m')
    wind_temp_df.cache()

    # rename for explicitness of measurements
    temp_meas_df = (temp_meas_df
                    .withColumnRenamed('wind_lat','school_wind_lat')
                    .withColumnRenamed('wind_lon','school_wind_lon')
                    .withColumnRenamed('u','school_u')
                    .withColumnRenamed('v','school_v')
                    .withColumnRenamed('grid_index','school_grid_index')
                   ).drop('wdir') # wdir is wrt 0° E and is confusing

    combined_df = temp_meas_df.join(school_lookup, ['school_grid_index'], how='inner')

    # filter out schools which were closed, etc--as discovered elsewhere in pipeline
    school_filter = school_filter.drop('_c0').withColumnRenamed('cdscode','CDSCode')

    # for testing
    # combined_df.limit(5).show()

    # drop join column y-m; it is not needed
    combined_df = combined_df.join(school_filter, ['CDSCode','y-m'], how='inner').drop('y-m')

    # for testing
    # combined_df.limit(5).show()

    # compute zip code averages for wdir, wspd
    zip_avgs = (combined_df.groupBy('school_zip')
                .avg('wspd','wdir_wrt_0N', 'school_u','school_v')
                .withColumnRenamed("school_zip","zip_code")
                .withColumnRenamed("avg(wspd)","avg_wspd_at_school")
                .withColumnRenamed("avg(wdir_wrt_0N)","avg_wdir_0N")
                .withColumnRenamed("avg(school_u)","avg_u")
                .withColumnRenamed("avg(school_v)","avg_v")
                .toPandas()
               )

    zip_avgs['y-m'] = parquet_file

    df_avgs_list.append(zip_avgs)

    combined_df = (combined_df
                   .withColumnRenamed('wspd','school_wspd')
                   .withColumnRenamed('wdir_wrt_0N','school_wdir_0N') 
                  )
    # for testing
    # combined_df.limit(5).show()

    ### Middle Section: joins ###


    # need to lookup by CDSCode and year, so substring for year
    combined_df = combined_df.withColumn("year", substring(col('Datetime'),1,4))

    # join in ps <-> year lookup
    combined_df = combined_df.join(ps_year_lookup, ['year'], how='left').drop('year')

    # use lookup year to join in PSs to each school
    combined_df = combined_df.join(school_to_ps_lookup, ['CDSCode','ps_year'], how='left')

    # join in count of nearby PSs by year and CDSCode
    combined_df = combined_df.join(nearby_ps, ['CDSCode','ps_year'], how='left')
    
    combined_df = combined_df.withColumn('ps_within_5km_count', F.coalesce(col('ps_within_5km_count'),F.lit(0)))
    
    # join lookup table to get nearest wind grid index for each PS
    combined_df = combined_df.join(ps_lookup, ['ps_index'], how='left')

    # rename for clarity
    combined_df = combined_df.withColumnRenamed('geod_dist_m', 'school_to_ps_geod_dist_m')

    # rename of ease/cleanliness of join           
    combined_df = combined_df.withColumnRenamed("ps_grid_index","grid_index")
    
    combined_df = combined_df.withColumn("ps_distance_rank",col("ps_distance_rank").cast('int'))

    # filter for top n only if desired
    combined_df = combined_df.filter(col('ps_distance_rank') <= '20')

    # join wind measurements at PS that we had saved off
    combined_df = combined_df.join(wind_temp_df, ['grid_index',"Datetime"], how='left')

    # rename for clarity
    combined_df = (combined_df
                    .withColumnRenamed('u','ps_u')
                    .withColumnRenamed('v','ps_v')
                    .withColumnRenamed('wspd','ps_wspd')
                    .withColumnRenamed('wdir_wrt_0N','ps_wdir_0N')
                   )

    combined_df = combined_df.withColumn("angle_to_school",col("angle_to_school").cast("double"))

    
    # for testing
    # combined_df.limit(5).show()

    ### Second-to-last Section: Computations ###

    # function to compute better difference between alignments, factoring the zero-crossing
    # first angle is wind angle, second angle is heading to school (both wrt 0N)

    combined_df = (combined_df \
                  .withColumn("school_angle_diff",(col('angle_to_school') - col('school_wdir_0N')).cast('double'))
                  .withColumn("ps_angle_diff",(col('angle_to_school') - col('ps_wdir_0N')).cast('double'))
                  )

    combined_df = (combined_df \
                  .withColumn("school_wind_alignment",
                             when(col("school_angle_diff") < -180, col("school_angle_diff") + 360) \
                            .when(col("school_angle_diff") > 180, col("school_angle_diff") - 360) \
                            .otherwise(col("school_angle_diff")).cast('double')) \
                  .withColumn("ps_wind_alignment",
                             when(col("ps_angle_diff") < -180, col("ps_angle_diff") + 360) \
                            .when(col("ps_angle_diff") > 180, col("ps_angle_diff") - 360) \
                            .otherwise(col("ps_angle_diff")).cast('double'))
                  ).drop("school_angle_diff").drop("ps_angle_diff")

    
    combined_df = combined_df.withColumn("central_wind_alignment_180_high", 
                                         (180 - abs(((col('school_wind_alignment') + col('ps_wind_alignment'))/2))).cast('double'))
    
    # compute normed TPY and dist using scalars from above
    combined_df = (combined_df \
                   .withColumn('ps_pm25_tpy_normed', 
                               ((((col('ps_pm25_tpy') - ps_TPY_mean) / ps_TPY_sd) - ps_TPY_min) / (ps_TPY_max - ps_TPY_min)).cast('double'))
                   .withColumn('school_to_ps_geod_dist_m_normed',((((col('school_to_ps_geod_dist_m') - ps_dist_mean) / ps_dist_sd) - ps_dist_min) / (ps_dist_max - ps_dist_min)).cast('double'))
                  )
    
    combined_df = combined_df.withColumn('school_to_ps_geod_dist_m_normed_0_high', (F.lit(1) - col('school_to_ps_geod_dist_m_normed')).cast('double'))
    
    # compute normed wspd and wind alignment for Instrument v5

    combined_df = (combined_df \
                   .withColumn('avg_wspd',((col('school_wspd') + col('ps_wspd'))/2).cast('double'))
                  )

    combined_df = (combined_df \
                   .withColumn('central_wind_alignment_180_high_normed',
                               (col('central_wind_alignment_180_high')/180).cast('double'))
                   .withColumn('avg_wspd_normed',
                               ((col('avg_wspd') - min_wspd) / (max_wspd - min_wspd)).cast('double'))
                  )
    
    combined_df = (combined_df \
                   .withColumn('new_alignment_90_high',
                             when((col('central_wind_alignment_180_high') - 90) < 0, 0) \
                            .otherwise(col('central_wind_alignment_180_high') - 90).cast('double')) \
                  )

    combined_df = (combined_df \
                   .withColumn('new_alignment_90_high_normed',
                               (col('new_alignment_90_high')/90).cast('double'))
                  )
    
    # compute elevation differential
    combined_df = (combined_df
                   .withColumn('school_elevation_m',col('school_elevation_m').cast('double'))
                   .withColumn('ps_m',col('ps_elevation_m').cast('double'))
                  )
    
    combined_df = (combined_df
               .withColumn('elevation_diff_m',(col('ps_elevation_m') - col('school_elevation_m')).cast('double'))
                  )
    
    combined_df = (combined_df
           .withColumn('wspd_ratio_ps_sch',(col('ps_wspd') / (col('school_wspd')+0.0001)).cast('double'))
              )
    
    combined_df = (combined_df
           .withColumn('wspd_ratio_sch_ps',(col('school_wspd') / (col('ps_wspd')+0.0001)).cast('double'))
              )
    
    combined_df = (combined_df
                   .withColumn('Izmy_v1_unnormed',(col('new_alignment_90_high') * col('ps_pm25_tpy') * (col('avg_wspd') / col('school_to_ps_geod_dist_m'))).cast('double'))
                   .withColumn('Izmy_v1_unnormed_wspd_ratio',(col('new_alignment_90_high') * col('ps_pm25_tpy') * (col('wspd_ratio_ps_sch') / col('school_to_ps_geod_dist_m'))).cast('double'))
                   .withColumn('Izmy_v2_nodist_unnormed',(col('new_alignment_90_high') * col('ps_pm25_tpy') * col('avg_wspd')).cast('double'))
                   .withColumn('Izmy_v2_nodist_unnormed_wspd_ratio',(col('new_alignment_90_high') * col('ps_pm25_tpy') * col('wspd_ratio_ps_sch')).cast('double'))
                   .withColumn('Izmy_v3_normed_D_and_TPY',(col('new_alignment_90_high') * col('ps_pm25_tpy_normed') * col('avg_wspd') * col('school_to_ps_geod_dist_m_normed_0_high')).cast('double'))
                   .withColumn('Izmy_v3_normed_D_and_TPY_wspd_ratio',(col('new_alignment_90_high') * col('ps_pm25_tpy_normed') * col('wspd_ratio_ps_sch') * col('school_to_ps_geod_dist_m_normed_0_high')).cast('double'))
                   .withColumn('Izmy_v4_nodist_normed_TPY',(col('new_alignment_90_high') * col('ps_pm25_tpy_normed') * col('avg_wspd')).cast('double'))
                   .withColumn('Izmy_v4_nodist_normed_TPY_wspd_ratio',(col('new_alignment_90_high') * col('ps_pm25_tpy_normed') * col('wspd_ratio_ps_sch')).cast('double'))
                   .withColumn('Izmy_v5_all_normed',(col('new_alignment_90_high_normed') * col('ps_pm25_tpy_normed') * col('avg_wspd_normed') * col('school_to_ps_geod_dist_m_normed_0_high')).cast('double'))
                   .withColumn('Izmy_v5_all_normed_but_wspd_ratio',(col('new_alignment_90_high_normed') * col('ps_pm25_tpy_normed') * col('wspd_ratio_ps_sch') * col('school_to_ps_geod_dist_m_normed_0_high')).cast('double'))
                   .withColumn('Izmy_v6_unnormed_no_wspd',(col('new_alignment_90_high') * col('ps_pm25_tpy') / col('school_to_ps_geod_dist_m')).cast('double'))
                   .withColumn('Izmy_v7_all_normed_no_wspd',(col('new_alignment_90_high_normed') * col('ps_pm25_tpy_normed')  * col('school_to_ps_geod_dist_m_normed_0_high')).cast('double'))
                   .withColumn('Izmy_v8_normed_D_and_TPY_no_wspd',(col('new_alignment_90_high') * col('ps_pm25_tpy_normed')  * col('school_to_ps_geod_dist_m_normed_0_high')).cast('double'))
                  )

    # for testing
    # combined_df.limit(5).show()

    ### Final Section: Aggregations and Saving Out ###

    # write out raw (unaggregated) df
    # file_name = os.path.join(out_dir_unagged, parquet_file)
    # combined_df.write.parquet(file_name)

    # start selecting and aggregating down

    cols_to_select = ['CDSCode',
                     'school_zip',
                     'ps_distance_rank',
                     'ps_pm25_tpy_normed',
                     'ps_pm25_tpy',
                     'angle_to_school',
                     'elevation_diff_m',
                     'school_to_ps_geod_dist_m_normed',
                     'school_to_ps_geod_dist_m',
                     'new_alignment_90_high',
                     'new_alignment_90_high_normed',
                     'central_wind_alignment_180_high',
                     'central_wind_alignment_180_high_normed',
                     'school_wspd',
                     'ps_wspd',
                     'ps_within_5km_count',
                     'avg_wspd_normed',
                     'avg_wspd',
                     'wspd_ratio_ps_sch',
                     'wspd_ratio_sch_ps',
                    'Izmy_v1_unnormed',
                    'Izmy_v1_unnormed_wspd_ratio',
                    'Izmy_v2_nodist_unnormed',
                    'Izmy_v2_nodist_unnormed_wspd_ratio',
                    'Izmy_v3_normed_D_and_TPY',
                    'Izmy_v3_normed_D_and_TPY_wspd_ratio',
                    'Izmy_v4_nodist_normed_TPY',
                    'Izmy_v4_nodist_normed_TPY_wspd_ratio',
                    'Izmy_v5_all_normed',
                    'Izmy_v5_all_normed_but_wspd_ratio',
                    'Izmy_v6_unnormed_no_wspd',
                    'Izmy_v7_all_normed_no_wspd',
                    'Izmy_v8_normed_D_and_TPY_no_wspd']

    combined_df = combined_df.select(*cols_to_select)

    ### AGGREGATION 1/3: reduce to m-y-school-ps(-zip) level ###

    group_by_cols = ['CDSCode',
                    'school_zip',
                    'ps_distance_rank',
                    'ps_pm25_tpy_normed',
                    'ps_pm25_tpy',
                    'angle_to_school',
                    'school_to_ps_geod_dist_m_normed',
                    'school_to_ps_geod_dist_m',
                    'elevation_diff_m',
                    'ps_within_5km_count']

    combined_df = combined_df.groupBy(*group_by_cols) \
                    .agg(
                    avg('new_alignment_90_high').alias('new_alignment_90_high'), \
                    avg('new_alignment_90_high_normed').alias('new_alignment_90_high_normed'), \
                    avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                    avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                    avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                    avg('avg_wspd').alias('avg_wspd'), \
                    avg('school_wspd').alias('avg_school_wspd'), \
                    avg('ps_wspd').alias('avg_ps_wspd'), \
                    avg('wspd_ratio_ps_sch').alias('avg_wspd_ratio_ps_sch'), \
                    avg('wspd_ratio_sch_ps').alias('avg_wspd_ratio_sch_ps'), \
                    sum('Izmy_v1_unnormed').alias('Izmy_v1_unnormed'), \
                    sum('Izmy_v1_unnormed_wspd_ratio').alias('Izmy_v1_unnormed_wspd_ratio'), \
                    sum('Izmy_v2_nodist_unnormed').alias('Izmy_v2_nodist_unnormed'), \
                    sum('Izmy_v2_nodist_unnormed_wspd_ratio').alias('Izmy_v2_nodist_unnormed_wspd_ratio'), \
                    sum('Izmy_v3_normed_D_and_TPY').alias('Izmy_v3_normed_D_and_TPY'), \
                    sum('Izmy_v3_normed_D_and_TPY_wspd_ratio').alias('Izmy_v3_normed_D_and_TPY_wspd_ratio'), \
                    sum('Izmy_v4_nodist_normed_TPY').alias('Izmy_v4_nodist_normed_TPY'), \
                    sum('Izmy_v4_nodist_normed_TPY_wspd_ratio').alias('Izmy_v4_nodist_normed_TPY_wspd_ratio'), \
                    sum('Izmy_v5_all_normed').alias('Izmy_v5_all_normed'), \
                    sum('Izmy_v5_all_normed_but_wspd_ratio').alias('Izmy_v5_all_normed_but_wspd_ratio'), \
                    sum('Izmy_v6_unnormed_no_wspd').alias('Izmy_v6_unnormed_no_wspd'), \
                    sum('Izmy_v7_all_normed_no_wspd').alias('Izmy_v7_all_normed_no_wspd'), \
                    sum('Izmy_v8_normed_D_and_TPY_no_wspd').alias('Izmy_v8_normed_D_and_TPY_no_wspd') \
                        )

    ### AGGREGATION 2/3: reduce to m-y-school(-zip) level ###

    combined_df = combined_df.groupBy("CDSCode", "school_zip") \
                    .agg(
                    avg('new_alignment_90_high').alias('new_alignment_90_high'), \
                    avg('new_alignment_90_high_normed').alias('new_alignment_90_high_normed'), \
                    avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                    avg('ps_pm25_tpy_normed').alias('ps_pm25_tpy_normed'), \
                    avg('school_to_ps_geod_dist_m_normed').alias('school_to_ps_geod_dist_m_normed'), \
                    avg('ps_pm25_tpy').alias('ps_pm25_tpy'), \
                    avg('school_to_ps_geod_dist_m').alias('school_to_ps_geod_dist_m'), \
                    avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                    avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                    avg('avg_wspd').alias('avg_wspd'), \
                    avg('ps_within_5km_count').alias('avg_count_ps_within_5km'), \
                    avg('elevation_diff_m').alias('avg_elevation_diff_m'), \
                    avg('avg_wspd_ratio_ps_sch').alias('avg_wspd_ratio_ps_sch'), \
                    avg('avg_wspd_ratio_sch_ps').alias('avg_wspd_ratio_sch_ps'), \
                    avg('avg_school_wspd').alias('avg_school_wspd'), \
                    avg('avg_ps_wspd').alias('avg_ps_wspd'), \
                    sum('Izmy_v1_unnormed').alias('Izmy_v1_unnormed'), \
                    sum('Izmy_v1_unnormed_wspd_ratio').alias('Izmy_v1_unnormed_wspd_ratio'), \
                    sum('Izmy_v2_nodist_unnormed').alias('Izmy_v2_nodist_unnormed'), \
                    sum('Izmy_v2_nodist_unnormed_wspd_ratio').alias('Izmy_v2_nodist_unnormed_wspd_ratio'), \
                    sum('Izmy_v3_normed_D_and_TPY').alias('Izmy_v3_normed_D_and_TPY'), \
                    sum('Izmy_v3_normed_D_and_TPY_wspd_ratio').alias('Izmy_v3_normed_D_and_TPY_wspd_ratio'), \
                    sum('Izmy_v4_nodist_normed_TPY').alias('Izmy_v4_nodist_normed_TPY'), \
                    sum('Izmy_v4_nodist_normed_TPY_wspd_ratio').alias('Izmy_v4_nodist_normed_TPY_wspd_ratio'), \
                    sum('Izmy_v5_all_normed').alias('Izmy_v5_all_normed'), \
                    sum('Izmy_v5_all_normed_but_wspd_ratio').alias('Izmy_v5_all_normed_but_wspd_ratio'), \
                    sum('Izmy_v6_unnormed_no_wspd').alias('Izmy_v6_unnormed_no_wspd'), \
                    sum('Izmy_v7_all_normed_no_wspd').alias('Izmy_v7_all_normed_no_wspd'), \
                    sum('Izmy_v8_normed_D_and_TPY_no_wspd').alias('Izmy_v8_normed_D_and_TPY_no_wspd') \
                        )

    ### AGGREGATION 3/3: reduce to m-y-zip level ###

    combined_df_pd = combined_df.groupBy("school_zip") \
                    .agg(
                    avg('new_alignment_90_high').alias('new_alignment_90_high'), \
                    avg('new_alignment_90_high_normed').alias('new_alignment_90_high_normed'), \
                    avg('central_wind_alignment_180_high').alias('central_wind_alignment_180_high'), \
                    avg('ps_pm25_tpy_normed').alias('ps_pm25_tpy_normed'), \
                    avg('school_to_ps_geod_dist_m_normed').alias('school_to_ps_geod_dist_m_normed'), \
                    avg('ps_pm25_tpy').alias('ps_pm25_tpy_top_20'), \
                    avg('school_to_ps_geod_dist_m').alias('school_to_ps_geod_dist_m_top_20'), \
                    avg('central_wind_alignment_180_high_normed').alias('central_wind_alignment_180_high_normed'), \
                    avg('avg_wspd_normed').alias('avg_wspd_normed'), \
                    avg('avg_wspd').alias('avg_wspd_top_20'), \
                    avg('avg_count_ps_within_5km').alias('avg_count_ps_within_5km'), \
                    avg('avg_wspd_ratio_ps_sch').alias('avg_wspd_ratio_ps_sch'), \
                    avg('avg_wspd_ratio_sch_ps').alias('avg_wspd_ratio_sch_ps'), \
                    avg('avg_school_wspd').alias('avg_school_wspd'), \
                    avg('avg_ps_wspd').alias('avg_ps_wspd'), \
                    avg('avg_elevation_diff_m').alias('avg_elevation_diff_m'), \
                    avg('Izmy_v1_unnormed').alias('Izmy_v1_unnormed'), \
                    avg('Izmy_v1_unnormed_wspd_ratio').alias('Izmy_v1_unnormed_wspd_ratio'), \
                    avg('Izmy_v2_nodist_unnormed').alias('Izmy_v2_nodist_unnormed'), \
                    avg('Izmy_v2_nodist_unnormed_wspd_ratio').alias('Izmy_v2_nodist_unnormed_wspd_ratio'), \
                    avg('Izmy_v3_normed_D_and_TPY').alias('Izmy_v3_normed_D_and_TPY'), \
                    avg('Izmy_v3_normed_D_and_TPY_wspd_ratio').alias('Izmy_v3_normed_D_and_TPY_wspd_ratio'), \
                    avg('Izmy_v4_nodist_normed_TPY').alias('Izmy_v4_nodist_normed_TPY'), \
                    avg('Izmy_v4_nodist_normed_TPY_wspd_ratio').alias('Izmy_v4_nodist_normed_TPY_wspd_ratio'), \
                    avg('Izmy_v5_all_normed').alias('Izmy_v5_all_normed'), \
                    avg('Izmy_v5_all_normed_but_wspd_ratio').alias('Izmy_v5_all_normed_but_wspd_ratio'), \
                    avg('Izmy_v6_unnormed_no_wspd').alias('Izmy_v6_unnormed_no_wspd'), \
                    avg('Izmy_v7_all_normed_no_wspd').alias('Izmy_v7_all_normed_no_wspd'), \
                    avg('Izmy_v8_normed_D_and_TPY_no_wspd').alias('Izmy_v8_normed_D_and_TPY_no_wspd'), \
                    count('CDSCode').alias('schools')).toPandas()

    combined_df_pd['y-m'] = parquet_file

    # display(combined_df)

    zmy_agg_list.append(combined_df_pd)
    
    wind_temp_df.unpersist()
    
school_lookup.unpersist()
ps_year_lookup.unpersist()
school_to_ps_lookup.unpersist()
ps_lookup.unpersist()
school_filter.unpersist()

df = pd.concat(zmy_agg_list)

Now processing 2001-01
Now processing 2001-02
Now processing 2001-03
Now processing 2001-04
Now processing 2001-05
Now processing 2001-06
Now processing 2001-07
Now processing 2001-08
Now processing 2001-09
Now processing 2001-10
Now processing 2001-11
Now processing 2001-12
Now processing 2002-01
Now processing 2002-02
Now processing 2002-03
Now processing 2002-04
Now processing 2002-05
Now processing 2002-06
Now processing 2002-07
Now processing 2002-08
Now processing 2002-09
Now processing 2002-10
Now processing 2002-11
Now processing 2002-12
Now processing 2003-01
Now processing 2003-02
Now processing 2003-03
Now processing 2003-04
Now processing 2003-05
Now processing 2003-06
Now processing 2003-07
Now processing 2003-08
Now processing 2003-09
Now processing 2003-10
Now processing 2003-11
Now processing 2003-12
Now processing 2004-01
Now processing 2004-02
Now processing 2004-03
Now processing 2004-04
Now processing 2004-05
Now processing 2004-06
Now processing 2004-07
Now process

In [13]:
# code was failing near final stage on a subsequent run--hijacking the process so I can grab/fix outputs

early_stopping = 1

# read in files
school_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/wind_grid_to_school_lookup_filtered.csv'))
ps_year_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/year_lookup.csv'))
school_to_ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'schools/school_year_to_point_lookup_top50_4tpy_filtered.csv'))
ps_lookup = spark.read.option("header",True).csv(os.path.join(file_path, 'lookup tables/wind_grid_to_ps_point_lookup_top10_filtered.csv'))
school_filter = spark.read.option("header",True).csv(os.path.join(file_path,'lookup tables/school_ym_filter.csv'))
nearby_ps = spark.read.option("header",True).csv(os.path.join(file_path,'schools/pointsources_within_5km_by_school_by_year.csv'))

nearby_ps = nearby_ps.withColumnRenamed('point_source_year','ps_year').drop('_c0')

school_to_ps_lookup = (school_to_ps_lookup
                      .withColumnRenamed("point_source_index","ps_index")
                      .withColumnRenamed("point_source_lat","ps_lat")
                      .withColumnRenamed("point_source_lon","ps_lon")
                      .withColumnRenamed("point_source_year","ps_year")
                      .withColumnRenamed("point_source_pm25_tpy","ps_pm25_tpy")
                      .withColumnRenamed("point_source_zip","ps_zip")
                      .withColumnRenamed("geod_dist_m","school_to_ps_geod_dist_m")
                      .drop('_c0')
                      )

school_lookup.cache()
ps_year_lookup.cache()
school_to_ps_lookup.cache()
ps_lookup.cache()
school_filter.cache()

# Calculate and save out statistics for first normalizing (standard scaler)
cols_to_drop = ['ps_index','CDSCode', 'ps_year', 'ps_zip', 'angle_to_school', 'ps_distance_rank']
ps_agg = school_to_ps_lookup.drop(*cols_to_drop).distinct()

ps_stats = ps_agg.select(avg('ps_pm25_tpy'), avg('school_to_ps_geod_dist_m'), 
                         stddev_samp('ps_pm25_tpy'), stddev_samp('school_to_ps_geod_dist_m')).collect()

ps_stats = ps_stats[0]

ps_TPY_mean = ps_stats[0]
ps_dist_mean = ps_stats[1]
ps_TPY_sd = ps_stats[2]
ps_dist_sd = ps_stats[3]

# Calculate and save statistics for second normalizing (min-max)
ps_agg = ps_agg.withColumn('TPY_norm', (col('ps_pm25_tpy') - ps_TPY_mean)/ps_TPY_sd).withColumn('dist_norm', (col('school_to_ps_geod_dist_m') - ps_dist_mean)/ps_dist_sd)
ps_stats_mm = ps_agg.select(min('TPY_norm'), min('dist_norm'), max('TPY_norm'), max('dist_norm')).collect()
ps_stats_mm = ps_stats_mm[0]

ps_TPY_min = ps_stats_mm[0]
ps_dist_min = ps_stats_mm[1]
ps_TPY_max = ps_stats_mm[2]
ps_dist_max = ps_stats_mm[3]

# lists to contain pandas dataframes
zmy_agg_list = []
df_avgs_list = []

counter = 0

# loop through files
for parquet_file in os.listdir(in_dir):
    
    if (early_stopping == 0 or counter < early_stopping):

        # for holding augmented df at the zip code level
        temp_zmy_df = pd.DataFrame()

        # for holding augmented df at the school level
        temp_school_my_df = pd.DataFrame()

        # read in one month
        temp_meas_df = spark.read.parquet(os.path.join(in_dir, parquet_file))

        temp_meas_df = (temp_meas_df
                        .withColumn('wdir_wrt_0N',(180*F.atan2(col('u'), col('v'))
                                                   /(3.141592653589793238462)).cast('double')
                                   )
                        )

        # drop lat/lon, wdir, and y-m and store temp df to re-join for ps wind readings (dropped can be recovered if needed)
        wind_temp_df = temp_meas_df.drop('wind_lat','wind_lon','wdir','y-m')
        wind_temp_df.cache()

        # rename for explicitness of measurements
        temp_meas_df = (temp_meas_df
                        .withColumnRenamed('wind_lat','school_wind_lat')
                        .withColumnRenamed('wind_lon','school_wind_lon')
                        .withColumnRenamed('u','school_u')
                        .withColumnRenamed('v','school_v')
                        .withColumnRenamed('grid_index','school_grid_index')
                       ).drop('wdir') # wdir is wrt 0° E and is confusing

        combined_df = temp_meas_df.join(school_lookup, ['school_grid_index'], how='inner')

        # filter out schools which were closed, etc--as discovered elsewhere in pipeline
        school_filter = school_filter.drop('_c0').withColumnRenamed('cdscode','CDSCode')

        # for testing
        # combined_df.limit(5).show()

        # drop join column y-m; it is not needed
        combined_df = combined_df.join(school_filter, ['CDSCode','y-m'], how='inner').drop('y-m')

        # for testing
        # combined_df.limit(5).show()

        # compute zip code averages for wdir, wspd
        zip_avgs = (combined_df.groupBy('school_zip')
                    .avg('wspd','wdir_wrt_0N', 'school_u','school_v')
                    .withColumnRenamed("school_zip","zip_code")
                    .withColumnRenamed("avg(wspd)","avg_wspd_at_school")
                    .withColumnRenamed("avg(wdir_wrt_0N)","avg_wdir_0N")
                    .withColumnRenamed("avg(school_u)","avg_u")
                    .withColumnRenamed("avg(school_v)","avg_v")
                    .toPandas()
                   )

        zip_avgs['y-m'] = parquet_file

        df_avgs_list.append(zip_avgs)

        combined_df = (combined_df
                       .withColumnRenamed('wspd','school_wspd')
                       .withColumnRenamed('wdir_wrt_0N','school_wdir_0N') 
                      )
        # for testing
        # combined_df.limit(5).show()

        ### Middle Section: joins ###

        # need to lookup by CDSCode and year, so substring for year
        combined_df = combined_df.withColumn("year", substring(col('Datetime'),1,4))

        # join in ps <-> year lookup
        combined_df = combined_df.join(ps_year_lookup, ['year'], how='left').drop('year')

        # use lookup year to join in PSs to each school
        combined_df = combined_df.join(school_to_ps_lookup, ['CDSCode','ps_year'], how='left')

        # join in count of nearby PSs by year and CDSCode
        combined_df = combined_df.join(nearby_ps, ['CDSCode','ps_year'], how='left')
        
        combined_df = combined_df.na.fill(value=0,subset=['ps_within_5km_count'])

        # join lookup table to get nearest wind grid index for each PS
        combined_df = combined_df.join(ps_lookup, ['ps_index'], how='left')

        # rename for clarity
        combined_df = combined_df.withColumnRenamed('geod_dist_m', 'school_to_ps_geod_dist_m')

        # rename of ease/cleanliness of join           
        combined_df = combined_df.withColumnRenamed("ps_grid_index","grid_index")

        # filter for top n only if desired
        # combined_df = combined_df.filter(col('ps_distance_rank') == '1')

        # join wind measurements at PS that we had saved off
        combined_df = combined_df.join(wind_temp_df, ['grid_index',"Datetime"], how='left')

        # rename for clarity
        combined_df = (combined_df
                        .withColumnRenamed('u','ps_u')
                        .withColumnRenamed('v','ps_v')
                        .withColumnRenamed('wspd','ps_wspd')
                        .withColumnRenamed('wdir_wrt_0N','ps_wdir_0N')
                       )

        combined_df = combined_df.withColumn("angle_to_school",col("angle_to_school").cast("double"))


        # for testing
        # combined_df.limit(5).show()

        ### Second-to-last Section: Computations ###

        # function to compute better difference between alignments, factoring the zero-crossing
        # first angle is wind angle, second angle is heading to school (both wrt 0N)

        combined_df = (combined_df \
                      .withColumn("school_angle_diff",(col('angle_to_school') - col('school_wdir_0N')).cast('double'))
                      .withColumn("ps_angle_diff",(col('angle_to_school') - col('ps_wdir_0N')).cast('double'))
                      )

        combined_df = (combined_df \
                      .withColumn("school_wind_alignment",
                                 when(col("school_angle_diff") < -180, col("school_angle_diff") + 360) \
                                .when(col("school_angle_diff") > 180, col("school_angle_diff") - 360) \
                                .otherwise(col("school_angle_diff")).cast('double')) \
                      .withColumn("ps_wind_alignment",
                                 when(col("ps_angle_diff") < -180, col("ps_angle_diff") + 360) \
                                .when(col("ps_angle_diff") > 180, col("ps_angle_diff") - 360) \
                                .otherwise(col("ps_angle_diff")).cast('double'))
                      ).drop("school_angle_diff").drop("ps_angle_diff")


        combined_df = combined_df.withColumn("central_wind_alignment_180_high", 
                                             (180 - abs(((col('school_wind_alignment') + col('ps_wind_alignment'))/2))).cast('double'))

        # compute normed TPY and dist using scalars from above
        combined_df = (combined_df \
                       .withColumn('ps_pm25_tpy_normed', 
                                   ((((col('ps_pm25_tpy') - ps_TPY_mean) / ps_TPY_sd) - ps_TPY_min) / (ps_TPY_max - ps_TPY_min)).cast('double'))
                       .withColumn('school_to_ps_geod_dist_m_normed',((((col('school_to_ps_geod_dist_m') - ps_dist_mean) / ps_dist_sd) - ps_dist_min) / (ps_dist_max - ps_dist_min)).cast('double'))
                      )

        # compute normed wspd and wind alignment for Instrument v5

        combined_df = (combined_df \
                       .withColumn('avg_wspd',((col('school_wspd') + col('ps_wspd'))/2).cast('double'))
                      )

        combined_df = (combined_df \
                       .withColumn('central_wind_alignment_180_high_normed',
                                   (col('central_wind_alignment_180_high')/180).cast('double'))
                       .withColumn('avg_wspd_normed',
                                   ((col('avg_wspd') - min_wspd) / (max_wspd - min_wspd)).cast('double'))
                      )

        combined_df = (combined_df \
                       .withColumn('new_alignment_90_high',
                                 when((col('central_wind_alignment_180_high') - 90) < 0, 0) \
                                .otherwise(col('central_wind_alignment_180_high') - 90).cast('double')) \
                      )

        combined_df = (combined_df \
                       .withColumn('new_alignment_90_high_normed',
                                   (col('new_alignment_90_high')/90).cast('double'))
                      )

        # compute elevation differential
        combined_df = (combined_df
                       .withColumn('school_elevation_m',col('school_elevation_m').cast('double'))
                       .withColumn('ps_m',col('ps_elevation_m').cast('double'))
                      )

        combined_df = (combined_df
                   .withColumn('elevation_diff_m',(col('ps_elevation_m') - col('school_elevation_m')).cast('double'))
                      )

        combined_df = (combined_df
               .withColumn('wspd_ratio',(col('ps_wspd') / (col('school_wspd')+0.0001)).cast('double'))
                  )
        
        # fixed dividing by normed distance! --should have been multiplying where normed

        combined_df = (combined_df
                       .withColumn('Izmy_v1_unnormed',(col('new_alignment_90_high') * col('ps_pm25_tpy') * (col('avg_wspd') / col('school_to_ps_geod_dist_m'))).cast('double'))
                       .withColumn('Izmy_v2_nodist_unnormed',(col('new_alignment_90_high') * col('ps_pm25_tpy') * col('avg_wspd')).cast('double'))
                       .withColumn('Izmy_v3_normed_D_and_TPY',(col('new_alignment_90_high') * col('ps_pm25_tpy_normed') * (col('avg_wspd') * col('school_to_ps_geod_dist_m_normed'))).cast('double'))
                       .withColumn('Izmy_v4_nodist_normed_TPY',(col('new_alignment_90_high') * col('ps_pm25_tpy_normed') * col('avg_wspd')).cast('double'))
                       .withColumn('Izmy_v5_all_normed',(col('new_alignment_90_high_normed') * col('ps_pm25_tpy_normed') * (col('avg_wspd_normed') * col('school_to_ps_geod_dist_m_normed'))).cast('double'))
                       .withColumn('Izmy_v6_unnormed_no_wspd',(col('new_alignment_90_high') * col('ps_pm25_tpy') / col('school_to_ps_geod_dist_m')).cast('double'))
                       .withColumn('Izmy_v7_all_normed_no_wspd',(col('new_alignment_90_high_normed') * col('ps_pm25_tpy_normed')  * col('school_to_ps_geod_dist_m_normed')).cast('double'))
                       .withColumn('Izmy_v8_normed_D_and_TPY_no_wspd',(col('new_alignment_90_high') * col('ps_pm25_tpy_normed')  * col('school_to_ps_geod_dist_m_normed')).cast('double'))
                      )

        counter += 1


In [14]:
combined_df.limit(15).show()

+----------+-------------------+--------+--------------+-------+-----------------+---------------+---------------+---------+---------+-----------+-------------------+----------+----------+-----------+--------------------------+------------------+---------+-----------+-----------+------+--------------+------------------------+-------------------+----------------+-------------------+----------------------+---------+---------+--------+-------------------+---------------------+-------------------+-------------------------------+--------------------+-------------------------------+------------------+--------------------------------------+-------------------+---------------------+----------------------------+-------+-------------------+------------------+--------------------+-----------------------+------------------------+-------------------------+--------------------+------------------------+--------------------------+--------------------------------+
|grid_index|           Datetime|ps_ind

In [10]:
combined_df.printSchema()

root
 |-- grid_index: string (nullable = true)
 |-- Datetime: string (nullable = true)
 |-- ps_index: string (nullable = true)
 |-- CDSCode: string (nullable = true)
 |-- ps_year: string (nullable = true)
 |-- school_grid_index: string (nullable = true)
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_u: double (nullable = true)
 |-- school_v: double (nullable = true)
 |-- school_wspd: double (nullable = true)
 |-- school_wdir_0N: double (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- wind_to_school_geod_dist_m: string (nullable = true)
 |-- school_elevation_m: double (nullable = true)
 |-- ps_lat: string (nullable = true)
 |-- ps_lon: string (nullable = true)
 |-- ps_pm25_tpy: string (nullable = true)
 |-- ps_zip: string (nullable = true)
 |-- ps_elevation_m: string (nullable = true)
 |-- school_to_ps_geod_dist_m: string (nullabl

In [8]:
# df_Izmy = pd.concat(zmy_agg_list)

In [12]:
df_Izmy = df

In [11]:
df.to_parquet(os.path.join(file_path,'combined_instrument_downstream_top20_20tpy_df_w_wspd_ratios_insts.parquet'))

In [10]:
display(df_Izmy)

Unnamed: 0,school_zip,new_alignment_90_high,new_alignment_90_high_normed,central_wind_alignment_180_high,ps_pm25_tpy_normed,school_to_ps_geod_dist_m_normed,ps_pm25_tpy_top_20,school_to_ps_geod_dist_m_top_20,central_wind_alignment_180_high_normed,avg_wspd_normed,avg_wspd_top_20,avg_school_wspd,avg_ps_wspd,Izmy_v1_unnormed,Izmy_v2_nodist_unnormed,Izmy_v3_normed_D_and_TPY,Izmy_v4_nodist_normed_TPY,Izmy_v5_all_normed,Izmy_v6_unnormed_no_wspd,Izmy_v7_all_normed_no_wspd,Izmy_v8_normed_D_and_TPY_no_wspd,schools,y-m
0,90022,31.555728,0.350619,107.794277,0.003409,0.032201,15.664026,10254.754145,0.598857,0.105485,2.046318,1.713602,2.379033,1623.321299,1.422962e+07,108813.360018,3036.224721,62.323311,827.754800,635.549579,7.435930e+05,13,2001-01
1,93545,37.525149,0.416946,118.846912,0.005257,0.314716,21.442546,99604.191293,0.660261,0.086891,1.685689,1.316469,2.054909,171.567023,1.706395e+07,12139.586301,3953.301199,6.952765,103.214754,81.857330,2.946864e+04,4,2001-01
2,95519,35.450393,0.393893,115.975623,0.011514,0.248719,41.014320,78731.735596,0.644309,0.146493,2.841672,3.806602,1.876741,2136.042726,5.936034e+07,189400.961621,16595.361851,108.487773,761.888016,752.313711,6.770823e+05,10,2001-01
3,91910,34.507378,0.383415,114.353506,0.015131,0.055402,52.325204,17592.384919,0.635297,0.118471,2.298192,2.616727,1.979657,5252.018850,7.027467e+07,481584.859441,20596.415888,275.840670,2259.623314,2302.928672,3.523481e+06,17,2001-01
4,94610,31.843878,0.353821,108.433993,0.004760,0.041139,19.887662,13081.565380,0.602411,0.107693,2.089134,2.313215,1.865053,2615.516468,2.208690e+07,210991.368189,5445.541670,120.851426,1131.085909,987.927189,4.445672e+05,5,2001-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,95697,29.927048,0.332523,107.425364,0.002426,0.109704,12.589081,34766.470677,0.596808,0.100557,1.950745,2.147770,1.753720,385.977314,1.120339e+07,23626.555973,2358.537286,13.531787,236.167373,157.741161,1.419670e+04,1,2017-12
1365,92285,36.012848,0.400143,120.102927,0.028071,0.205932,92.799885,65199.656508,0.667238,0.101123,1.961726,2.118434,1.805017,1638.936623,8.733392e+07,157714.784586,26310.631374,90.331737,864.735286,924.505863,8.320553e+04,1,2017-12
1366,95595,41.204546,0.457828,123.739695,0.005790,0.376809,23.110280,119242.212709,0.687443,0.079118,1.534931,1.268455,1.801406,216.372818,2.370043e+07,17673.258967,6120.621437,10.121995,137.506735,123.714689,1.113432e+04,1,2017-12
1367,90068,33.459500,0.371772,112.542243,0.008319,0.056552,31.018448,17956.313744,0.625235,0.099616,1.932488,2.076431,1.788544,1476.648797,2.651554e+07,112334.653767,6983.529656,64.339941,782.381875,665.742797,5.991685e+04,1,2017-12


In [548]:
pd.set_option('display.max_rows', 500)

In [15]:
df_Izmy.dtypes

school_zip                                  int32
central_wind_alignment_180_high           float64
ps_pm25_tpy_normed                        float64
school_to_ps_geod_dist_m_normed           float64
ps_pm25_tpy_top_1                         float64
school_to_ps_geod_dist_m_top_1            float64
central_wind_alignment_180_high_normed    float64
avg_wspd_normed                           float64
avg_wspd_top_1                            float64
Izmy_v1_unnormed                          float64
Izmy_v2_nodist_unnormed                   float64
Izmy_v3_normed_D_and_TPY                  float64
Izmy_v4_nodist_normed_TPY                 float64
Izmy_v5_all_normed                        float64
schools                                     int64
dtype: object

In [13]:
df_Izmy['school_zip'] = df_Izmy['school_zip'].astype(int)
df_Izmy['y-m'] = df_Izmy['y-m'].astype(str)

In [119]:
df_Izmy.dtypes

school_zip                                  int32
central_wind_alignment_180_high           float64
ps_pm25_tpy_normed                        float64
school_to_ps_geod_dist_m_normed           float64
ps_pm25_tpy_top_10                        float64
school_to_ps_geod_dist_m_top_10           float64
central_wind_alignment_180_high_normed    float64
avg_wspd_normed                           float64
avg_wspd_top_10                           float64
Izmy_v1_unnormed                          float64
Izmy_v2_nodist_unnormed                   float64
Izmy_v3_normed_D_and_TPY                  float64
Izmy_v4_nodist_normed_TPY                 float64
Izmy_v5_all_normed                        float64
schools                                     int64
avg_wspd_at_school                        float64
avg_wdir_0N                               float64
avg_u                                     float64
avg_v                                     float64
y-m                                        object


In [10]:
display(df_Izmy)

Unnamed: 0,school_zip,new_alignment_90_high,new_alignment_90_high_normed,central_wind_alignment_180_high,ps_pm25_tpy_normed,school_to_ps_geod_dist_m_normed,ps_pm25_tpy_top_20,school_to_ps_geod_dist_m_top_20,central_wind_alignment_180_high_normed,avg_wspd_normed,avg_wspd_top_20,avg_count_ps_within_5km,avg_wspd_ratio,avg_school_wspd,avg_ps_wspd,avg_elevation_diff_m,Izmy_v1_unnormed,Izmy_v2_nodist_unnormed,Izmy_v3_normed_D_and_TPY,Izmy_v4_nodist_normed_TPY,Izmy_v5_all_normed,Izmy_v6_unnormed_no_wspd,Izmy_v7_all_normed_no_wspd,Izmy_v8_normed_D_and_TPY_no_wspd,schools,y-m
0,93545,32.613496,0.362372,111.048334,0.007027,0.232677,25.985214,90118.090809,0.616935,0.085561,1.659885,0.000000,1.911711,1.316469,2.003301,-353.550833,165.260767,1.669299e+07,16945.353167,4562.880891,9.705375,94.325694,105.873451,3.811444e+04,4,2001-01
1,95519,33.763650,0.375152,113.131646,0.010314,0.109944,36.269646,42611.504752,0.628509,0.152098,2.950393,0.000000,0.798461,3.806602,2.094184,8.196333,2880.291082,4.084129e+07,324317.652520,11573.895578,185.770993,892.735948,1114.289149,1.002860e+06,10,2001-01
2,90022,35.540665,0.394896,109.550810,0.002959,0.018401,13.258420,7177.966567,0.608616,0.093058,1.805294,2.153846,1.351027,1.713602,1.896986,-11.899179,1413.797682,9.286519e+06,120417.664560,2042.288206,68.968810,784.674782,746.132186,8.729747e+05,13,2001-01
3,91910,33.444394,0.371604,110.711031,0.008740,0.022451,31.344958,8745.643194,0.615061,0.116038,2.251000,4.529412,1.049285,2.616727,1.885274,-43.098157,5060.511115,2.645242e+07,568808.023027,7383.373979,325.799603,2199.126789,2742.351921,4.195798e+06,17,2001-01
4,92374,24.047210,0.267191,96.742762,0.008483,0.063119,30.541474,24486.871738,0.537460,0.087992,1.707041,0.000000,1.545476,1.613267,1.800816,-58.783333,681.016103,1.445471e+07,73421.210004,4077.387241,42.051395,396.057034,471.475096,5.516259e+05,13,2001-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,95697,31.142381,0.346026,105.368459,0.001061,0.069605,7.319676,26997.439327,0.585380,0.103729,2.012268,0.000000,1.552581,2.147770,1.876766,-8.594000,224.971040,3.752889e+06,12715.956438,447.358875,7.282571,148.404945,99.825117,8.984261e+03,1,2017-12
1365,92285,37.656980,0.418411,118.790029,0.033213,0.139378,107.915850,54004.836512,0.659945,0.103034,1.998784,0.000000,1.015022,2.118434,1.879134,-318.614667,1509.331205,7.917963e+07,179605.587259,24305.045068,102.869175,817.269674,1080.955432,9.728599e+04,1,2017-12
1366,95595,41.663419,0.462927,125.400924,0.004749,0.244302,18.858737,94617.718407,0.696672,0.083192,1.613947,0.000000,2.025676,1.268455,1.959440,-618.055333,161.297518,1.511579e+07,16049.747411,3877.795456,9.192258,98.148951,107.873155,9.708584e+03,1,2017-12
1367,90068,39.319405,0.436882,121.048697,0.001730,0.029196,9.413404,11356.202546,0.672493,0.103652,2.010769,0.000000,1.483027,2.076431,1.945107,-80.743333,901.890816,8.165907e+06,62542.737132,1444.931650,35.821987,442.026877,347.965474,3.131689e+04,1,2017-12


### Lastly, load current dataset and join columns to it

In [14]:
modeling_data = pd.read_csv(os.path.join(gdrive_path, 'modeling_data\\modeling_data_2022-10-18.csv'))

In [15]:
modeling_data['y-m'] = modeling_data.apply(lambda x: str(x["year"]) + '-' + str(x['month']).zfill(2), axis=1)

In [15]:
modeling_data.dtypes

year_month             object
school_zip              int64
school_county_v2       object
school_region_name     object
pm25                  float64
                       ...   
month_09                int64
month_10                int64
month_11                int64
month_12                int64
y-m                    object
Length: 127, dtype: object

In [15]:
display(modeling_data)

Unnamed: 0,year_month,school_zip,school_county_v2,school_region_name,pm25,school_elevation_m,ps_elevation_m,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,pop_under19_male,pop_under19_female,total_population,total_population_male,total_population_female,point_source_pm25_tpy,dist_school_to_ps_m,angle_to_school,ps_wspd_merge,school_wdir_wrt_0n,ps_wdir_wrt_0n,school_wind_alignment,ps_wind_alignment,avg_wind_speed,avg_wind_alignment,avg_wind_alignment_cosine,nearby_point_source_count,school_wspd,ca_agi_per_returns,total_tax_liability,tax_liability_per_capita,school_temperature,ps_temperature,school_count,pm25_last_month,pm25_r6,pm25_r9,pm25_r12,pm25_r24,pm25_slope6,pm25_slope9,pm25_slope12,pm25_slope24,pm25_lag_12mo,year,month,school_county_v2_alameda,school_county_v2_alpine,school_county_v2_amador,school_county_v2_butte,school_county_v2_calaveras,school_county_v2_colusa,school_county_v2_contra_costa,school_county_v2_del_norte,school_county_v2_el_dorado,school_county_v2_fresno,school_county_v2_glenn,school_county_v2_humboldt,school_county_v2_imperial,school_county_v2_inyo,school_county_v2_kern,school_county_v2_kings,school_county_v2_lake,school_county_v2_lassen,school_county_v2_los_angeles,school_county_v2_madera,school_county_v2_marin,school_county_v2_mariposa,school_county_v2_mendocino,school_county_v2_merced,school_county_v2_modoc,school_county_v2_mono,school_county_v2_monterey,school_county_v2_napa,school_county_v2_nevada,school_county_v2_orange,school_county_v2_placer,school_county_v2_plumas,school_county_v2_riverside,school_county_v2_sacramento,school_county_v2_san_benito,school_county_v2_san_bernardino,school_county_v2_san_diego,school_county_v2_san_francisco,school_county_v2_san_joaquin,school_county_v2_san_luis_obispo,school_county_v2_san_mateo,school_county_v2_santa_barbara,school_county_v2_santa_clara,school_county_v2_santa_cruz,school_county_v2_shasta,school_county_v2_sierra,school_county_v2_siskiyou,school_county_v2_solano,school_county_v2_sonoma,school_county_v2_stanislaus,school_county_v2_sutter,school_county_v2_tehama,school_county_v2_trinity,school_county_v2_tulare,school_county_v2_tuolumne,school_county_v2_ventura,school_county_v2_yolo,school_county_v2_yuba,month_01,month_02,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,y-m
0,2000-01-01,90001,Los Angeles,Los Angeles County,32.149998,44.728889,43.703333,6196.0,3209.0,2987.0,6672.0,3397.0,3275.0,5562.0,2850.0,2712.0,5075.0,2599.0,2476.0,23505.0,12055.0,11450.0,54481.0,27320.0,27161.0,14.241154,3854.812685,-90.196586,0.757031,-172.758321,-172.758321,82.561735,82.561735,0.757031,82.561735,1.124995,0.000000,0.757031,20049.704556,2.608176e+06,47.873130,14.277778,14.266667,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01
1,2000-01-01,90002,Los Angeles,Los Angeles County,31.849998,33.858889,29.690000,4795.0,2447.0,2348.0,5655.0,2870.0,2785.0,5077.0,2566.0,2511.0,4316.0,2179.0,2137.0,19843.0,10062.0,9781.0,44584.0,21553.0,23031.0,6.649500,2734.278190,-75.730039,0.757031,-172.758321,-172.758321,97.028283,97.028283,0.757031,97.028283,0.879819,0.222222,0.757031,19697.001063,1.549733e+06,34.759847,14.300000,14.400000,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01
2,2000-01-01,90003,Los Angeles,Los Angeles County,31.724998,42.329167,45.785000,6527.0,3297.0,3230.0,6894.0,3539.0,3355.0,5994.0,3000.0,2994.0,5381.0,2727.0,2654.0,24796.0,12563.0,12233.0,58187.0,28557.0,29630.0,11.672797,5281.522927,-63.495073,0.855611,-172.758321,-60.057878,106.478652,100.850655,0.806321,103.664653,0.869845,0.000000,0.757031,18895.491452,2.255523e+06,38.763349,14.300000,14.383333,12,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01
3,2000-01-01,90004,Los Angeles,Los Angeles County,29.560000,83.190000,102.906667,5621.0,2879.0,2742.0,5505.0,2816.0,2689.0,4470.0,2297.0,2173.0,4204.0,2199.0,2005.0,19800.0,10191.0,9609.0,67850.0,34200.0,33650.0,5.158316,6452.615217,-87.100171,0.757031,-172.758321,-172.758321,78.897449,78.897449,0.757031,78.897449,1.160977,0.000000,0.757031,41990.032278,4.404657e+07,649.175696,14.022222,13.911111,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01
4,2000-01-01,90006,Los Angeles,Los Angeles County,30.750000,65.920000,77.570000,5939.0,3050.0,2889.0,6009.0,2995.0,3014.0,4493.0,2337.0,2156.0,4416.0,2370.0,2046.0,20857.0,10752.0,10105.0,62765.0,31901.0,30864.0,3.553121,5187.184865,-78.360123,0.757031,-172.758321,-172.758321,94.398199,94.398199,0.757031,94.398199,0.923498,0.000000,0.757031,17629.297337,3.132557e+06,49.909297,14.140000,14.100000,5,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311158,2018-12-01,96145,Placer,Superior California,4.462162,1991.700000,1797.860000,109.0,63.0,46.0,79.0,63.0,16.0,114.0,36.0,78.0,48.0,17.0,31.0,350.0,179.0,171.0,2151.0,1130.0,1021.0,1.693826,14639.964232,55.979727,0.243283,13.414409,-61.071359,162.565317,122.948914,0.340150,142.757116,0.203923,0.000000,0.437017,84420.345595,1.108245e+07,5152.233380,14.183790,16.993708,3,1.110811,3.909009,3.562763,2.959234,3.129392,-0.949344,-0.090811,0.196834,-0.000854,0.832432,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12
311159,2018-12-01,96146,Placer,Superior California,4.626316,1895.970000,1797.860000,36.0,21.0,15.0,59.0,21.0,38.0,32.0,9.0,23.0,20.0,20.0,0.0,147.0,71.0,76.0,969.0,562.0,407.0,1.693826,15987.398121,-147.155701,0.243283,13.414409,-61.071359,160.570111,86.084342,0.340150,123.327226,0.450580,0.000000,0.437017,113113.030726,6.148855e+06,6345.567595,16.993708,16.993708,1,0.394737,3.367544,3.196491,2.601316,2.967325,-0.995338,-0.199912,0.151178,-0.044263,0.321053,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12
311160,2018-12-01,96150,El Dorado,Superior California,4.286667,1922.062222,1907.910000,1613.0,792.0,821.0,1267.0,634.0,633.0,1361.0,716.0,645.0,1499.0,796.0,703.0,5740.0,2938.0,2802.0,29357.0,14913.0,14444.0,1.331067,3837.935999,-18.310985,0.231856,-17.369322,-17.369322,47.669467,47.669467,0.231856,47.669467,1.594435,0.000000,0.231856,55949.806463,1.723578e+07,587.109752,8.575065,8.663953,9,1.046667,3.795556,3.505432,2.869074,2.952315,-0.875175,-0.087296,0.206519,0.021245,0.815556,2018,12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12
311161,2018-12-01,96161,Nevada,Superior California,5.053509,1810.586364,1793.499091,893.0,540.0,353.0,1280.0,736.0,544.0,1219.0,590.0,629.0,1023.0,565.0,458.0,4415.0,2431.0,1984.0,18333.0,9461.0,8872.0,1.660815,7735.266680,-49.871057,0.243283,-61.071359,-61.071359,40.066449,40.066449,0.243283,40.066449,1.677660,0.000000,0.243283,111605.903924,4.081391e+07,2226.253696,15.689734,16.993708,11,1.539474,4.732456,4.320468,3.938962,3.968458,-1.020802,-0.099430,0.119596,0.000669,1.917544,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12


In [16]:
df_join = df_Izmy[['school_zip','central_wind_alignment_180_high','avg_count_ps_within_5km','avg_elevation_diff_m','avg_wspd_ratio_ps_sch','avg_wspd_ratio_sch_ps','avg_school_wspd','avg_ps_wspd','new_alignment_90_high','ps_pm25_tpy_top_20','school_to_ps_geod_dist_m_top_20','avg_wspd_top_20','y-m','Izmy_v1_unnormed','Izmy_v1_unnormed_wspd_ratio','Izmy_v2_nodist_unnormed','Izmy_v2_nodist_unnormed_wspd_ratio','Izmy_v3_normed_D_and_TPY','Izmy_v3_normed_D_and_TPY_wspd_ratio','Izmy_v4_nodist_normed_TPY','Izmy_v4_nodist_normed_TPY_wspd_ratio','Izmy_v5_all_normed','Izmy_v5_all_normed_but_wspd_ratio','Izmy_v6_unnormed_no_wspd','Izmy_v7_all_normed_no_wspd','Izmy_v8_normed_D_and_TPY_no_wspd']]

display(df_join)

Unnamed: 0,school_zip,central_wind_alignment_180_high,avg_count_ps_within_5km,avg_elevation_diff_m,avg_wspd_ratio_ps_sch,avg_wspd_ratio_sch_ps,avg_school_wspd,avg_ps_wspd,new_alignment_90_high,ps_pm25_tpy_top_20,school_to_ps_geod_dist_m_top_20,avg_wspd_top_20,y-m,Izmy_v1_unnormed,Izmy_v1_unnormed_wspd_ratio,Izmy_v2_nodist_unnormed,Izmy_v2_nodist_unnormed_wspd_ratio,Izmy_v3_normed_D_and_TPY,Izmy_v3_normed_D_and_TPY_wspd_ratio,Izmy_v4_nodist_normed_TPY,Izmy_v4_nodist_normed_TPY_wspd_ratio,Izmy_v5_all_normed,Izmy_v5_all_normed_but_wspd_ratio,Izmy_v6_unnormed_no_wspd,Izmy_v7_all_normed_no_wspd,Izmy_v8_normed_D_and_TPY_no_wspd
0,90022,99.261179,2.153846,-15.594769,1.427885,1.240527,1.713602,1.977198,29.700061,58.059677,24769.373905,1.845400,2001-01,1990.065706,1529.301384,4.751878e+07,3.746688e+07,9534.963805,7604.962199,10095.725538,8057.841160,5.461168,84.499580,1101.984065,56.769848,5109.286329
1,95519,118.919085,0.000000,161.632000,0.701114,3.082648,3.806602,1.877015,37.787228,54.750516,89952.296158,2.841809,2001-01,3253.202021,729.114528,8.833889e+07,2.157251e+07,13481.023902,3355.338776,18039.958181,4481.964499,7.721884,37.281542,1134.046282,52.071961,4686.476483
2,91910,119.228032,4.529412,19.821235,1.087493,2.155345,2.616727,1.923664,37.841033,71.799101,28980.453591,2.270195,2001-01,6291.175275,2823.902834,9.316890e+07,4.449815e+07,20191.369812,9582.445000,21604.299450,10290.539727,11.565185,106.471611,2690.548655,94.908123,8541.731069
3,93545,116.851159,0.000000,-717.576000,2.003892,0.983472,1.316469,2.126206,34.764727,144.775715,175872.548644,1.721337,2001-01,711.769167,830.178782,1.275580e+08,1.485761e+08,21013.260044,24472.302685,35255.239743,41011.450552,12.035144,271.914474,417.075895,136.240316,12261.628417
4,94610,115.519996,3.400000,-26.511200,1.120863,1.734462,2.313215,2.146523,35.729501,73.252203,24266.683548,2.229869,2001-01,3921.912464,1855.414285,8.107822e+07,4.098382e+07,17722.145024,8931.720162,18847.241210,9511.352693,10.150496,99.241335,1955.644501,100.574954,9051.745882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,95697,111.145174,0.000000,103.723000,1.453368,1.484952,2.147770,1.935117,34.152255,117.501557,83411.137676,2.041443,2017-12,1266.059271,876.898036,1.062316e+08,7.370084e+07,22635.793479,15530.249602,27848.508381,19152.630819,12.964540,172.558329,707.866100,141.839349,12765.541430
1365,92285,122.317361,0.000000,-310.978000,1.206939,1.583551,2.118434,2.197631,36.665014,132.657525,88534.004931,2.158033,2017-12,2055.823755,1128.209520,1.391882e+08,7.539439e+07,30849.102905,16468.704103,37284.220746,19942.629165,17.669003,182.985601,1061.014741,177.875491,16008.794171
1366,95595,117.902631,0.000000,-346.473000,1.841883,1.070472,1.268455,1.831995,35.010112,40.182680,184986.632496,1.550225,2017-12,209.910832,242.612718,2.913127e+07,3.459393e+07,2924.817531,3316.857637,4566.077751,5312.468172,1.675080,36.853974,145.258805,22.750891,2047.580158
1367,90068,102.735526,0.000000,-37.624500,1.629194,1.667226,2.076431,2.054201,27.377072,80.097723,54370.924921,2.065316,2017-12,1521.663534,1346.168487,5.946228e+07,5.453786e+07,12759.081551,11346.866991,14246.269664,12787.925271,7.307731,126.076300,851.206485,78.597204,7073.748388


In [17]:
df_modeling_combined = pd.merge(modeling_data, df_join, on=['school_zip','y-m'], how='left')

In [18]:
display(df_modeling_combined)

Unnamed: 0,year_month,school_zip,school_county_v2,school_region_name,pm25,school_elevation_m,ps_elevation_m,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,pop_under19_male,pop_under19_female,total_population,total_population_male,total_population_female,point_source_pm25_tpy,dist_school_to_ps_m,angle_to_school,ps_wspd_merge,school_wdir_wrt_0n,ps_wdir_wrt_0n,school_wind_alignment,ps_wind_alignment,avg_wind_speed,avg_wind_alignment,avg_wind_alignment_cosine,nearby_point_source_count,school_wspd,ca_agi_per_returns,total_tax_liability,tax_liability_per_capita,school_temperature,ps_temperature,school_count,pm25_last_month,pm25_r6,pm25_r9,pm25_r12,pm25_r24,pm25_slope6,pm25_slope9,pm25_slope12,pm25_slope24,pm25_lag_12mo,year,month,school_county_v2_alameda,school_county_v2_alpine,school_county_v2_amador,school_county_v2_butte,school_county_v2_calaveras,school_county_v2_colusa,school_county_v2_contra_costa,school_county_v2_del_norte,school_county_v2_el_dorado,school_county_v2_fresno,school_county_v2_glenn,school_county_v2_humboldt,school_county_v2_imperial,school_county_v2_inyo,school_county_v2_kern,school_county_v2_kings,school_county_v2_lake,school_county_v2_lassen,school_county_v2_los_angeles,school_county_v2_madera,school_county_v2_marin,school_county_v2_mariposa,school_county_v2_mendocino,school_county_v2_merced,school_county_v2_modoc,school_county_v2_mono,school_county_v2_monterey,school_county_v2_napa,school_county_v2_nevada,school_county_v2_orange,school_county_v2_placer,school_county_v2_plumas,school_county_v2_riverside,school_county_v2_sacramento,school_county_v2_san_benito,school_county_v2_san_bernardino,school_county_v2_san_diego,school_county_v2_san_francisco,school_county_v2_san_joaquin,school_county_v2_san_luis_obispo,school_county_v2_san_mateo,school_county_v2_santa_barbara,school_county_v2_santa_clara,school_county_v2_santa_cruz,school_county_v2_shasta,school_county_v2_sierra,school_county_v2_siskiyou,school_county_v2_solano,school_county_v2_sonoma,school_county_v2_stanislaus,school_county_v2_sutter,school_county_v2_tehama,school_county_v2_trinity,school_county_v2_tulare,school_county_v2_tuolumne,school_county_v2_ventura,school_county_v2_yolo,school_county_v2_yuba,month_01,month_02,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,y-m,central_wind_alignment_180_high,avg_count_ps_within_5km,avg_elevation_diff_m,avg_wspd_ratio_ps_sch,avg_wspd_ratio_sch_ps,avg_school_wspd,avg_ps_wspd,new_alignment_90_high,ps_pm25_tpy_top_20,school_to_ps_geod_dist_m_top_20,avg_wspd_top_20,Izmy_v1_unnormed,Izmy_v1_unnormed_wspd_ratio,Izmy_v2_nodist_unnormed,Izmy_v2_nodist_unnormed_wspd_ratio,Izmy_v3_normed_D_and_TPY,Izmy_v3_normed_D_and_TPY_wspd_ratio,Izmy_v4_nodist_normed_TPY,Izmy_v4_nodist_normed_TPY_wspd_ratio,Izmy_v5_all_normed,Izmy_v5_all_normed_but_wspd_ratio,Izmy_v6_unnormed_no_wspd,Izmy_v7_all_normed_no_wspd,Izmy_v8_normed_D_and_TPY_no_wspd,avg_temp
0,2000-01-01,90001,Los Angeles,Los Angeles County,32.149998,44.728889,43.703333,6196.0,3209.0,2987.0,6672.0,3397.0,3275.0,5562.0,2850.0,2712.0,5075.0,2599.0,2476.0,23505.0,12055.0,11450.0,54481.0,27320.0,27161.0,14.241154,3854.812685,-90.196586,0.757031,-172.758321,-172.758321,82.561735,82.561735,0.757031,82.561735,1.124995,0.000000,0.757031,20049.704556,2.608176e+06,47.873130,14.277778,14.266667,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,,,,,,14.272222
1,2000-01-01,90002,Los Angeles,Los Angeles County,31.849998,33.858889,29.690000,4795.0,2447.0,2348.0,5655.0,2870.0,2785.0,5077.0,2566.0,2511.0,4316.0,2179.0,2137.0,19843.0,10062.0,9781.0,44584.0,21553.0,23031.0,6.649500,2734.278190,-75.730039,0.757031,-172.758321,-172.758321,97.028283,97.028283,0.757031,97.028283,0.879819,0.222222,0.757031,19697.001063,1.549733e+06,34.759847,14.300000,14.400000,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,,,,,,14.350000
2,2000-01-01,90003,Los Angeles,Los Angeles County,31.724998,42.329167,45.785000,6527.0,3297.0,3230.0,6894.0,3539.0,3355.0,5994.0,3000.0,2994.0,5381.0,2727.0,2654.0,24796.0,12563.0,12233.0,58187.0,28557.0,29630.0,11.672797,5281.522927,-63.495073,0.855611,-172.758321,-60.057878,106.478652,100.850655,0.806321,103.664653,0.869845,0.000000,0.757031,18895.491452,2.255523e+06,38.763349,14.300000,14.383333,12,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,,,,,,14.341667
3,2000-01-01,90004,Los Angeles,Los Angeles County,29.560000,83.190000,102.906667,5621.0,2879.0,2742.0,5505.0,2816.0,2689.0,4470.0,2297.0,2173.0,4204.0,2199.0,2005.0,19800.0,10191.0,9609.0,67850.0,34200.0,33650.0,5.158316,6452.615217,-87.100171,0.757031,-172.758321,-172.758321,78.897449,78.897449,0.757031,78.897449,1.160977,0.000000,0.757031,41990.032278,4.404657e+07,649.175696,14.022222,13.911111,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,,,,,,13.966667
4,2000-01-01,90006,Los Angeles,Los Angeles County,30.750000,65.920000,77.570000,5939.0,3050.0,2889.0,6009.0,2995.0,3014.0,4493.0,2337.0,2156.0,4416.0,2370.0,2046.0,20857.0,10752.0,10105.0,62765.0,31901.0,30864.0,3.553121,5187.184865,-78.360123,0.757031,-172.758321,-172.758321,94.398199,94.398199,0.757031,94.398199,0.923498,0.000000,0.757031,17629.297337,3.132557e+06,49.909297,14.140000,14.100000,5,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,,,,,,14.120000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311158,2018-12-01,96145,Placer,Superior California,4.462162,1991.700000,1797.860000,109.0,63.0,46.0,79.0,63.0,16.0,114.0,36.0,78.0,48.0,17.0,31.0,350.0,179.0,171.0,2151.0,1130.0,1021.0,1.693826,14639.964232,55.979727,0.243283,13.414409,-61.071359,162.565317,122.948914,0.340150,142.757116,0.203923,0.000000,0.437017,84420.345595,1.108245e+07,5152.233380,14.183790,16.993708,3,1.110811,3.909009,3.562763,2.959234,3.129392,-0.949344,-0.090811,0.196834,-0.000854,0.832432,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,,,,,,,15.588749
311159,2018-12-01,96146,Placer,Superior California,4.626316,1895.970000,1797.860000,36.0,21.0,15.0,59.0,21.0,38.0,32.0,9.0,23.0,20.0,20.0,0.0,147.0,71.0,76.0,969.0,562.0,407.0,1.693826,15987.398121,-147.155701,0.243283,13.414409,-61.071359,160.570111,86.084342,0.340150,123.327226,0.450580,0.000000,0.437017,113113.030726,6.148855e+06,6345.567595,16.993708,16.993708,1,0.394737,3.367544,3.196491,2.601316,2.967325,-0.995338,-0.199912,0.151178,-0.044263,0.321053,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,,,,,,,16.993708
311160,2018-12-01,96150,El Dorado,Superior California,4.286667,1922.062222,1907.910000,1613.0,792.0,821.0,1267.0,634.0,633.0,1361.0,716.0,645.0,1499.0,796.0,703.0,5740.0,2938.0,2802.0,29357.0,14913.0,14444.0,1.331067,3837.935999,-18.310985,0.231856,-17.369322,-17.369322,47.669467,47.669467,0.231856,47.669467,1.594435,0.000000,0.231856,55949.806463,1.723578e+07,587.109752,8.575065,8.663953,9,1.046667,3.795556,3.505432,2.869074,2.952315,-0.875175,-0.087296,0.206519,0.021245,0.815556,2018,12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,,,,,,,8.619509
311161,2018-12-01,96161,Nevada,Superior California,5.053509,1810.586364,1793.499091,893.0,540.0,353.0,1280.0,736.0,544.0,1219.0,590.0,629.0,1023.0,565.0,458.0,4415.0,2431.0,1984.0,18333.0,9461.0,8872.0,1.660815,7735.266680,-49.871057,0.243283,-61.071359,-61.071359,40.066449,40.066449,0.243283,40.066449,1.677660,0.000000,0.243283,111605.903924,4.081391e+07,2226.253696,15.689734,16.993708,11,1.539474,4.732456,4.320468,3.938962,3.968458,-1.020802,-0.099430,0.119596,0.000669,1.917544,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,,,,,,,16.341721


In [15]:
df_modeling_combined = pd.read_csv(os.path.join(remote_dir,'modeling_data\\modeling_data_joined_11-17-top15_4tpy_ds_wind_ratios.csv'))

In [12]:
display(df_modeling_combined)

Unnamed: 0.1,Unnamed: 0,year_month,school_zip,school_county_v2,school_region_name,pm25,school_elevation_m,ps_elevation_m,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,pop_under19_male,pop_under19_female,total_population,total_population_male,total_population_female,point_source_pm25_tpy,dist_school_to_ps_m,angle_to_school,ps_wspd_merge,school_wdir_wrt_0n,ps_wdir_wrt_0n,school_wind_alignment,ps_wind_alignment,avg_wind_speed,avg_wind_alignment,avg_wind_alignment_cosine,nearby_point_source_count,school_wspd,ca_agi_per_returns,total_tax_liability,tax_liability_per_capita,school_temperature,ps_temperature,school_count,pm25_last_month,pm25_r6,pm25_r9,pm25_r12,pm25_r24,pm25_slope6,pm25_slope9,pm25_slope12,pm25_slope24,pm25_lag_12mo,year,month,school_county_v2_alameda,school_county_v2_alpine,school_county_v2_amador,school_county_v2_butte,school_county_v2_calaveras,school_county_v2_colusa,school_county_v2_contra_costa,school_county_v2_del_norte,school_county_v2_el_dorado,school_county_v2_fresno,school_county_v2_glenn,school_county_v2_humboldt,school_county_v2_imperial,school_county_v2_inyo,school_county_v2_kern,school_county_v2_kings,school_county_v2_lake,school_county_v2_lassen,school_county_v2_los_angeles,school_county_v2_madera,school_county_v2_marin,school_county_v2_mariposa,school_county_v2_mendocino,school_county_v2_merced,school_county_v2_modoc,school_county_v2_mono,school_county_v2_monterey,school_county_v2_napa,school_county_v2_nevada,school_county_v2_orange,school_county_v2_placer,school_county_v2_plumas,school_county_v2_riverside,school_county_v2_sacramento,school_county_v2_san_benito,school_county_v2_san_bernardino,school_county_v2_san_diego,school_county_v2_san_francisco,school_county_v2_san_joaquin,school_county_v2_san_luis_obispo,school_county_v2_san_mateo,school_county_v2_santa_barbara,school_county_v2_santa_clara,school_county_v2_santa_cruz,school_county_v2_shasta,school_county_v2_sierra,school_county_v2_siskiyou,school_county_v2_solano,school_county_v2_sonoma,school_county_v2_stanislaus,school_county_v2_sutter,school_county_v2_tehama,school_county_v2_trinity,school_county_v2_tulare,school_county_v2_tuolumne,school_county_v2_ventura,school_county_v2_yolo,school_county_v2_yuba,month_01,month_02,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,y-m,central_wind_alignment_180_high,avg_count_ps_within_5km,avg_elevation_diff_m,avg_wspd_ratio_ps_sch,avg_wspd_ratio_sch_ps,avg_school_wspd,avg_ps_wspd,new_alignment_90_high,ps_pm25_tpy_top_20,school_to_ps_geod_dist_m_top_20,avg_wspd_top_20,Izmy_v1_unnormed,Izmy_v2_nodist_unnormed,Izmy_v3_normed_D_and_TPY,Izmy_v4_nodist_normed_TPY,Izmy_v5_all_normed_but_wspd_ratio,Izmy_v6_unnormed_no_wspd,Izmy_v7_all_normed_no_wspd,Izmy_v8_normed_D_and_TPY_no_wspd
0,0,2000-01-01,90001,Los Angeles,Los Angeles County,32.149998,44.728889,43.703333,6196.0,3209.0,2987.0,6672.0,3397.0,3275.0,5562.0,2850.0,2712.0,5075.0,2599.0,2476.0,23505.0,12055.0,11450.0,54481.0,27320.0,27161.0,14.241154,3854.812685,-90.196586,0.757031,-172.758321,-172.758321,82.561735,82.561735,0.757031,82.561735,1.124995,0.000000,0.757031,20049.704556,2.608176e+06,47.873130,14.277778,14.266667,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,
1,1,2000-01-01,90002,Los Angeles,Los Angeles County,31.849998,33.858889,29.690000,4795.0,2447.0,2348.0,5655.0,2870.0,2785.0,5077.0,2566.0,2511.0,4316.0,2179.0,2137.0,19843.0,10062.0,9781.0,44584.0,21553.0,23031.0,6.649500,2734.278190,-75.730039,0.757031,-172.758321,-172.758321,97.028283,97.028283,0.757031,97.028283,0.879819,0.222222,0.757031,19697.001063,1.549733e+06,34.759847,14.300000,14.400000,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,
2,2,2000-01-01,90003,Los Angeles,Los Angeles County,31.724998,42.329167,45.785000,6527.0,3297.0,3230.0,6894.0,3539.0,3355.0,5994.0,3000.0,2994.0,5381.0,2727.0,2654.0,24796.0,12563.0,12233.0,58187.0,28557.0,29630.0,11.672797,5281.522927,-63.495073,0.855611,-172.758321,-60.057878,106.478652,100.850655,0.806321,103.664653,0.869845,0.000000,0.757031,18895.491452,2.255523e+06,38.763349,14.300000,14.383333,12,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,
3,3,2000-01-01,90004,Los Angeles,Los Angeles County,29.560000,83.190000,102.906667,5621.0,2879.0,2742.0,5505.0,2816.0,2689.0,4470.0,2297.0,2173.0,4204.0,2199.0,2005.0,19800.0,10191.0,9609.0,67850.0,34200.0,33650.0,5.158316,6452.615217,-87.100171,0.757031,-172.758321,-172.758321,78.897449,78.897449,0.757031,78.897449,1.160977,0.000000,0.757031,41990.032278,4.404657e+07,649.175696,14.022222,13.911111,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,
4,4,2000-01-01,90006,Los Angeles,Los Angeles County,30.750000,65.920000,77.570000,5939.0,3050.0,2889.0,6009.0,2995.0,3014.0,4493.0,2337.0,2156.0,4416.0,2370.0,2046.0,20857.0,10752.0,10105.0,62765.0,31901.0,30864.0,3.553121,5187.184865,-78.360123,0.757031,-172.758321,-172.758321,94.398199,94.398199,0.757031,94.398199,0.923498,0.000000,0.757031,17629.297337,3.132557e+06,49.909297,14.140000,14.100000,5,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311158,311158,2018-12-01,96145,Placer,Superior California,4.462162,1991.700000,1797.860000,109.0,63.0,46.0,79.0,63.0,16.0,114.0,36.0,78.0,48.0,17.0,31.0,350.0,179.0,171.0,2151.0,1130.0,1021.0,1.693826,14639.964232,55.979727,0.243283,13.414409,-61.071359,162.565317,122.948914,0.340150,142.757116,0.203923,0.000000,0.437017,84420.345595,1.108245e+07,5152.233380,14.183790,16.993708,3,1.110811,3.909009,3.562763,2.959234,3.129392,-0.949344,-0.090811,0.196834,-0.000854,0.832432,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,
311159,311159,2018-12-01,96146,Placer,Superior California,4.626316,1895.970000,1797.860000,36.0,21.0,15.0,59.0,21.0,38.0,32.0,9.0,23.0,20.0,20.0,0.0,147.0,71.0,76.0,969.0,562.0,407.0,1.693826,15987.398121,-147.155701,0.243283,13.414409,-61.071359,160.570111,86.084342,0.340150,123.327226,0.450580,0.000000,0.437017,113113.030726,6.148855e+06,6345.567595,16.993708,16.993708,1,0.394737,3.367544,3.196491,2.601316,2.967325,-0.995338,-0.199912,0.151178,-0.044263,0.321053,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,
311160,311160,2018-12-01,96150,El Dorado,Superior California,4.286667,1922.062222,1907.910000,1613.0,792.0,821.0,1267.0,634.0,633.0,1361.0,716.0,645.0,1499.0,796.0,703.0,5740.0,2938.0,2802.0,29357.0,14913.0,14444.0,1.331067,3837.935999,-18.310985,0.231856,-17.369322,-17.369322,47.669467,47.669467,0.231856,47.669467,1.594435,0.000000,0.231856,55949.806463,1.723578e+07,587.109752,8.575065,8.663953,9,1.046667,3.795556,3.505432,2.869074,2.952315,-0.875175,-0.087296,0.206519,0.021245,0.815556,2018,12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,
311161,311161,2018-12-01,96161,Nevada,Superior California,5.053509,1810.586364,1793.499091,893.0,540.0,353.0,1280.0,736.0,544.0,1219.0,590.0,629.0,1023.0,565.0,458.0,4415.0,2431.0,1984.0,18333.0,9461.0,8872.0,1.660815,7735.266680,-49.871057,0.243283,-61.071359,-61.071359,40.066449,40.066449,0.243283,40.066449,1.677660,0.000000,0.243283,111605.903924,4.081391e+07,2226.253696,15.689734,16.993708,11,1.539474,4.732456,4.320468,3.938962,3.968458,-1.020802,-0.099430,0.119596,0.000669,1.917544,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,


In [16]:
df_modeling_combined['avg_temp'] = ((df_modeling_combined['school_temperature'] + df_modeling_combined['ps_temperature']) / 2)

In [17]:
df_modeling_combined['diff_temp_s_ps'] = df_modeling_combined['school_temperature'] - df_modeling_combined['ps_temperature']

In [18]:
df_modeling_combined = df_modeling_combined.rename(columns={"avg_wspd_top_20": "avg_wspd_top_15"})

In [19]:
df_modeling_combined = df_modeling_combined.drop(columns=['avg_wind_speed'])

In [20]:
display(df_modeling_combined)

Unnamed: 0.1,Unnamed: 0,year_month,school_zip,school_county_v2,school_region_name,pm25,school_elevation_m,ps_elevation_m,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,pop_under19_male,pop_under19_female,total_population,total_population_male,total_population_female,point_source_pm25_tpy,dist_school_to_ps_m,angle_to_school,ps_wspd_merge,school_wdir_wrt_0n,ps_wdir_wrt_0n,school_wind_alignment,ps_wind_alignment,avg_wind_alignment,avg_wind_alignment_cosine,nearby_point_source_count,school_wspd,ca_agi_per_returns,total_tax_liability,tax_liability_per_capita,school_temperature,ps_temperature,school_count,pm25_last_month,pm25_r6,pm25_r9,pm25_r12,pm25_r24,pm25_slope6,pm25_slope9,pm25_slope12,pm25_slope24,pm25_lag_12mo,year,month,school_county_v2_alameda,school_county_v2_alpine,school_county_v2_amador,school_county_v2_butte,school_county_v2_calaveras,school_county_v2_colusa,school_county_v2_contra_costa,school_county_v2_del_norte,school_county_v2_el_dorado,school_county_v2_fresno,school_county_v2_glenn,school_county_v2_humboldt,school_county_v2_imperial,school_county_v2_inyo,school_county_v2_kern,school_county_v2_kings,school_county_v2_lake,school_county_v2_lassen,school_county_v2_los_angeles,school_county_v2_madera,school_county_v2_marin,school_county_v2_mariposa,school_county_v2_mendocino,school_county_v2_merced,school_county_v2_modoc,school_county_v2_mono,school_county_v2_monterey,school_county_v2_napa,school_county_v2_nevada,school_county_v2_orange,school_county_v2_placer,school_county_v2_plumas,school_county_v2_riverside,school_county_v2_sacramento,school_county_v2_san_benito,school_county_v2_san_bernardino,school_county_v2_san_diego,school_county_v2_san_francisco,school_county_v2_san_joaquin,school_county_v2_san_luis_obispo,school_county_v2_san_mateo,school_county_v2_santa_barbara,school_county_v2_santa_clara,school_county_v2_santa_cruz,school_county_v2_shasta,school_county_v2_sierra,school_county_v2_siskiyou,school_county_v2_solano,school_county_v2_sonoma,school_county_v2_stanislaus,school_county_v2_sutter,school_county_v2_tehama,school_county_v2_trinity,school_county_v2_tulare,school_county_v2_tuolumne,school_county_v2_ventura,school_county_v2_yolo,school_county_v2_yuba,month_01,month_02,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,y-m,central_wind_alignment_180_high,avg_count_ps_within_5km,avg_elevation_diff_m,avg_wspd_ratio_ps_sch,avg_wspd_ratio_sch_ps,avg_school_wspd,avg_ps_wspd,new_alignment_90_high,ps_pm25_tpy_top_20,school_to_ps_geod_dist_m_top_20,avg_wspd_top_15,Izmy_v1_unnormed,Izmy_v2_nodist_unnormed,Izmy_v3_normed_D_and_TPY,Izmy_v4_nodist_normed_TPY,Izmy_v5_all_normed_but_wspd_ratio,Izmy_v6_unnormed_no_wspd,Izmy_v7_all_normed_no_wspd,Izmy_v8_normed_D_and_TPY_no_wspd,avg_temp,diff_temp_s_ps
0,0,2000-01-01,90001,Los Angeles,Los Angeles County,32.149998,44.728889,43.703333,6196.0,3209.0,2987.0,6672.0,3397.0,3275.0,5562.0,2850.0,2712.0,5075.0,2599.0,2476.0,23505.0,12055.0,11450.0,54481.0,27320.0,27161.0,14.241154,3854.812685,-90.196586,0.757031,-172.758321,-172.758321,82.561735,82.561735,82.561735,1.124995,0.000000,0.757031,20049.704556,2.608176e+06,47.873130,14.277778,14.266667,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,14.272222,0.011111
1,1,2000-01-01,90002,Los Angeles,Los Angeles County,31.849998,33.858889,29.690000,4795.0,2447.0,2348.0,5655.0,2870.0,2785.0,5077.0,2566.0,2511.0,4316.0,2179.0,2137.0,19843.0,10062.0,9781.0,44584.0,21553.0,23031.0,6.649500,2734.278190,-75.730039,0.757031,-172.758321,-172.758321,97.028283,97.028283,97.028283,0.879819,0.222222,0.757031,19697.001063,1.549733e+06,34.759847,14.300000,14.400000,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,14.350000,-0.100000
2,2,2000-01-01,90003,Los Angeles,Los Angeles County,31.724998,42.329167,45.785000,6527.0,3297.0,3230.0,6894.0,3539.0,3355.0,5994.0,3000.0,2994.0,5381.0,2727.0,2654.0,24796.0,12563.0,12233.0,58187.0,28557.0,29630.0,11.672797,5281.522927,-63.495073,0.855611,-172.758321,-60.057878,106.478652,100.850655,103.664653,0.869845,0.000000,0.757031,18895.491452,2.255523e+06,38.763349,14.300000,14.383333,12,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,14.341667,-0.083333
3,3,2000-01-01,90004,Los Angeles,Los Angeles County,29.560000,83.190000,102.906667,5621.0,2879.0,2742.0,5505.0,2816.0,2689.0,4470.0,2297.0,2173.0,4204.0,2199.0,2005.0,19800.0,10191.0,9609.0,67850.0,34200.0,33650.0,5.158316,6452.615217,-87.100171,0.757031,-172.758321,-172.758321,78.897449,78.897449,78.897449,1.160977,0.000000,0.757031,41990.032278,4.404657e+07,649.175696,14.022222,13.911111,9,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,13.966667,0.111111
4,4,2000-01-01,90006,Los Angeles,Los Angeles County,30.750000,65.920000,77.570000,5939.0,3050.0,2889.0,6009.0,2995.0,3014.0,4493.0,2337.0,2156.0,4416.0,2370.0,2046.0,20857.0,10752.0,10105.0,62765.0,31901.0,30864.0,3.553121,5187.184865,-78.360123,0.757031,-172.758321,-172.758321,94.398199,94.398199,94.398199,0.923498,0.000000,0.757031,17629.297337,3.132557e+06,49.909297,14.140000,14.100000,5,,,,,,,,,,,2000,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2000-01,,,,,,,,,,,,,,,,,,,,14.120000,0.040000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311158,311158,2018-12-01,96145,Placer,Superior California,4.462162,1991.700000,1797.860000,109.0,63.0,46.0,79.0,63.0,16.0,114.0,36.0,78.0,48.0,17.0,31.0,350.0,179.0,171.0,2151.0,1130.0,1021.0,1.693826,14639.964232,55.979727,0.243283,13.414409,-61.071359,162.565317,122.948914,142.757116,0.203923,0.000000,0.437017,84420.345595,1.108245e+07,5152.233380,14.183790,16.993708,3,1.110811,3.909009,3.562763,2.959234,3.129392,-0.949344,-0.090811,0.196834,-0.000854,0.832432,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,,15.588749,-2.809918
311159,311159,2018-12-01,96146,Placer,Superior California,4.626316,1895.970000,1797.860000,36.0,21.0,15.0,59.0,21.0,38.0,32.0,9.0,23.0,20.0,20.0,0.0,147.0,71.0,76.0,969.0,562.0,407.0,1.693826,15987.398121,-147.155701,0.243283,13.414409,-61.071359,160.570111,86.084342,123.327226,0.450580,0.000000,0.437017,113113.030726,6.148855e+06,6345.567595,16.993708,16.993708,1,0.394737,3.367544,3.196491,2.601316,2.967325,-0.995338,-0.199912,0.151178,-0.044263,0.321053,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,,16.993708,0.000000
311160,311160,2018-12-01,96150,El Dorado,Superior California,4.286667,1922.062222,1907.910000,1613.0,792.0,821.0,1267.0,634.0,633.0,1361.0,716.0,645.0,1499.0,796.0,703.0,5740.0,2938.0,2802.0,29357.0,14913.0,14444.0,1.331067,3837.935999,-18.310985,0.231856,-17.369322,-17.369322,47.669467,47.669467,47.669467,1.594435,0.000000,0.231856,55949.806463,1.723578e+07,587.109752,8.575065,8.663953,9,1.046667,3.795556,3.505432,2.869074,2.952315,-0.875175,-0.087296,0.206519,0.021245,0.815556,2018,12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,,8.619509,-0.088889
311161,311161,2018-12-01,96161,Nevada,Superior California,5.053509,1810.586364,1793.499091,893.0,540.0,353.0,1280.0,736.0,544.0,1219.0,590.0,629.0,1023.0,565.0,458.0,4415.0,2431.0,1984.0,18333.0,9461.0,8872.0,1.660815,7735.266680,-49.871057,0.243283,-61.071359,-61.071359,40.066449,40.066449,40.066449,1.677660,0.000000,0.243283,111605.903924,4.081391e+07,2226.253696,15.689734,16.993708,11,1.539474,4.732456,4.320468,3.938962,3.968458,-1.020802,-0.099430,0.119596,0.000669,1.917544,2018,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2018-12,,,,,,,,,,,,,,,,,,,,16.341721,-1.303973


In [21]:
df_modeling_combined.to_csv(os.path.join(local_dir,'modeling_data_joined_11-22-top15_4tpy_ds_wind_ratios.csv'))

In [19]:
df_modeling_combined_selected = df_modeling_combined[~df_modeling_combined['central_wind_alignment_180_high'].isnull()]

In [20]:
display(df_modeling_combined_selected)

Unnamed: 0,year_month,school_zip,school_county_v2,school_region_name,pm25,school_elevation_m,ps_elevation_m,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,pop_under19_male,pop_under19_female,total_population,total_population_male,total_population_female,point_source_pm25_tpy,dist_school_to_ps_m,angle_to_school,ps_wspd_merge,school_wdir_wrt_0n,ps_wdir_wrt_0n,school_wind_alignment,ps_wind_alignment,avg_wind_speed,avg_wind_alignment,avg_wind_alignment_cosine,nearby_point_source_count,school_wspd,ca_agi_per_returns,total_tax_liability,tax_liability_per_capita,school_temperature,ps_temperature,school_count,pm25_last_month,pm25_r6,pm25_r9,pm25_r12,pm25_r24,pm25_slope6,pm25_slope9,pm25_slope12,pm25_slope24,pm25_lag_12mo,year,month,school_county_v2_alameda,school_county_v2_alpine,school_county_v2_amador,school_county_v2_butte,school_county_v2_calaveras,school_county_v2_colusa,school_county_v2_contra_costa,school_county_v2_del_norte,school_county_v2_el_dorado,school_county_v2_fresno,school_county_v2_glenn,school_county_v2_humboldt,school_county_v2_imperial,school_county_v2_inyo,school_county_v2_kern,school_county_v2_kings,school_county_v2_lake,school_county_v2_lassen,school_county_v2_los_angeles,school_county_v2_madera,school_county_v2_marin,school_county_v2_mariposa,school_county_v2_mendocino,school_county_v2_merced,school_county_v2_modoc,school_county_v2_mono,school_county_v2_monterey,school_county_v2_napa,school_county_v2_nevada,school_county_v2_orange,school_county_v2_placer,school_county_v2_plumas,school_county_v2_riverside,school_county_v2_sacramento,school_county_v2_san_benito,school_county_v2_san_bernardino,school_county_v2_san_diego,school_county_v2_san_francisco,school_county_v2_san_joaquin,school_county_v2_san_luis_obispo,school_county_v2_san_mateo,school_county_v2_santa_barbara,school_county_v2_santa_clara,school_county_v2_santa_cruz,school_county_v2_shasta,school_county_v2_sierra,school_county_v2_siskiyou,school_county_v2_solano,school_county_v2_sonoma,school_county_v2_stanislaus,school_county_v2_sutter,school_county_v2_tehama,school_county_v2_trinity,school_county_v2_tulare,school_county_v2_tuolumne,school_county_v2_ventura,school_county_v2_yolo,school_county_v2_yuba,month_01,month_02,month_03,month_04,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,y-m,central_wind_alignment_180_high,avg_school_wspd,avg_ps_wspd,new_alignment_90_high,ps_pm25_tpy_top_20,school_to_ps_geod_dist_m_top_20,avg_wspd_top_20,Izmy_v1_unnormed,Izmy_v2_nodist_unnormed,Izmy_v3_normed_D_and_TPY,Izmy_v4_nodist_normed_TPY,Izmy_v5_all_normed,Izmy_v6_unnormed_no_wspd,Izmy_v7_all_normed_no_wspd,Izmy_v8_normed_D_and_TPY_no_wspd
15881,2001-01-01,90001,Los Angeles,Los Angeles County,28.900000,44.728889,43.703333,6149.0,3180.0,2969.0,6537.0,3322.0,3215.0,5555.0,2842.0,2713.0,5126.0,2622.0,2504.0,23367.0,11966.0,11401.0,54744.0,27435.0,27309.0,14.241154,3854.812685,-90.196586,0.587163,-139.918024,-139.918024,49.721438,49.721438,0.587163,49.721438,1.572834,0.000000,0.587163,20186.065385,2.235903e+06,40.842887,12.355556,12.000000,9,37.000000,24.175000,21.879629,21.659722,,3.638572,2.086111,0.853671,,32.149998,2001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2001-01,108.931680,1.713602,2.390189,30.757574,21.267044,10743.323154,2.051896,2040.789704,1.820348e+07,146538.522559,4325.049826,83.931570,999.733006,808.529801,654909.139128
15882,2001-01-01,90002,Los Angeles,Los Angeles County,28.683334,33.858889,29.690000,4847.0,2477.0,2370.0,5582.0,2831.0,2751.0,5072.0,2561.0,2511.0,4413.0,2220.0,2193.0,19914.0,10089.0,9825.0,45248.0,21885.0,23363.0,6.649500,2734.278190,-75.730039,0.587163,-139.918024,-139.918024,64.187986,64.187986,0.587163,64.187986,1.430098,0.222222,0.587163,20343.701888,1.796920e+06,39.712694,12.400000,11.400000,9,37.316666,24.005556,21.690741,21.465278,,3.705714,2.112778,0.867891,,31.849998,2001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2001-01,112.717228,1.713602,2.461906,33.622584,22.913958,10117.323198,2.087754,2438.393106,2.225846e+07,178547.037043,5447.604118,102.264818,1197.622853,984.437563,797394.425857
15883,2001-01-01,90003,Los Angeles,Los Angeles County,28.675000,42.329167,45.785000,6558.0,3314.0,3244.0,6823.0,3507.0,3316.0,6036.0,3020.0,3016.0,5496.0,2784.0,2712.0,24913.0,12625.0,12288.0,58995.0,28964.0,30030.0,11.672797,5281.522927,-63.495073,0.590464,-139.918024,-46.924705,76.422951,70.196765,0.588814,73.309858,1.240651,0.000000,0.587163,19651.756057,2.308299e+06,39.127028,12.400000,12.258333,12,36.812500,23.822917,21.531944,21.307292,,3.668214,2.093333,0.858348,,31.724998,2001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2001-01,106.748596,1.713602,2.361285,28.781575,25.945395,11158.519253,2.037444,1856.566552,2.006117e+07,143382.540662,5039.667189,82.122980,958.589479,836.273764,903175.664845
15884,2001-01-01,90004,Los Angeles,Los Angeles County,29.210000,83.190000,102.906667,5456.0,2796.0,2660.0,5294.0,2706.0,2588.0,4383.0,2252.0,2131.0,4177.0,2180.0,1997.0,19310.0,9934.0,9376.0,67283.0,33910.0,33373.0,5.158316,6452.615217,-87.100171,0.587163,-139.918024,-139.918024,52.890540,52.890540,0.587163,52.890540,1.457552,0.000000,0.587163,40642.828267,4.032749e+07,599.371178,12.122222,11.955556,9,33.300000,22.430000,20.564444,20.385000,,2.996000,1.690667,0.685874,,29.560000,2001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2001-01,108.005576,2.000370,2.360818,31.734151,17.612440,15605.520450,2.180594,1106.424379,1.707828e+07,75695.886046,3854.391962,43.355886,532.762044,407.167796,329805.914701
15885,2001-01-01,90006,Los Angeles,Los Angeles County,29.283333,65.920000,77.570000,5792.0,2975.0,2817.0,5788.0,2885.0,2903.0,4430.0,2300.0,2130.0,4415.0,2354.0,2061.0,20425.0,10514.0,9911.0,62407.0,31736.0,30671.0,3.553121,5187.184865,-78.360123,0.587163,-139.918024,-139.918024,61.557902,61.557902,0.587163,61.557902,1.475006,0.000000,0.587163,17992.659457,3.057961e+06,49.000288,12.240000,12.200000,5,34.050000,23.125000,21.161111,20.980556,,3.154762,1.783056,0.719231,,30.750000,2001,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2001-01,105.469269,2.000370,2.369464,29.740385,19.325429,14426.475172,2.184917,1143.751031,1.665737e+07,80193.090172,3826.138546,45.931458,561.104804,443.723354,199675.509216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294892,2017-12-01,96145,Placer,Superior California,0.832432,1991.700000,1797.860000,108.0,65.0,43.0,88.0,70.0,18.0,123.0,40.0,83.0,49.0,18.0,31.0,368.0,193.0,175.0,2406.0,1258.0,1148.0,1.693826,14639.964232,55.979727,0.366957,-16.744771,-62.962672,167.275502,121.057601,0.445046,144.166552,0.190946,0.000000,0.523135,72996.345941,1.024349e+07,4257.476309,14.183790,16.993708,3,1.337838,3.047748,3.353454,3.299550,3.219595,-0.295598,-0.276532,-0.032357,0.036204,0.081081,2017,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2017-12,121.755312,2.245801,1.749025,40.185018,27.979343,120535.517727,1.997413,278.158695,3.142667e+07,23003.387005,8096.117428,13.175782,123.803397,111.224013,30030.483571
294893,2017-12-01,96146,Placer,Superior California,0.321053,1895.845000,1797.860000,26.0,12.0,14.0,97.0,42.0,55.0,33.0,9.0,24.0,26.0,26.0,0.0,182.0,89.0,93.0,1030.0,569.0,461.0,1.693826,15988.035181,-147.230195,0.366957,-16.744771,-62.962672,130.485424,84.267523,0.445046,107.376474,0.701351,0.000000,0.523135,90969.418848,4.708804e+06,4571.654369,16.993708,16.993708,2,0.721053,2.693860,3.249708,3.333333,3.092982,-0.372782,-0.432105,-0.138572,0.041982,0.036842,2017,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2017-12,120.309266,2.245801,1.750355,39.299769,27.217033,114006.560427,1.998078,282.925210,3.028262e+07,23261.653398,7762.947200,13.323694,126.558639,113.240906,20383.362995
294894,2017-12-01,96150,El Dorado,Superior California,0.815556,1922.062222,1907.910000,1511.0,767.0,744.0,1348.0,663.0,685.0,1350.0,704.0,646.0,1513.0,721.0,792.0,5722.0,2855.0,2867.0,29103.0,14981.0,14122.0,1.331067,3837.935999,-18.310985,0.406088,-41.615590,-41.615590,50.363497,50.363497,0.406088,50.363497,1.503046,0.000000,0.406088,54841.402443,1.708554e+07,587.071470,8.575065,8.663953,9,1.568889,3.131852,3.300988,3.035556,2.971389,-0.397333,-0.213852,0.048159,0.054034,0.437778,2017,12,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2017-12,121.262726,1.781143,1.718598,39.240093,28.592464,119079.044087,1.749870,263.747226,2.816367e+07,22270.941221,7365.520890,12.755924,134.395044,123.149262,99750.902607
294895,2017-12-01,96161,Nevada,Superior California,1.917544,1802.365000,1793.063000,1036.0,527.0,509.0,1285.0,764.0,521.0,1169.0,564.0,605.0,926.0,510.0,416.0,4416.0,2365.0,2051.0,18369.0,9512.0,8857.0,1.657514,8046.683622,-70.709314,0.366957,-62.962672,-62.962672,28.896594,28.896594,0.366957,28.896594,1.829178,0.000000,0.366957,103784.456458,3.547946e+07,1931.485437,15.559337,16.993708,10,2.023684,3.591374,3.867739,3.997953,3.827376,-0.271303,-0.269079,-0.084517,0.037011,0.722807,2017,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2017-12,121.871333,2.267244,1.839678,39.735522,28.693671,118082.820920,2.053461,283.733109,3.224310e+07,23504.787946,8378.454802,13.462949,127.157454,114.722102,103249.891905
