In [1]:
import findspark
findspark.init()

In [49]:
import pyspark
import pandas as pd 
import numpy as np
import os 
import datetime
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as mticker
import plotly.express as px

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from math import radians, cos, sin, asin, sqrt
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number, count, when, isnan, abs, cos, atan2

from cmath import pi
from math import atan2, radians

from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [3]:
spark = SparkSession.builder.master("local[*]").config("spark.executor.memory", "48g").config("spark.driver.memory", "48g").config("spark.driver.maxResultSize","0").config("spark.executor.heartbeatInterval","100000").getOrCreate()

In [4]:
gdrive_path = 'C:\\Users\\matts\\Google Drive\\berkeley\\.shortcut-targets-by-id\\11wLy1WKwOTcthBs1rpfEzkqax2BZG-6E\\W210_Capstone\\Data\\'

In [5]:
df_census = spark.read.option("header",True).csv(os.path.join(gdrive_path, 'census/census_bureau_clean/census_bureau.csv'))

# re-using Pandas df cleaning code before converting to Spark
df_wind = pd.read_parquet(os.path.join(gdrive_path, 'wind/'))
df_pollution = spark.read.option("header",True).csv(os.path.join(gdrive_path, 'AirPollution/UW_pm25_zip_monthly_anand_2000-2018-v2.csv'))
df_point_sources = spark.read.option("header",True).csv(os.path.join(gdrive_path, 'Point source/pollution_point_sources.csv'))
df_schools = spark.read.option("header",True).csv(os.path.join(gdrive_path, 'schools/filtered_joined_schools_data.csv'))             

In [6]:
df_schools = df_schools.withColumnRenamed("County","school_county")\
.withColumnRenamed("StatusType","school_active_status")\
.withColumnRenamed("Street","school_street")\
.withColumnRenamed("EILName","school_type")\
.withColumnRenamed("GSoffered","school_grades_offered")\
.withColumnRenamed("Zip_first_five","school_zip")\
.withColumnRenamed("OpenDate","school_open_date")\
.withColumnRenamed("ClosedDate","school_closed_date")\
.withColumnRenamed("Merged_Lat","school_lat")\
.withColumnRenamed("Merged_Long","school_lon")\
.withColumnRenamed("CDSCode","cdscode")

df_schools = df_schools.select("cdscode","school_county"\
                               ,"school_active_status","school_street"\
                               ,"school_type","school_grades_offered","school_zip"\
                               ,"school_open_date","school_closed_date","school_lat","school_lon")

In [7]:
# clean wind
df_wind = df_wind.rename(columns={'lat': 'wind_lat', 'lon': 'wind_lon'})
df_wind['year_month'] = df_wind['year_month'].astype(str).map(lambda x: x[:4] + '-' + x[-2:])
df_wind['year'] = df_wind['year_month'].map(lambda x: int(x[:4]))
df_wind['ZCTA10'] = df_wind['ZCTA10'].astype(int)
df_wind = df_wind[(df_wind['year'] >= 2000) & (df_wind['year'] <= 2019)]

df_wind = spark.createDataFrame(df_wind)

In [8]:
df_wind.schema

StructType([StructField('wind_lat', DoubleType(), True), StructField('wind_lon', DoubleType(), True), StructField('ZCTA10', LongType(), True), StructField('u', DoubleType(), True), StructField('v', DoubleType(), True), StructField('wdir', DoubleType(), True), StructField('wspd', DoubleType(), True), StructField('year_month', StringType(), True), StructField('year', LongType(), True)])

In [9]:

df_wind.limit(5).show()

+--------+--------+------+--------------------+--------------------+------------------+-------------------+----------+----+
|wind_lat|wind_lon|ZCTA10|                   u|                   v|              wdir|               wspd|year_month|year|
+--------+--------+------+--------------------+--------------------+------------------+-------------------+----------+----+
|  37.465|-117.936| 89010|  0.8576987981796265|  0.2611056864261627|16.931604385375977| 0.8965619802474976|   2000-01|2000|
|  35.396|-116.322| 89019|  0.4024794399738312| 0.08723340928554535|12.229137420654297|0.41182443499565125|   2000-01|2000|
|  36.161|-116.139| 89060|-0.07470311224460602| 0.17972277104854584|112.57057189941406| 0.1946299821138382|   2000-01|2000|
|  35.957|-115.897| 89061|  0.6378185749053955|-0.03187631443142891|  357.138916015625| 0.6386146545410156|   2000-01|2000|
|   39.52|-120.032| 89439|  0.7424972057342529|  0.8004334568977356| 47.15041732788086| 1.0917856693267822|   2000-01|2000|
+-------

In [10]:
df_pollution = df_pollution.drop('Unnamed: 0').drop('GEOID10').drop('year_month_zip')

In [11]:
# clean pollution point sources
df_point_sources = df_point_sources.drop('zip_code').drop('new_zip')\
.withColumnRenamed('combo_zip','point_source_zip')\
.withColumnRenamed('_c0','point_source_index')\
.withColumnRenamed('checked_lat','point_source_lat')\
.withColumnRenamed('checked_lon','point_source_lon')\
.withColumnRenamed('report_year','point_source_year')\
.withColumnRenamed('PM25_emissions_TPY','point_source_pm25_tpy')

df_point_sources = df_point_sources.withColumn('point_source_zip',df_point_sources.point_source_zip.cast(IntegerType()))

# # create an ID field for easier lookups
# df_point_sources['point_source_id'] = [i for i in range(df_point_sources.shape[0])]

In [12]:
df_point_sources.limit(5).show()

+------------------+----------------+----------------+-----------------+---------------------+----------------+
|point_source_index|point_source_lat|point_source_lon|point_source_year|point_source_pm25_tpy|point_source_zip|
+------------------+----------------+----------------+-----------------+---------------------+----------------+
|                 0|       33.811466|      -117.91555|             2002|          1.787853962|           92803|
|                 1|       34.088242|     -117.470116|             2002|               1.7892|           92335|
|                 2|       33.911602|     -118.281799|             2002|          1.791299781|           93420|
|                 3|       37.944618|     -121.325859|             2002|          1.797499934|           95203|
|                 4|       39.221817|     -121.054955|             2002|              1.80154|           95945|
+------------------+----------------+----------------+-----------------+---------------------+----------

In [13]:
df_all = df_schools.join(df_wind,df_schools['school_zip'] == df_wind['ZCTA10'], 'left')

df_all = df_all.withColumnRenamed("u","school_wind_u")\
.withColumnRenamed("v","school_wind_v")\
.withColumnRenamed("wdir","school_wdir")\
.withColumnRenamed("wspd","school_wspd")\
.withColumnRenamed("wind_lat","school_wind_lat")\
.withColumnRenamed("wind_lon","school_wind_lon")\
.withColumnRenamed("ZCTA10","school_wind_zip")

df_all.cache()

df_all.limit(5).show()

# df_all = pd.merge(df_schools, df_wind, left_on = 'school_zip', right_on='ZCTA10', how='left')\
#   .merge(df_census, left_on = ['school_zip', 'year'], right_on=['zip', 'year'], how='left')\
#   .merge(df_pollution, left_on=['school_zip', 'year_month'], right_on=['ZIP10', 'year_month'], how='left')

+--------------+-------------+--------------------+--------------------+-----------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+--------------------+------------------+-------------------+----------+----+
|       cdscode|school_county|school_active_status|       school_street|school_type|school_grades_offered|school_zip|school_open_date|school_closed_date|school_lat| school_lon|school_wind_lat|school_wind_lon|school_wind_zip|       school_wind_u|       school_wind_v|       school_wdir|        school_wspd|year_month|year|
+--------------+-------------+--------------------+--------------------+-----------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+--------------------+------------------+-------------------+----------+----+
|19643376011894|  Los Angeles|    

In [14]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+----+
|cdscode|school_county|school_active_status|school_street|school_type|school_grades_offered|school_zip|school_open_date|school_closed_date|school_lat|school_lon|school_wind_lat|school_wind_lon|school_wind_zip|school_wind_u|school_wind_v|school_wdir|school_wspd|year_month|year|
+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+----+
|      0|            0|                   0|            0|          0|                    0|         0|               0|                 0|         0|         0|     

In [15]:
df_census.printSchema()

root
 |-- year: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- total_population: string (nullable = true)
 |-- total_population_male: string (nullable = true)
 |-- total_population_female: string (nullable = true)
 |-- population_0_4: string (nullable = true)
 |-- population_0_4_male: string (nullable = true)
 |-- population_0_4_female: string (nullable = true)
 |-- population_5_9: string (nullable = true)
 |-- population_5_9_male: string (nullable = true)
 |-- population_5_9_female: string (nullable = true)
 |-- population_10_14: string (nullable = true)
 |-- population_10_14_male: string (nullable = true)
 |-- population_10_14_female: string (nullable = true)
 |-- population_15_19: string (nullable = true)
 |-- population_15_19_male: string (nullable = true)
 |-- population_15_19_female: string (nullable = true)
 |-- total_pop_under19: string (nullable = true)



In [16]:
# avoiding duplicate column names after join
df_census = df_census.withColumnRenamed("year","census_year").withColumnRenamed("zip","census_zip")

In [17]:
df_all = df_all.join(df_census, (df_all['year'] == df_census['census_year']) & (df_all['school_zip'] == df_census['census_zip']), 'left')

df_all.cache()

df_all.limit(5).show()

+--------------+-------------+--------------------+--------------------+-----------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+--------------------+------------------+-------------------+----------+----+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+
|       cdscode|school_county|school_active_status|       school_street|school_type|school_grades_offered|school_zip|school_open_date|school_closed_date|school_lat| school_lon|school_wind_lat|school_wind_lon|school_wind_zip|       school_wind_u|       school_wind_v|       school_wdir|        school_wspd|year_month|year|census_year|c

In [18]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+----+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+
|cdscode|school_county|school_active_status|school_street|school_type|school_grades_offered|school_zip|school_open_date|school_closed_date|school_lat|school_lon|school_wind_lat|school_wind_lon|school_wind_zip|school_wind_u|school_wind_v|school_wdir|school_wspd|year_month|year|census_year|census_zip|total_population|total_population_male|total_population_female|population_0_4|

In [19]:
df_pollution.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- year_month: string (nullable = true)
 |-- ZIP10: string (nullable = true)
 |-- pm25: string (nullable = true)



In [20]:
df_pollution = df_pollution.withColumnRenamed("_c0","pm25_index")\
.withColumnRenamed("year_month","pm25_year_month")\
.withColumnRenamed("ZIP10","pm25_zip")

In [21]:
df_all = df_all.join(df_pollution, (df_all['year_month'] == df_pollution['pm25_year_month']) & (df_all['school_zip'] == df_pollution['pm25_zip']), 'left')

df_all.cache()

df_all.limit(5).show()

+--------------+-------------+--------------------+--------------------+--------------------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+-------------------+-----------------+------------------+----------+----+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+-----+
|       cdscode|school_county|school_active_status|       school_street|         school_type|school_grades_offered|school_zip|school_open_date|school_closed_date|school_lat| school_lon|school_wind_lat|school_wind_lon|school_wind_zip|       school_wind_u|      school_wind_v|      school

In [22]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+----+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+
|cdscode|school_county|school_active_status|school_street|school_type|school_grades_offered|school_zip|school_open_date|school_closed_date|school_lat|school_lon|school_wind_lat|school_wind_lon|school_wind_zip|school_wind_u|school_wind_v|school_wdir|school_wspd|year_month|year|census_year|census_zip|total_population|total_population_m

In [23]:
df_closest_ps_by_year = spark.read.option("header",True).csv(os.path.join(gdrive_path, 'schools/school_year_to_point_lookup.csv'))

In [24]:
df_closest_ps_by_year.limit(5).show()

+---+--------------+-------------+----------+----------------+------------------+----------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+------------------+-------------------+
|_c0|       CDSCode|school_county|school_zip|school_open_date|school_closed_date|school_lat| school_lon|point_source_index|point_source_lat|point_source_lon|point_source_year|point_source_pm25_tpy|point_source_zip|       geod_dist_m|    angle_to_school|
+---+--------------+-------------+----------+----------------+------------------+----------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+------------------+-------------------+
|  0|10621171030071|       Fresno|     93611|        7/1/1967|           No Data| 36.834437| -119.69052|               111|       36.830643|     -119.684105|             2002|           2.30478261|         93612.0| 710.4600630843861|-53.6

In [25]:
df_year_lookup = spark.read.option("header",True).csv(os.path.join(gdrive_path, 'school_pollution_mapping/year_lookup.csv'))

In [26]:
df_all = df_all.join(df_year_lookup,["year"],"left")

In [27]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+
|year|cdscode|school_county|school_active_status|school_street|school_type|school_grades_offered|school_zip|school_open_date|school_closed_date|school_lat|school_lon|school_wind_lat|school_wind_lon|school_wind_zip|school_wind_u|school_wind_v|school_wdir|school_wspd|year_month|census_year|census_zip|total_population|total_

In [28]:
df_all.limit(10).drop()

DataFrame[year: bigint, cdscode: string, school_county: string, school_active_status: string, school_street: string, school_type: string, school_grades_offered: string, school_zip: string, school_open_date: string, school_closed_date: string, school_lat: string, school_lon: string, school_wind_lat: double, school_wind_lon: double, school_wind_zip: bigint, school_wind_u: double, school_wind_v: double, school_wdir: double, school_wspd: double, year_month: string, census_year: string, census_zip: string, total_population: string, total_population_male: string, total_population_female: string, population_0_4: string, population_0_4_male: string, population_0_4_female: string, population_5_9: string, population_5_9_male: string, population_5_9_female: string, population_10_14: string, population_10_14_male: string, population_10_14_female: string, population_15_19: string, population_15_19_male: string, population_15_19_female: string, total_pop_under19: string, pm25_index: string, pm25_yea

In [29]:
# compare data types for join
df_all.printSchema()

root
 |-- year: long (nullable = true)
 |-- cdscode: string (nullable = true)
 |-- school_county: string (nullable = true)
 |-- school_active_status: string (nullable = true)
 |-- school_street: string (nullable = true)
 |-- school_type: string (nullable = true)
 |-- school_grades_offered: string (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_open_date: string (nullable = true)
 |-- school_closed_date: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_wind_zip: long (nullable = true)
 |-- school_wind_u: double (nullable = true)
 |-- school_wind_v: double (nullable = true)
 |-- school_wdir: double (nullable = true)
 |-- school_wspd: double (nullable = true)
 |-- year_month: string (nullable = true)
 |-- census_year: string (nullable = true)
 |-- census_zip: string (nullable = true)
 |-- total_populat

In [30]:
col_list = ["CDSCode","point_source_lat","point_source_lon","point_source_year","point_source_pm25_tpy","point_source_zip","geod_dist_m","angle_to_school"]

df_closest_ps_by_year = df_closest_ps_by_year.select(col_list)
df_closest_ps_by_year = df_closest_ps_by_year.withColumnRenamed("CDSCode","cdscode_closest_ps")\
.withColumnRenamed("geod_dist_m","dist_school_to_ps_m")


# convert year and zip strings to ints for joining
df_closest_ps_by_year = df_closest_ps_by_year.withColumn('point_source_zip',df_closest_ps_by_year.point_source_zip.cast(IntegerType()))
df_closest_ps_by_year = df_closest_ps_by_year.withColumn('point_source_year',df_closest_ps_by_year.point_source_year.cast(IntegerType()))

In [31]:
df_closest_ps_by_year.printSchema()

root
 |-- cdscode_closest_ps: string (nullable = true)
 |-- point_source_lat: string (nullable = true)
 |-- point_source_lon: string (nullable = true)
 |-- point_source_year: integer (nullable = true)
 |-- point_source_pm25_tpy: string (nullable = true)
 |-- point_source_zip: integer (nullable = true)
 |-- dist_school_to_ps_m: string (nullable = true)
 |-- angle_to_school: string (nullable = true)



In [32]:
df_closest_ps_by_year.limit(10).show()

+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+-------------------+
|cdscode_closest_ps|point_source_lat|point_source_lon|point_source_year|point_source_pm25_tpy|point_source_zip|dist_school_to_ps_m|    angle_to_school|
+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+-------------------+
|    10621171030071|       36.830643|     -119.684105|             2002|           2.30478261|           93612|  710.4600630843861|-53.654480572843454|
|    10621171030196|          36.841|       -119.7903|             2008|            1.5648188|           93721|  3345.539540386091| 48.137648661761396|
|    10621176104624|          36.841|       -119.7903|             2008|            1.5648188|           93721| 3985.8146532595038|   82.1616370925463|
|    10621250000000|        36.13427|      -120.38843|             2011|           16.05

In [33]:
df_all = df_all.join(df_closest_ps_by_year,((df_all["cdscode"] == df_closest_ps_by_year['cdscode_closest_ps']) & (df_all['lookup_year'] == df_closest_ps_by_year['point_source_year'])),"left")

In [34]:
df_all.limit(10).drop()

DataFrame[year: bigint, cdscode: string, school_county: string, school_active_status: string, school_street: string, school_type: string, school_grades_offered: string, school_zip: string, school_open_date: string, school_closed_date: string, school_lat: string, school_lon: string, school_wind_lat: double, school_wind_lon: double, school_wind_zip: bigint, school_wind_u: double, school_wind_v: double, school_wdir: double, school_wspd: double, year_month: string, census_year: string, census_zip: string, total_population: string, total_population_male: string, total_population_female: string, population_0_4: string, population_0_4_male: string, population_0_4_female: string, population_5_9: string, population_5_9_male: string, population_5_9_female: string, population_10_14: string, population_10_14_male: string, population_10_14_female: string, population_15_19: string, population_15_19_male: string, population_15_19_female: string, total_pop_under19: string, pm25_index: string, pm25_yea

In [35]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+
|year|cdscode|school_county|school_active_status|school_street|school_type|school_grades_offered|school_zip|school_open_date|school_closed_date|school_lat|school_lon|school_wind

In [36]:
df_wind_2 = df_wind

df_wind_2 = df_wind_2.withColumnRenamed("year","wind_year").withColumnRenamed("year_month","wind_year_month")

In [37]:
df_all = df_all.join(df_wind_2,((df_all['point_source_zip'] == df_wind_2['ZCTA10']) & (df_all['year_month'] == df_wind_2['wind_year_month'])), 'left')
df_all = df_all.drop("wind_year").drop("wind_year_month")

In [38]:
df_all.limit(5).drop()

DataFrame[year: bigint, cdscode: string, school_county: string, school_active_status: string, school_street: string, school_type: string, school_grades_offered: string, school_zip: string, school_open_date: string, school_closed_date: string, school_lat: string, school_lon: string, school_wind_lat: double, school_wind_lon: double, school_wind_zip: bigint, school_wind_u: double, school_wind_v: double, school_wdir: double, school_wspd: double, year_month: string, census_year: string, census_zip: string, total_population: string, total_population_male: string, total_population_female: string, population_0_4: string, population_0_4_male: string, population_0_4_female: string, population_5_9: string, population_5_9_male: string, population_5_9_female: string, population_10_14: string, population_10_14_male: string, population_10_14_female: string, population_15_19: string, population_15_19_male: string, population_15_19_female: string, total_pop_under19: string, pm25_index: string, pm25_yea

In [39]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+--------+--------+------+------+------+------+------+
|year|cdscode|school_county|school_active_status|school_street|school_type|school_grades_offered|school_zip|school_open_date

In [40]:
df_all = df_all.withColumnRenamed("u","ps_wind_u")\
.withColumnRenamed("v","ps_wind_v")\
.withColumnRenamed("wdir","ps_wdir")\
.withColumnRenamed("wspd","ps_wspd")\
.withColumnRenamed("wind_lat","ps_wind_lat")\
.withColumnRenamed("wind_lon","ps_wind_lon")\
.withColumnRenamed("ZCTA10","ps_wind_zip")

df_all.cache()

# learned we can use .drop() instead of .show() to get a nice output. So stupid!
# upon restarting kernel, does not work. so strange.
df_all.limit(5).drop()

DataFrame[year: bigint, cdscode: string, school_county: string, school_active_status: string, school_street: string, school_type: string, school_grades_offered: string, school_zip: string, school_open_date: string, school_closed_date: string, school_lat: string, school_lon: string, school_wind_lat: double, school_wind_lon: double, school_wind_zip: bigint, school_wind_u: double, school_wind_v: double, school_wdir: double, school_wspd: double, year_month: string, census_year: string, census_zip: string, total_population: string, total_population_male: string, total_population_female: string, population_0_4: string, population_0_4_male: string, population_0_4_female: string, population_5_9: string, population_5_9_male: string, population_5_9_female: string, population_10_14: string, population_10_14_male: string, population_10_14_female: string, population_15_19: string, population_15_19_male: string, population_15_19_female: string, total_pop_under19: string, pm25_index: string, pm25_yea

In [41]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+
|year|cdscode|school_county|school_active_status|school_street|school_type|school_grades_offered|school_z

In [42]:
df_all.count()

2983111

In [43]:
# df_all.write.parquet('C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\joined_data_v2_10_2_22.parquet')

## Next steps: create column for pollution source wind which infers from schools
## Then: recalculate wind dir for school and pollution source u and v
## Lastly: compute the angle differences between school bearing and wind dirs, taking average

In [44]:
df_all = df_all.withColumn('ps_wind_u_merge',when(df_all.ps_wspd.isNull(),df_all.school_wind_u).otherwise(df_all.ps_wind_u))
df_all = df_all.withColumn('ps_wind_v_merge',when(df_all.ps_wspd.isNull(),df_all.school_wind_v).otherwise(df_all.ps_wind_v))
df_all = df_all.withColumn('ps_wdir_merge',when(df_all.ps_wspd.isNull(),df_all.school_wdir).otherwise(df_all.ps_wdir))
df_all = df_all.withColumn('ps_wspd_merge',when(df_all.ps_wspd.isNull(),df_all.school_wspd).otherwise(df_all.ps_wspd))

In [45]:
df_all.filter(df_all.ps_wspd.isNotNull()).filter(df_all.ps_wspd != df_all.school_wspd).limit(10).show()

+----+--------------+-------------+--------------------+--------------------+-----------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+-------------------+--------------------+------------------+-------------------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+------------------+-------------------+---------------------+----------------+---------------------+-----------------------+------------------+---------------------+-----------------------+-----------------+----------+---------------+--------+---------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+-------------------+-----------+-----------+-----------+-------------------+--------------------+-------------------+-------------------+-----

In [46]:
# spot check for nulls

from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+---------------+---------------+-------------+-------------+
|year|cdscode|school_county|school_active_sta

In [47]:
def convert_u_v_wdir(u, v):
    """convert wind speed in u (+east) and v (+north) to a bearing from (-180, 180] with 0 degrees being due north. 

    Args:
        u (float): windspeed arrow going towards east
        v (float): windspeed arrow going towards north

    Returns:
        float: wdir in degrees measured from (-180, 180], compatible with azimuth/bearing from pyproj.geod.inv
    """
    
    if not u or not v:
        return None

    angle_rad = atan2(u, v)
    angle_deg = 180*angle_rad/pi

    return angle_deg

In [48]:
udf_convert_u_v_wdir = F.udf(convert_u_v_wdir)

In [47]:
# original udf run for school wdir

# df_all = df_all.withColumn('school_wdir_wrt_0N',\
#                                      udf_convert_u_v_wdir(df_all.school_wind_u, df_all_test.school_wind_v).cast('double'))

In [49]:
# original udf for point source wdir

# df_all = df_all.withColumn('ps_wdir_wrt_0N',\
#                                      udf_convert_u_v_wdir(df_all.ps_wind_u_merge, df_all.ps_wind_v_merge).cast('double'))

In [52]:
# try to compute wind bearing wrt 0N using built-in Pyspark

df_all = df_all.withColumn('school_wdir_wrt_0N',(180*F.atan2(df_all.school_wind_u, df_all.school_wind_v)/(3.141592653589793238462)).cast('double'))

In [53]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+---------------+---------------+-------------+-------------+------------------+
|year|cdscode|school_count

In [54]:
# try to compute wind bearing wrt 0N using built-in Pyspark

df_all = df_all.withColumn('ps_wdir_wrt_0N',(180*F.atan2(df_all.ps_wind_u_merge, df_all.ps_wind_v_merge)/(3.141592653589793238462)).cast('double'))

In [55]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+---------------+---------------+-------------+-------------+------------------+--------------+
|year|cdsco

In [59]:
test_cols = ['school_wdir','school_wdir_wrt_0N',\
             'ps_wdir','ps_wdir_wrt_0N',\
             'school_wind_u','school_wind_v','ps_wind_u','ps_wind_v']

df_all.select(test_cols).filter(df_all.ps_wind_u.isNotNull()).filter(df_all.school_wdir_wrt_0N != df_all.ps_wdir_wrt_0N).limit(10).show()

+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+--------------------+
|       school_wdir|school_wdir_wrt_0N|            ps_wdir|    ps_wdir_wrt_0N|      school_wind_u|       school_wind_v|          ps_wind_u|           ps_wind_v|
+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+--------------------+
| 19.44426918029785| 70.55572942519163|  35.76771926879883|54.232279912986044|  1.169479250907898|  0.4128546118736267| 0.7486752867698669|   0.539320707321167|
|  351.636474609375| 98.36353130324896|  358.9339904785156| 91.06599514689991| 1.2767947912216187|-0.18771032989025116| 0.9332471489906311|-0.01736518368124962|
|  355.478759765625|  94.5212546601001| 304.06695556640625|145.93306460632525| 1.0174813270568848|-0.08045731484889984|0.10566005855798721| -0.1562534123659134|
|347.58514404296875|102.4148562314

In [53]:
df_all.select(test_cols).filter(df_all.ps_wdir.isNotNull()).filter(df_all.ps_wind_u != df_all.school_wind_u).limit(10).show()

DataFrame[school_wdir: double, school_wdir_wrt_0N: double, ps_wdir: double, ps_wdir_wrt_0N: double, school_wind_u: double, school_wind_v: double, ps_wind_u: double, ps_wind_v: double]

In [60]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+---------------+---------------+-------------+-------------+------------------+--------------+
|year|cdsco

In [54]:
def abs_of_diff(angle1, angle2):
    if not angle1 or not angle2:
        return None
    else:
        return abs(angle1 - angle2)

In [55]:
udf_abs_of_diff = F.udf(abs_of_diff)

In [56]:
def null_safe_avg(n1, n2):
    if not n1 or not n2:
        return None
    else:
        return (n1 + n2)/2

In [57]:
udf_null_safe_avg = F.udf(null_safe_avg)

In [58]:
# previously used UDFs to compute--converted to native, which run much faster and handle nulls better

# df_all_test = df_all_test.withColumn("school_wind_alignment",udf_abs_of_diff(df_all_test.school_wdir_wrt_0N, df_all_test.angle_to_school).cast('double'))

# df_all_test = df_all_test.withColumn("ps_wind_alignment",udf_abs_of_diff(df_all_test.ps_wdir_wrt_0N, df_all_test.angle_to_school).cast('double'))

# df_all_test = df_all_test.withColumn("avg_wind_alignment",udf_null_safe_avg(df_all_test.ps_wind_alignment, df_all_test.school_wind_alignment).cast('double'))

https://gamedev.stackexchange.com/questions/4467/comparing-angles-and-working-out-the-difference

In [61]:
df_all = df_all.withColumn("school_wind_alignment",180 - abs(abs(df_all.school_wdir_wrt_0N - df_all.angle_to_school) - 180).cast('double'))

df_all = df_all.withColumn("ps_wind_alignment",180 - abs(abs(df_all.ps_wdir_wrt_0N - df_all.angle_to_school) - 180).cast('double'))

df_all = df_all.withColumn("avg_wind_alignment",((df_all.ps_wind_alignment + df_all.school_wind_alignment)/2).cast('double'))

In [62]:
df_all.printSchema()

root
 |-- year: long (nullable = true)
 |-- cdscode: string (nullable = true)
 |-- school_county: string (nullable = true)
 |-- school_active_status: string (nullable = true)
 |-- school_street: string (nullable = true)
 |-- school_type: string (nullable = true)
 |-- school_grades_offered: string (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_open_date: string (nullable = true)
 |-- school_closed_date: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_wind_zip: long (nullable = true)
 |-- school_wind_u: double (nullable = true)
 |-- school_wind_v: double (nullable = true)
 |-- school_wdir: double (nullable = true)
 |-- school_wspd: double (nullable = true)
 |-- year_month: string (nullable = true)
 |-- census_year: string (nullable = true)
 |-- census_zip: string (nullable = true)
 |-- total_populat

In [63]:
test_cols2 = ['angle_to_school','school_wdir_wrt_0N',\
             'ps_wdir_wrt_0N',"school_wind_alignment", "ps_wind_alignment",\
                         'avg_wind_alignment']

df_all.select(test_cols2).limit(10).show()

+-------------------+-------------------+-------------------+---------------------+------------------+------------------+
|    angle_to_school| school_wdir_wrt_0N|     ps_wdir_wrt_0N|school_wind_alignment| ps_wind_alignment|avg_wind_alignment|
+-------------------+-------------------+-------------------+---------------------+------------------+------------------+
|-112.69585791567243| 159.88589549578458| 159.88589549578458|    87.41824658854296| 87.41824658854296| 87.41824658854296|
|-112.69585791567243| 159.88589549578458| 159.88589549578458|    87.41824658854296| 87.41824658854296| 87.41824658854296|
|-113.61098323048188| 159.88589549578458| 159.88589549578458|    86.50312127373354| 86.50312127373354| 86.50312127373354|
|   74.1806907307745|-52.623776587312655|-52.623776587312655|   126.80446731808715|126.80446731808715|126.80446731808715|
| 129.10846544104348|  5.874323870140522|  5.874323870140522|   123.23414157090295|123.23414157090295|123.23414157090295|
| 125.27784582729981|-17

In [64]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+---------------+---------------+-------------+-------------+------------------+--------------+------------

In [73]:
df_all = df_all.withColumn("avg_wind_alignment_cosine",cos(((3.1415926)*df_all.avg_wind_alignment/180)) + 1)

In [66]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+----+-------+-------------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+---------------+---------------+-------------+-------------+------------------+--------------+------------

In [74]:
df_all.limit(10).show()

+----+--------------+-------------+--------------------+--------------------+--------------------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+--------------------+------------------+-------------------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+------------------+-------------------+---------------------+------------------+---------------------+-----------------------+------------------+---------------------+-----------------------+-----------------+----------+---------------+--------+---------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+-------------------+-----------+-----------+-----------+---------+---------+-------+-------+--------------------+-----------------

In [69]:
df_all.count()

2983111

In [70]:
df_all.printSchema()

root
 |-- year: long (nullable = true)
 |-- cdscode: string (nullable = true)
 |-- school_county: string (nullable = true)
 |-- school_active_status: string (nullable = true)
 |-- school_street: string (nullable = true)
 |-- school_type: string (nullable = true)
 |-- school_grades_offered: string (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_open_date: string (nullable = true)
 |-- school_closed_date: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- school_wind_lat: double (nullable = true)
 |-- school_wind_lon: double (nullable = true)
 |-- school_wind_zip: long (nullable = true)
 |-- school_wind_u: double (nullable = true)
 |-- school_wind_v: double (nullable = true)
 |-- school_wdir: double (nullable = true)
 |-- school_wspd: double (nullable = true)
 |-- year_month: string (nullable = true)
 |-- census_year: string (nullable = true)
 |-- census_zip: string (nullable = true)
 |-- total_populat

In [75]:
df_all.write.parquet('C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\joined_data_v3_10_2_22.parquet')

## Reloading to add avg wind spd, total pediatric pop by gender per county, region name, and load in elevation data


In [None]:
parquet_path = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\joined_data_v3_10_2_22.parquet'

In [76]:
school_elevations = spark.read.parquet(os.path.join(gdrive_path, 'schools/school_elevations.parquet'))
ps_elevations = spark.read.option("header",True).csv(os.path.join(gdrive_path, 'Point source/pollution_point_source_elevations.csv'))

In [77]:
school_elevations.show()

+--------------+----------+-----------+----------------+-----------------+
|       cdscode|school_lat| school_lon|elevation_meters|__index_level_0__|
+--------------+----------+-----------+----------------+-----------------+
|20102070109728| 37.115210| -120.26603|           70.84|                0|
|20102072030187| 37.115210| -120.26603|           70.84|                1|
|20651930000000| 37.122449| -120.26487|           72.77|                2|
|20651930113076| 37.131113|-120.242025|           77.12|                3|
|20651936023907| 37.115243| -120.26777|           70.66|                4|
|20652010000000| 37.116435| -120.26264|           71.38|                5|
|20652012030047| 37.114421|-120.262612|           71.01|                6|
|20652012032357| 37.114421|-120.262612|           71.01|                7|
|15636770113837|  35.10877|-117.956403|          726.11|                8|
|15636770114512| 35.138427| -117.97325|          721.91|                9|
|15636776009823|  35.1223

In [78]:
ps_elevations.show()

+----------------+----------------+----------------+
|point_source_lat|point_source_lon|elevation_meters|
+----------------+----------------+----------------+
|         37.1056|       -120.2487|            73.3|
|        35.15125|      -118.01667|          740.08|
|       35.513841|     -118.901299|           250.9|
|         35.5767|       -119.7854|           175.1|
|       34.534084|     -117.861722|          900.09|
|        36.13427|      -120.38843|          248.32|
|        33.03891|      -116.91364|          423.96|
|        36.15631|      -119.32617|            83.5|
|          34.935|      -119.55425|          719.21|
|         34.8996|        -120.465|           80.81|
|        38.99018|      -122.89972|          417.72|
|        38.91713|      -121.35401|           35.62|
|          35.456|       -116.0208|          712.94|
|        38.67388|      -121.87208|           35.47|
|       34.898505|      -117.05719|          650.03|
|        36.66634|      -119.44985|          1

In [80]:
# add columns for total pediatric population by gender

df_all = df_all.withColumn("pop_under19_male",df_all.population_0_4_male + df_all.population_5_9_male + df_all.population_10_14_male + df_all.population_15_19_male.cast("double"))

df_all = df_all.withColumn("pop_under19_female",df_all.population_0_4_female + df_all.population_5_9_female + df_all.population_10_14_female + df_all.population_15_19_female.cast("double"))

In [81]:
df_all.show()

+----+--------------+-------------+--------------------+--------------------+--------------------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+--------------------+------------------+-------------------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+------------------+-------------------+---------------------+------------------+---------------------+-----------------------+------------------+---------------------+-----------------------+------------------+----------+---------------+--------+---------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+-------------------+-----------+-----------+-----------+---------+---------+-------+-------+--------------------+----------------

In [85]:
# add avg wind speed

df_all = df_all.withColumn("avg_wind_speed", ((df_all.ps_wspd_merge + df_all.school_wspd) / 2).cast('double'))

df_all.filter(df_all.ps_wspd.isNotNull()).filter(df_all.ps_wspd != df_all.school_wspd).show()

+----+--------------+-------------+--------------------+--------------------+--------------------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+-------------------+--------------------+------------------+-------------------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+------------------+-------------------+---------------------+----------------+---------------------+-----------------------+------------------+---------------------+-----------------------+-----------------+----------+---------------+--------+---------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+-------------------+-----------+-----------+-----------+-------------------+--------------------+-------------------+----------------

In [88]:
regions_by_county = spark.read.option("header",True).csv(os.path.join(gdrive_path, 'county region lookup.csv'))

regions_by_county.show()

+-------------+-------------------+-------------+
|region_number|        region_name|school_county|
+-------------+-------------------+-------------+
|            1|Superior California|        Butte|
|            1|Superior California|       Colusa|
|            1|Superior California|    El Dorado|
|            1|Superior California|        Glenn|
|            1|Superior California|       Lassen|
|            1|Superior California|        Modoc|
|            1|Superior California|       Nevada|
|            1|Superior California|       Placer|
|            1|Superior California|       Plumas|
|            1|Superior California|   Sacramento|
|            1|Superior California|       Shasta|
|            1|Superior California|       Sierra|
|            1|Superior California|     Siskiyou|
|            1|Superior California|       Sutter|
|            1|Superior California|       Tehama|
|            1|Superior California|         Yolo|
|            1|Superior California|         Yuba|


In [89]:
# join county data

df_all = df_all.join(regions_by_county, ['school_county'], 'left')

In [90]:
df_all.show()

+-------------+----+--------------+--------------------+--------------------+--------------------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+--------------------+------------------+-------------------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+------------------+-------------------+---------------------+------------------+---------------------+-----------------------+------------------+---------------------+-----------------------+------------------+----------+---------------+--------+---------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+-------------------+-----------+-----------+-----------+---------+---------+-------+-------+--------------------+----------------

In [93]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+-------------+----+-------+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+----------------+----------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+---------------+---------------+-------------+-------------+------------------+--------------+------------

In [94]:
df_all = df_all.join(ps_elevations, ['point_source_lat','point_source_lon'], 'left')

df_all = df_all.withColumnRenamed("elevation_meters","ps_elevation_m")

In [98]:
df_all = df_all.join(school_elevations, ['cdscode'], 'left')

df_all = df_all.withColumnRenamed("elevation_meters","school_elevation_m")

In [99]:
df_all.show()

+--------------+----------------+----------------+-------------+----+--------------------+--------------------+--------------------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+--------------------+------------------+-------------------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+------------------+-------------------+---------------------+------------------+---------------------+-----------------------+------------------+---------------------+-----------------------+------------------+----------+---------------+--------+---------+-----------+------------------+-----------------+---------------------+----------------+-------------------+-------------------+-----------+-----------+-----------+---------+---------+-------+-------+--------------------+----------------

In [100]:
# accidentally acquired some duplicate colnames; fixing

colnames = df_all.columns
print(colnames)

['cdscode', 'point_source_lat', 'point_source_lon', 'school_county', 'year', 'school_active_status', 'school_street', 'school_type', 'school_grades_offered', 'school_zip', 'school_open_date', 'school_closed_date', 'school_lat', 'school_lon', 'school_wind_lat', 'school_wind_lon', 'school_wind_zip', 'school_wind_u', 'school_wind_v', 'school_wdir', 'school_wspd', 'year_month', 'census_year', 'census_zip', 'total_population', 'total_population_male', 'total_population_female', 'population_0_4', 'population_0_4_male', 'population_0_4_female', 'population_5_9', 'population_5_9_male', 'population_5_9_female', 'population_10_14', 'population_10_14_male', 'population_10_14_female', 'population_15_19', 'population_15_19_male', 'population_15_19_female', 'total_pop_under19', 'pm25_index', 'pm25_year_month', 'pm25_zip', 'pm25', 'lookup_year', 'cdscode_closest_ps', 'point_source_year', 'point_source_pm25_tpy', 'point_source_zip', 'dist_school_to_ps_m', 'angle_to_school', 'ps_wind_lat', 'ps_wind_lon

In [102]:
df_all_rename = df_all.toDF(*map(str, range(len(colnames))))
print(df_all_rename.columns)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77']


In [103]:
df_all_rename = df_all_rename.drop(df_all_rename.columns[-1])
del colnames[-1]
df_all_rename = df_all_rename.drop(df_all_rename.columns[-2])
del colnames[-2]
df_all_rename = df_all_rename.drop(df_all_rename.columns[-2])
del colnames[-2]

print(colnames)

['cdscode', 'point_source_lat', 'point_source_lon', 'school_county', 'year', 'school_active_status', 'school_street', 'school_type', 'school_grades_offered', 'school_zip', 'school_open_date', 'school_closed_date', 'school_lat', 'school_lon', 'school_wind_lat', 'school_wind_lon', 'school_wind_zip', 'school_wind_u', 'school_wind_v', 'school_wdir', 'school_wspd', 'year_month', 'census_year', 'census_zip', 'total_population', 'total_population_male', 'total_population_female', 'population_0_4', 'population_0_4_male', 'population_0_4_female', 'population_5_9', 'population_5_9_male', 'population_5_9_female', 'population_10_14', 'population_10_14_male', 'population_10_14_female', 'population_15_19', 'population_15_19_male', 'population_15_19_female', 'total_pop_under19', 'pm25_index', 'pm25_year_month', 'pm25_zip', 'pm25', 'lookup_year', 'cdscode_closest_ps', 'point_source_year', 'point_source_pm25_tpy', 'point_source_zip', 'dist_school_to_ps_m', 'angle_to_school', 'ps_wind_lat', 'ps_wind_lon

In [105]:
df_all = df_all_rename.toDF(*colnames)

df_all.show()

+--------------+----------------+----------------+-------------+----+--------------------+--------------------+--------------------+---------------------+----------+----------------+------------------+----------+-----------+---------------+---------------+---------------+--------------------+--------------------+------------------+-------------------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+------------------+-------------------+---------------------+------------------+---------------------+-----------------------+------------------+---------------------+-----------------------+------------------+----------+---------------+--------+---------+-----------+------------------+-----------------+---------------------+----------------+-------------------+-------------------+-----------+-----------+-----------+---------+---------+-------+-------+--------------------+----------------

In [106]:
# spot check for nulls

df_all.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df_all.columns]).show()

+-------+----------------+----------------+-------------+----+--------------------+-------------+-----------+---------------------+----------+----------------+------------------+----------+----------+---------------+---------------+---------------+-------------+-------------+-----------+-----------+----------+-----------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+-----------------+----------+---------------+--------+------+-----------+------------------+-----------------+---------------------+----------------+-------------------+---------------+-----------+-----------+-----------+---------+---------+-------+-------+---------------+---------------+-------------+-------------+------------------+--------------+------------

In [107]:
df_all.write.parquet('C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\joined_data_v4_10_3_22.parquet')