# About

Join 2022-11-05
- This time, get the three nearest pollution sources per school. 

Nearly everything is done in pandas; only the final step (the cross join and distance ranking) runs in Spark so that it is parallelized.

In [35]:
import pandas as pd 
import numpy as np
import os 
import datetime
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as mticker
import plotly.express as px

# spark
from pyspark.sql import SparkSession
from pyspark.sql import types, functions as F, Window

# Show every column when a wide frame is displayed.
pd.options.display.max_columns = None
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

In [3]:
# Select where the shared project Data folder lives on this machine.
# Supported values: 'gdrive' (Google Colab with Drive mounted), 'local', 'work'.
path_source = 'work'

if path_source == 'gdrive':
  # Only importable inside Colab, so the import stays in this branch.
  from google.colab import drive
  drive.mount('/content/gdrive')
  data_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/Data'

elif path_source == 'local':
  data_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'

elif path_source == 'work':
  data_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'

else:
  # Fail here with a clear message instead of a NameError on data_path in a later cell.
  raise ValueError(
    "Unknown path_source {!r}; expected one of 'gdrive', 'local', 'work'".format(path_source))

In [7]:
# Locations of the raw inputs, all relative to data_path.
census_csv = os.path.join(data_path, 'census/census_bureau_clean/census_bureau.csv')
wind_parquet = os.path.join(data_path, 'wind')
pollution_csv = os.path.join(data_path, 'AirPollution/UW_pm25_zip_monthly_anand_2000-2018-v2.csv')
point_sources_csv = os.path.join(data_path, 'Point source/pollution_point_sources.csv')
schools_csv = os.path.join(data_path, 'schools/filtered_joined_schools_data.csv')

df_census = pd.read_csv(census_csv)
df_wind = pd.read_parquet(wind_parquet)
df_pollution = pd.read_csv(pollution_csv)
df_point_sources = pd.read_csv(point_sources_csv)

# The schools file is read through an explicit file handle so that bytes which do not
# decode as utf8 are replaced with backslash escapes instead of raising
# (presumably why the plain pd.read_csv call was abandoned -- TODO confirm).
file_encoding = 'utf8'
with open(schools_csv, encoding=file_encoding, errors='backslashreplace') as schools_file:
  df_schools = pd.read_csv(schools_file, low_memory=False)

In [8]:
# ---- schools ----
# Lower-case the headers, keep only the fields used downstream and give them school_* names.
df_schools.columns = [col.lower() for col in df_schools.columns]
school_renames = {
  'statustype': 'school_active_status',
  'county': 'school_county',
  'street': 'school_street',
  'zip_first_five': 'school_zip',
  'opendate': 'school_open_date',
  'closeddate': 'school_closed_date',
  'eilname': 'school_type',
  'gsoffered': 'school_grades_offered',
  'latitude': 'school_lat',
  'longitude': 'school_lon',
  'lastupdate': 'school_last_updated_date',
}
# cdscode keeps its name; every other kept column is one of the renamed ones.
df_schools = df_schools[['cdscode'] + list(school_renames)].rename(columns=school_renames)

# ---- wind ----
df_wind = df_wind.rename(columns={'lat': 'wind_lat', 'lon': 'wind_lon'})
# year_month becomes '<first 4 chars>-<last 2 chars>' of its string form.
year_month_str = df_wind['year_month'].astype(str)
df_wind['year_month'] = year_month_str.str[:4] + '-' + year_month_str.str[-2:]
df_wind['year'] = df_wind['year_month'].map(lambda ym: int(ym[:4]))
df_wind['ZCTA10'] = df_wind['ZCTA10'].astype(int)
# keep 2000 through 2019, both ends included
df_wind = df_wind[df_wind['year'].between(2000, 2019)]

# ---- pollution ----
df_pollution = df_pollution.drop(columns=['Unnamed: 0', 'GEOID10', 'year_month_zip'])

# ---- pollution point sources ----
df_point_sources = df_point_sources.rename(columns={'zip_code': 'point_source_zip'})
df_point_sources['point_source_zip'] = df_point_sources['point_source_zip'].astype(int)
# running 0..n-1 row number used as an ID for easier lookups
df_point_sources['point_source_id'] = list(range(len(df_point_sources)))

In [9]:
# Left joins keep every school row: wind by zip, then census by zip + year,
# then PM2.5 pollution by zip + year_month.
df_all = (
  df_schools
  .merge(df_wind, how='left', left_on='school_zip', right_on='ZCTA10')
  .merge(df_census, how='left', left_on=['school_zip', 'year'], right_on=['zip', 'year'])
  .merge(df_pollution, how='left', left_on=['school_zip', 'year_month'], right_on=['ZIP10', 'year_month'])
)

In [10]:
# QA Checks
# each school is repeated for every year-month combo. But some schools dont have wind/population data where we dont have that zip code in those datasets. 
yr_mo = df_wind[['year_month']].drop_duplicates().shape[0]
print(f'There are {yr_mo} year month combos')
# Report the computed count rather than a hard-coded 240 so the message cannot go stale.
print(f'So most schools are repeated {yr_mo} times, for the schools that dont have a zip code in the wind data, there are no obs')
# Rows per school, then number of schools for each row count.
# Counting the counts directly avoids .to_frame().value_counts('cdscode'), which relies on
# the pre-2.0 pandas column naming and raises KeyError on pandas >= 2.0.
df_all['cdscode'].value_counts().rename('rows_per_school').value_counts()

There are 240 year month combos
So most schools are repeated 240 times, for the schools that dont have a zip code in the wind data, there are no obs


cdscode
240    12426
1        871
dtype: int64

## Lat/Lon Join

Join the above dataset to pull the three nearest pollution sources for each school, by year. 

Do so by creating a school <--> source mapping by year

In [11]:
# Point-source data is not available for every year, so each data year is assigned
# the report year whose emissions stand in for it: the next available report year,
# or 2017 for data years after the last report.
_report_year_coverage = [
  (2002, range(2000, 2003)),
  (2005, range(2003, 2006)),
  (2008, range(2006, 2009)),
  (2011, range(2009, 2012)),
  (2014, range(2012, 2015)),
  (2017, range(2015, 2020)),
]

# data year -> report year to use from the point-source table
year_mapping = {
  data_year: report_year
  for report_year, data_years in _report_year_coverage
  for data_year in data_years
}

In [None]:
# Local Spark session using all available cores ("local[*]"); reuses an existing session if there is one.
spark = (
    SparkSession.builder
    .appName('')
    .master("local[*]")
    .getOrCreate()
)

sc = spark.sparkContext

In [48]:
# Slim inputs for the nearest-source lookup.
# NOTE: from here on df_schools is rebound to one row per distinct
# (cdscode, year, school_lat, school_lon) taken from df_all; it no longer holds the cleaned schools table.
df_schools = df_all[
  ['cdscode', 'year', 'school_lat', 'school_lon']
].drop_duplicates()

# Point-source fields needed for the distance ranking and the final mapping.
df_ps = df_point_sources[
  ['point_source_id', 'checked_lat', 'checked_lon', 'point_source_zip', 'report_year', 'PM25_emissions_TPY']
]

In [64]:
def get_top_3_polluters(data_year, verbose=True):
  """Return the (up to) three nearest pollution point sources for each school in one data year.

  The schools of `data_year` (rows of df_schools) are cross joined in Spark with the
  point sources (rows of df_ps) whose report_year is year_mapping[data_year]. Each
  pair gets a straight-line distance computed on the raw lat/lon values, pairs are
  ranked per school by that distance, and only ranks 1-3 are kept.

  Args:
    data_year: year to select from df_schools.year; must be a key of year_mapping.
    verbose: when True, print the year being processed.

  Returns:
    pandas DataFrame with the df_schools and df_ps columns plus `distance_euclid`
    and `dist_rank` (1 = nearest).

  Raises:
    KeyError: if data_year is not a key of year_mapping.
  """
  if verbose:
    print('Working on year: {}'.format(data_year))

  # get spark df of individual year
  df_ps_i = spark.createDataFrame(df_ps[df_ps.report_year == year_mapping[data_year]])
  df_school_yr_i = spark.createDataFrame(df_schools[df_schools.year == data_year])

  # NOTE(review): the distance is Euclidean in degrees, not a ground distance. A degree
  # of longitude and a degree of latitude cover different ground distances, so the
  # ranking can differ from a true nearest-neighbour ranking -- confirm this is acceptable.
  distance = (
    (F.col('school_lat') - F.col('checked_lat'))**2
    + (F.col('school_lon') - F.col('checked_lon'))**2
  )**.5

  # Rank sources per school by distance. point_source_id breaks ties so that
  # equidistant sources always get the same rank from run to run.
  nearest_first = Window.partitionBy('cdscode').orderBy(
    F.col('distance_euclid'), F.col('point_source_id'))

  # cross join, calc distance, then rank the distances, then filter on top 3
  df_x = df_school_yr_i.crossJoin(df_ps_i)\
    .withColumn('distance_euclid', distance)\
    .withColumn('dist_rank', F.row_number().over(nearest_first))\
    .filter(F.col('dist_rank') <= 3)

  return df_x.toPandas()

In [None]:
# Build the lookup one data year at a time (2000 through 2019), then stack the yearly frames.
yearly_top_polluters = [get_top_3_polluters(data_year) for data_year in range(2000, 2020)]
top_polluters_df = pd.concat(yearly_top_polluters)

In [74]:
# every school/year combo has 3 records
# n = rows per (cdscode, year); the result counts how many combos have each n.
# groupby().size() is used because value_counts().to_frame() names its count column
# 0 before pandas 2.0 and 'count' from 2.0 on, which breaks rename(columns={0: 'n'}).
top_polluters_df.groupby(['cdscode', 'year']).size().rename('n').value_counts()

n
3    248520
dtype: int64

In [97]:
top_polluters_df.head(3)

Unnamed: 0,cdscode,year,school_lat,school_lon,point_source_id,checked_lat,checked_lon,point_source_zip,report_year,PM25_emissions_TPY,distance_euclid,dist_rank
0,1611190131805,2000.0,37.77693,-122.28528,518,37.789864,-122.293587,94501,2002,6.9153,0.015372,1
1,1611190131805,2000.0,37.77693,-122.28528,163,37.798897,-122.282364,94607,2002,2.631207,0.02216,2
2,1611190131805,2000.0,37.77693,-122.28528,248,37.790672,-122.261948,94606,2002,3.168564,0.027078,3


In [94]:
# Schools present in df_schools that never made it into the top-3 lookup.
all_schools = list(set(df_schools['cdscode'].values))
# One set difference instead of scanning the whole lookup column once per school.
missing_schools = np.setdiff1d(all_schools, top_polluters_df['cdscode'].unique())
print('{} schools missing from the final lookup.\nThese are the schools that dont have a valid year field'.format(len(missing_schools)))
df_schools[df_schools['cdscode'].isin(missing_schools)]

871 schools missing from the final lookup.
These are the schools that dont have a valid year field


Unnamed: 0,cdscode,year,school_lat,school_lon
480,1100170112607,,37.804520,-122.26815
2881,1100170136101,,37.603623,-122.02530
38162,1611920131334,,37.661939,-122.05792
38403,1611920133520,,37.672622,-122.09814
42004,1611926001101,,37.652934,-122.09406
...,...,...,...,...
2895746,56724700000000,,34.262484,-119.09424
2896227,56724706055123,,34.262484,-119.09424
2925508,56726110000000,,34.258820,-118.99756
2925749,56726116055834,,34.258820,-118.99756


In [98]:
print('In the end, we should end up with this many records.\nB/c there are 3 nearest polluters for each school for each year')
print(df_schools[['cdscode', 'year']].drop_duplicates().shape[0] * 3)

# Schools with an NA year are absent from top_polluters_df; add their 3 rows each back in
# so the two counts are comparable. The number comes from missing_schools rather than a
# hard-coded literal so it follows the data.
n_missing = len(missing_schools)
res_count = top_polluters_df.shape[0] + n_missing*3
print('Here is the count in the end: {}. We have to add in the {} missing schools that have NA year'.format(res_count, n_missing))

In the end, we should end up with this many records.
B/c there are 3 nearest polluters for each school for each year
748173
Here is the count in the end: 748173. We have to add in the 871 missing schools that have NA year


In [100]:
# Give the point-source coordinates a ps_ prefix.
ps_coord_names = {'checked_lat': 'ps_lat', 'checked_lon': 'ps_lon'}
top_polluters_df = top_polluters_df.rename(columns=ps_coord_names)

In [101]:
top_polluters_df.head(3)

Unnamed: 0,cdscode,year,school_lat,school_lon,point_source_id,ps_lat,ps_lon,point_source_zip,report_year,PM25_emissions_TPY,distance_euclid,dist_rank
0,1611190131805,2000.0,37.77693,-122.28528,518,37.789864,-122.293587,94501,2002,6.9153,0.015372,1
1,1611190131805,2000.0,37.77693,-122.28528,163,37.798897,-122.282364,94607,2002,2.631207,0.02216,2
2,1611190131805,2000.0,37.77693,-122.28528,248,37.790672,-122.261948,94606,2002,3.168564,0.027078,3


In [113]:
# Go from one row per (school, year, dist_rank) to one row per (school, year):
# each point-source field gets one column per dist_rank value.
school_year_keys = ['cdscode', 'year', 'school_lat', 'school_lon']
ps_fields = ['point_source_id', 'ps_lat', 'ps_lon', 'PM25_emissions_TPY', 'distance_euclid']

top_polluters_df2 = top_polluters_df.pivot(
  index=school_year_keys,
  columns=['dist_rank'],
  values=ps_fields,
)

top_polluters_df2.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,point_source_id,point_source_id,point_source_id,ps_lat,ps_lat,ps_lat,ps_lon,ps_lon,ps_lon,PM25_emissions_TPY,PM25_emissions_TPY,PM25_emissions_TPY,distance_euclid,distance_euclid,distance_euclid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,dist_rank,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
cdscode,year,school_lat,school_lon,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1100170000000,2000.0,37.658212,-122.09713,774.0,608.0,385.0,37.667973,37.633793,37.635574,-122.103806,-122.124954,-122.052284,25.813472,9.581375,4.616791,0.011826,0.03702,0.050236
1100170000000,2001.0,37.658212,-122.09713,774.0,608.0,385.0,37.667973,37.633793,37.635574,-122.103806,-122.124954,-122.052284,25.813472,9.581375,4.616791,0.011826,0.03702,0.050236
1100170000000,2002.0,37.658212,-122.09713,774.0,608.0,385.0,37.667973,37.633793,37.635574,-122.103806,-122.124954,-122.052284,25.813472,9.581375,4.616791,0.011826,0.03702,0.050236


In [107]:
# number of rows in the wide table
len(top_polluters_df2)

248520

In [None]:
# write this mapping table as parquet to disk
# top_polluters_df is the concatenation of the per-year lookups (one row per school /
# year / dist_rank). The previous source name, school_ps_mapping, is not defined anywhere
# in this notebook, so this cell failed with a NameError on a fresh kernel.
# NOTE(review): assumes the long-format table is what should be persisted -- confirm.
# The index is reset because the concatenated frame repeats the per-year row numbers.
school_ps_mapping_df = top_polluters_df.reset_index(drop=True)
fpath = os.path.join(data_path, 'school_pollution_mapping/school_pollution_mapping_top3.parquet')
# create the output folder on first run
os.makedirs(os.path.dirname(fpath), exist_ok=True)
school_ps_mapping_df.to_parquet(fpath)

In [None]:
# finally, cast the rows out wider for the mapping. 