# JOIN

# Anand Notes:
jan-2000, school1, zip, lat, lon
...
jan-2000, school1300, zip, lat, lon
feb-2000, school1, zip, lat, lon
...
feb-2000, school1300, zip, lat, lon

520,000,000

use lat, lon to find nearest pollution site, get bearing as pollution_bearing (using geodesic inv library)

join wind u, v onto school's zip-month-year, use trig to make school_wind_bearing

join wind u, v onto pollution site's zip-month-year, use trig to make  pollution_wind_bearing


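make 3 variables (note: a plain abs() of two bearings does not wrap around 360°, e.g. bearings of 350° and 10° give 340° instead of 20°; take min(d, 360 - d) so each angle lies in 0–180°)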

- abs(pollution_bearing - school_wind_bearing) as school_wind_downstream_angle

- abs(pollution_bearing - pollution_wind_bearing) as pollution_wind_downstream_angle

- mean(school_wind_downstream_angle, pollution_wind_downstream_angle) as mean_downstream_angle


---> group by zip-month-year

~ 400,000 rows

aggregations:

- avg(school_wind_downstream_angle) as zip_avg_school_wind_downstream_angle

- avg(pollution_wind_downstream_angle) as zip_avg_pollution_wind_downstream_angle

- avg(mean_downstream_angle) as zip_avg_mean_downstream_angle


year, month, zip, year-month-zip, zip_avg_school_wind_downstream_angle, zip_avg_pollution_wind_downstream_angle, zip_avg_mean_downstream_angle


join in population stuff on year-zipcode

join in pm2.5 on year-month-zipcode

![](2022-09-27-18-07-10.png)

In [1]:
import pandas as pd 
import numpy as np
import os 
import datetime
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as mticker
import plotly.express as px

from netCDF4 import Dataset
import cartopy.crs as ccrs
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
# import dotenv

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None

In [4]:
# Which environment the notebook runs in; selects the project folder below.
# Supported values: 'gdrive' (Google Colab), 'local', 'work'.
path_source = 'work'

if path_source == 'gdrive':
  # On Colab the Drive has to be mounted before its files can be read
  from google.colab import drive
  drive.mount('/content/gdrive')
  gdrive_path = '/content/gdrive/MyDrive/Classes/W210_capstone'
  env_path = '/content/gdrive/MyDrive/.env'

elif path_source == 'local':
  gdrive_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone'
  # NOTE(review): this was '.../MyDrive/.env'; changed to 'My Drive' to match
  # gdrive_path above and the 'work' branch below — confirm the .env location.
  env_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/.env'

elif path_source == 'work':
  gdrive_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone'
  env_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/.env'

else:
  # Fail here with a clear message instead of a NameError on gdrive_path below
  raise ValueError(f"Unknown path_source {path_source!r}; expected 'gdrive', 'local' or 'work'")

# Folder holding the wind data inside the project folder
wind_path = os.path.join(gdrive_path, 'W210 Capstone/Data/wind')

Read in each dataset

In [114]:
# Locations of the input files inside the project folder
census_csv = os.path.join(gdrive_path, 'W210 Capstone/Data/census/census_bureau_clean/census_bureau.csv')
wind_parquet = os.path.join(gdrive_path, 'W210 Capstone/Data/wind')
pollution_csv = os.path.join(gdrive_path, 'W210 Capstone/Data/AQI/UW_pm25_zip_monthly_anand_2000-2018-v2.csv')
point_sources_csv = os.path.join(gdrive_path, 'W210 Capstone/Data/Point source/pollution_point_sources.csv')
schools_csv = os.path.join(gdrive_path, 'JLPS_capstone_project/data/schools_data/tabular_school_data.csv')

# Read the inputs in the same order as before: census, wind, pm2.5, point sources
df_census = pd.read_csv(census_csv)
df_wind = pd.read_parquet(wind_parquet)
df_pollution = pd.read_csv(pollution_csv)
df_point_sources = pd.read_csv(point_sources_csv)

# The schools file is opened by hand so that bytes which are not valid utf8
# become backslash escapes instead of raising a decode error.
file_encoding = 'utf8'
with open(schools_csv, encoding=file_encoding, errors='backslashreplace') as schools_file:
  df_schools = pd.read_csv(schools_file, low_memory=False)

Things to fix on datasets:

df_schools
- 7 schools have a `<Null>` zip code; they share the same school name but have different lat/lng values that place them in different zip codes.
- `stack_cnt` feature: many schools are "stacked" on one another. We may want to keep just one school observation per stack, or perhaps retain the information that the schools are stacked.
- Features `gradesoffered` and `gradesserved`: fix these (the values were auto-formatted as dates, e.g. `5-Mar`, `12-Sep`).

Other fixes:
- Drop duplicate columns
- Drop year-months that we don't need from wind (1998 - 2000)
- Do quality checks on the full joined dataset for duplicates etc.

pollution point sources assumptions
- For a given year, just find the nearest pollution source for that year. Don't try to track a pollution source across time.
- We have data every 3 years; assume each report year also represents the surrounding years (see "Methodology of interpolating" below).

In [115]:
# clean schools
# Lower-case the headers first: everything below then only uses the lower-case
# names, so re-running this cell no longer fails with KeyError: 'Zip'.
df_schools.columns = [col.lower() for col in df_schools.columns]

# Keep only rows with a usable zip: drop the '<Null>' placeholder as well as
# genuinely missing values (which would break the slicing below).
# .copy() so the new column is added to an independent frame, not a slice.
has_zip = df_schools['zip'].notna() & (df_schools['zip'] != '<Null>')
df_schools = df_schools[has_zip].copy()

# First five characters of the zip as an integer (e.g. '90250-6431' -> 90250)
df_schools['zip5'] = df_schools['zip'].str[:5].astype(int)

In [116]:
# clean wind
# Store the ZCTA10 codes as integers
zcta_as_int = df_wind['ZCTA10'].astype(int)
df_wind['ZCTA10'] = zcta_as_int

In [170]:
# join schools and wind
# The wind coordinates are renamed so they stay distinguishable from the
# school's own lat/long columns.
wind_renamed = df_wind.rename(columns={'lat': 'lat_wind', 'lon': 'lon_wind'})
df_all = df_schools.merge(wind_renamed, how='left', left_on='zip5', right_on='ZCTA10')

# Rows without a wind match have no year_month; they get the placeholder '0',
# which becomes year 0 here and '0-0' after the reformatting further down.
year_month_text = df_all['year_month'].fillna('0').astype(str)
df_all['year_month'] = year_month_text
df_all['year'] = year_month_text.map(lambda ym: int(ym[:4]))

# join census
census_renamed = df_census.rename(columns={'zip': 'zip_census'})
df_all = df_all.merge(
  census_renamed,
  how='left',
  left_on=['zip5', 'year'],
  right_on=['zip_census', 'year'],
)

# make year_month year-month
# (first four characters, a dash, last two characters; e.g. '1998-01')
df_all['year_month'] = df_all['year_month'].astype(str).map(lambda ym: ym[:4] + '-' + ym[-2:])

# join pollution
df_all = df_all.merge(
  df_pollution,
  how='left',
  left_on=['year_month', 'zip5'],
  right_on=['year_month', 'ZIP10'],
)


df_all.head(3)

Unnamed: 0,objectid,cdscode,status,charter,ed_type,level,pb_prv_bie,school,district,county,street,city,state,zip,lat,long,stacked,stack_cnt,source,cds_uniq,gradesoffered,gradesserved,zip5,lat_wind,lon_wind,ZCTA10,u,v,wdir,wspd,year_month,year,zip_census,total_population,total_population_male,total_population_female,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,ZIP10,GEOID10,pm25,year_month_zip
0,1,19768700000000.0,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,19768700000000.0,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.281756,-0.20091,324.508789,0.346051,1998-01,1998,,,,,,,,,,,,,,,,,,,,,
1,1,19768700000000.0,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,19768700000000.0,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.682824,0.314714,24.745012,0.751861,1998-02,1998,,,,,,,,,,,,,,,,,,,,,
2,1,19768700000000.0,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,19768700000000.0,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.85577,0.027615,1.848251,0.856216,1998-03,1998,,,,,,,,,,,,,,,,,,,,,


In [175]:
# join these together on the nearest lat long
# Both subsets get a 'key' column assigned in a later cell, so take explicit
# copies rather than relying on the SettingWithCopy warning being switched off.
df_all_2017 = df_all[df_all['year'] == 2017].copy()

df_point_sources_2017 = df_point_sources[df_point_sources['report_year'] == 2017].copy()
df_point_sources_2017

Unnamed: 0,checked_lat,checked_lon,zip_code,report_year,PM25_emissions_TPY
5689,33.359400,-117.111400,92059.0,2017,0.784458
5690,37.493617,-121.935869,0.0,2017,0.784750
5691,32.821000,-117.144000,92123.0,2017,0.785131
5692,38.734600,-121.954430,95695.0,2017,0.786000
5693,35.352150,-119.240640,93312.0,2017,0.790105
...,...,...,...,...,...
7150,38.025100,-122.063900,94553.0,2017,265.824083
7151,34.622200,-117.100100,92307.0,2017,494.738668
7152,34.231230,-116.056220,92778.0,2017,534.734811
7153,37.938779,-122.396453,94802.0,2017,566.232588


In [176]:
df_all_2017[['lat', 'long']].head(3)

Unnamed: 0,lat,long
228,33.908583,-118.376468
229,33.908583,-118.376468
230,33.908583,-118.376468


In [178]:
df_all_2017.shape

(112776, 54)

In [179]:
df_point_sources_2017.shape

(1466, 6)

In [181]:
# Number of rows a full cross join of the two 2017 frames produces; computed
# from the frames so the figure cannot go stale when the inputs change.
len(df_all_2017) * len(df_point_sources_2017)

165329616

In [203]:
df_all_2017

Unnamed: 0,objectid,cdscode,status,charter,ed_type,level,pb_prv_bie,school,district,county,street,city,state,zip,lat,long,stacked,stack_cnt,source,cds_uniq,gradesoffered,gradesserved,zip5,lat_wind,lon_wind,ZCTA10,u,v,wdir,wspd,year_month,year,zip_census,total_population,total_population_male,total_population_female,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,ZIP10,GEOID10,pm25,year_month_zip,key
228,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.370,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.200000,2017-01_90250,0
229,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.370,90250.0,0.252659,0.271296,47.037128,0.370726,2017-02,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,10.183333,2017-02_90250,0
230,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.370,90250.0,0.310494,-0.416862,306.680115,0.519789,2017-03,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,11.575000,2017-03_90250,0
231,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.370,90250.0,0.843505,-0.414065,333.854248,0.939655,2017-04,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,10.716666,2017-04_90250,0
232,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.370,90250.0,0.899092,0.732567,39.172634,1.159750,2017-05,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,11.341667,2017-05_90250,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2585081,10076,5.47184E+13,Active,Yes,Traditional,Elementary - High Combination,Public,Summit Charter Academy,Burton Elementary,Tulare,175 South Mathew Street,Porterville,CA,93257-2710,36.062551,-119.063155,Yes,2,Tulare CO Parcels 2011,5.47184E+13,K-12,K-12,93257,35.790,-118.918,93257.0,0.606863,-0.068813,353.530823,0.610752,2017-08,2017,93257.0,76676.0,38383.0,38293.0,6391.0,3401.0,2990.0,6984.0,3675.0,3309.0,6723.0,3185.0,3538.0,6779.0,3337.0,3442.0,26877.0,93257.0,693257.0,9.825664,2017-08_93257,0
2585082,10076,5.47184E+13,Active,Yes,Traditional,Elementary - High Combination,Public,Summit Charter Academy,Burton Elementary,Tulare,175 South Mathew Street,Porterville,CA,93257-2710,36.062551,-119.063155,Yes,2,Tulare CO Parcels 2011,5.47184E+13,K-12,K-12,93257,35.790,-118.918,93257.0,0.448100,-0.057807,352.649231,0.451814,2017-09,2017,93257.0,76676.0,38383.0,38293.0,6391.0,3401.0,2990.0,6984.0,3675.0,3309.0,6723.0,3185.0,3538.0,6779.0,3337.0,3442.0,26877.0,93257.0,693257.0,9.173451,2017-09_93257,0
2585083,10076,5.47184E+13,Active,Yes,Traditional,Elementary - High Combination,Public,Summit Charter Academy,Burton Elementary,Tulare,175 South Mathew Street,Porterville,CA,93257-2710,36.062551,-119.063155,Yes,2,Tulare CO Parcels 2011,5.47184E+13,K-12,K-12,93257,35.790,-118.918,93257.0,0.059406,-0.114671,297.386841,0.129145,2017-10,2017,93257.0,76676.0,38383.0,38293.0,6391.0,3401.0,2990.0,6984.0,3675.0,3309.0,6723.0,3185.0,3538.0,6779.0,3337.0,3442.0,26877.0,93257.0,693257.0,12.497345,2017-10_93257,0
2585084,10076,5.47184E+13,Active,Yes,Traditional,Elementary - High Combination,Public,Summit Charter Academy,Burton Elementary,Tulare,175 South Mathew Street,Porterville,CA,93257-2710,36.062551,-119.063155,Yes,2,Tulare CO Parcels 2011,5.47184E+13,K-12,K-12,93257,35.790,-118.918,93257.0,0.164958,-0.198417,309.739105,0.258032,2017-11,2017,93257.0,76676.0,38383.0,38293.0,6391.0,3401.0,2990.0,6984.0,3675.0,3309.0,6723.0,3185.0,3538.0,6779.0,3337.0,3442.0,26877.0,93257.0,693257.0,10.211504,2017-11_93257,0


In [177]:
# Give both frames the same constant key: merging on it pairs every row of
# df_all_2017 with every row of df_point_sources_2017 (a cross join).
# NOTE(review): the result has len(df_all_2017) * len(df_point_sources_2017) rows.
df_point_sources_2017['key'] = 0
df_all_2017['key'] = 0

df_all_2017_2 = df_all_2017.merge(df_point_sources_2017, how='outer', on='key')

In [204]:
def calc_distance(lat1, lng1, lat2, lng2):
  """Straight-line (Euclidean) distance between the points (lat1, lng1) and (lat2, lng2).

  The result is in the same units as the inputs; no great-circle correction
  is applied. Only arithmetic operators are used, so inputs that support them
  element-wise (e.g. pandas Series) give an element-wise result.
  """
  d_lat = lat1 - lat2
  d_lng = lng1 - lng2
  squared_sum = d_lat ** 2 + d_lng ** 2
  return squared_sum ** 0.5

# calc_distance only uses arithmetic operators, so it is applied to whole
# columns at once instead of once per row through DataFrame.apply(axis=1),
# which is very slow on a frame the size of the cross join.
df_all_2017_2['distance'] = calc_distance(
  df_all_2017_2['lat'],
  df_all_2017_2['long'],
  df_all_2017_2['checked_lat'],
  df_all_2017_2['checked_lon'],
)

In [None]:
# get min distance for every school (objectid)
# idxmin returns, per objectid, the index label of the row with the smallest
# distance; .loc then pulls those full rows out of the cross-joined frame.
closest_row_labels = df_all_2017_2.groupby('objectid')['distance'].idxmin()
df_closest = df_all_2017_2.loc[closest_row_labels]
df_closest

In [193]:
def calc_distance(lat1, lng1, lat2, lng2):
  """Straight-line (Euclidean) distance between (lat1, lng1) and (lat2, lng2).

  The result is in the same units as the inputs; no great-circle correction
  is applied. Works element-wise on inputs that support arithmetic operators.
  """
  return ((lat1 - lat2)**2 + (lng1 - lng2)**2)**.5


# NOTE(review): df_temp is not created in any visible cell — it looks like a
# 100,000-row sample of the cross-joined frame; confirm and define it before this cell.
# Whole columns are passed in one call instead of a row-wise apply(axis=1).
df_temp['distance'] = calc_distance(df_temp['lat'], df_temp['long'], df_temp['checked_lat'], df_temp['checked_lon'])
df_temp

Unnamed: 0,objectid,cdscode,status,charter,ed_type,level,pb_prv_bie,school,district,county,street,city,state,zip,lat,long,stacked,stack_cnt,source,cds_uniq,gradesoffered,gradesserved,zip5,lat_wind,lon_wind,ZCTA10,u,v,wdir,wspd,year_month,year,zip_census,total_population,total_population_male,total_population_female,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,ZIP10,GEOID10,pm25,year_month_zip,key,checked_lat,checked_lon,zip_code,report_year,PM25_emissions_TPY,distance
0,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.200000,2017-01_90250,0,33.359400,-117.111400,92059.0,2017,0.784458,1.379130
1,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.200000,2017-01_90250,0,37.493617,-121.935869,0.0,2017,0.784750,5.051911
2,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.200000,2017-01_90250,0,32.821000,-117.144000,92123.0,2017,0.785131,1.643720
3,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.200000,2017-01_90250,0,38.734600,-121.954430,95695.0,2017,0.786000,6.007683
4,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.200000,2017-01_90250,0,35.352150,-119.240640,93312.0,2017,0.790105,1.682462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6,1.97687E+13,Active,No,Traditional,Intermediate/Middle/Junior High,Public,Richard Henry Dana Middle,Wiseburn Unified,Los Angeles,5504 West 135th Street,Hawthorne,CA,90250-6462,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,8-Jun,8-Jun,90250,33.895,-118.37,90250.0,0.856764,0.661672,37.678623,1.082522,2017-09,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.091667,2017-09_90250,0,40.876930,-121.723440,96013.0,2017,1.238125,7.730465
99996,6,1.97687E+13,Active,No,Traditional,Intermediate/Middle/Junior High,Public,Richard Henry Dana Middle,Wiseburn Unified,Los Angeles,5504 West 135th Street,Hawthorne,CA,90250-6462,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,8-Jun,8-Jun,90250,33.895,-118.37,90250.0,0.856764,0.661672,37.678623,1.082522,2017-09,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.091667,2017-09_90250,0,33.779837,-118.186677,90813.0,2017,1.238381,0.229339
99997,6,1.97687E+13,Active,No,Traditional,Intermediate/Middle/Junior High,Public,Richard Henry Dana Middle,Wiseburn Unified,Los Angeles,5504 West 135th Street,Hawthorne,CA,90250-6462,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,8-Jun,8-Jun,90250,33.895,-118.37,90250.0,0.856764,0.661672,37.678623,1.082522,2017-09,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.091667,2017-09_90250,0,33.745430,-118.261370,90731.0,2017,1.240654,0.199666
99998,6,1.97687E+13,Active,No,Traditional,Intermediate/Middle/Junior High,Public,Richard Henry Dana Middle,Wiseburn Unified,Los Angeles,5504 West 135th Street,Hawthorne,CA,90250-6462,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,8-Jun,8-Jun,90250,33.895,-118.37,90250.0,0.856764,0.661672,37.678623,1.082522,2017-09,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.091667,2017-09_90250,0,33.126000,-117.113000,92029.0,2017,1.240840,1.486199


In [201]:
df_temp.shape

(100000, 60)

In [199]:
df_temp.groupby('objectid')['distance'].min()

objectid
1    0.009262
2    0.005396
3    0.010535
4    0.010532
5    0.016075
6    0.009262
Name: distance, dtype: float64

In [200]:
df_temp.loc[df_temp.groupby('objectid').distance.idxmin()]

Unnamed: 0,objectid,cdscode,status,charter,ed_type,level,pb_prv_bie,school,district,county,street,city,state,zip,lat,long,stacked,stack_cnt,source,cds_uniq,gradesoffered,gradesserved,zip5,lat_wind,lon_wind,ZCTA10,u,v,wdir,wspd,year_month,year,zip_census,total_population,total_population_male,total_population_female,population_0_4,population_0_4_male,population_0_4_female,population_5_9,population_5_9_male,population_5_9_female,population_10_14,population_10_14_male,population_10_14_female,population_15_19,population_15_19_male,population_15_19_female,total_pop_under19,ZIP10,GEOID10,pm25,year_month_zip,key,checked_lat,checked_lon,zip_code,report_year,PM25_emissions_TPY,distance
682,1,19768700000000.0,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,19768700000000.0,5-Mar,5-Mar,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.2,2017-01_90250,0,33.91348,-118.38433,90245.0,2017,2.188522,0.009262
18274,2,19768700000000.0,Active,Yes,Traditional,High School,Public,Da Vinci Science,Wiseburn Unified,Los Angeles,201 N. Douglas Street,El Segundo,CA,90245-4637,33.918876,-118.384333,Yes,2,Los Angeles CO Parcels 2020,19768700000000.0,12-Sep,12-Sep,90245,33.902,-118.417,90245.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90245.0,16929.0,8313.0,8616.0,1289.0,637.0,652.0,1026.0,601.0,425.0,998.0,548.0,450.0,1084.0,521.0,563.0,4397.0,90245.0,690245.0,11.9,2017-01_90245,0,33.91348,-118.38433,90245.0,2017,2.188522,0.005396
35866,3,19768700000000.0,Active,Yes,Traditional,High School,Public,Da Vinci Design,Wiseburn Unified,Los Angeles,12501 Isis Avenue,Hawthorne,CA,90250-4149,33.919227,-118.375501,Yes,2,Los Angeles CO Parcels 2020,19768700000000.0,12-Sep,12-Sep,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.2,2017-01_90250,0,33.91348,-118.38433,90245.0,2017,2.188522,0.010535
53458,4,19768700000000.0,Active,Yes,Traditional,Elementary - Intermediate/Middle/Junior High C...,Public,Da Vinci Connect,Wiseburn Unified,Los Angeles,12501 Isis Avenue,Hawthorne,CA,90250-6462,33.919231,-118.375507,Yes,2,Los Angeles CO Parcels 2020,19768700000000.0,K-12,K-8,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.2,2017-01_90250,0,33.91348,-118.38433,90245.0,2017,2.188522,0.010532
71050,5,19768700000000.0,Active,No,Traditional,Elementary,Public,Juan De Anza Elementary,Wiseburn Unified,Los Angeles,12110 Hindry Avenue,Hawthorne,CA,90250-3403,33.922632,-118.371115,No,1,Los Angeles CO Parcels,19768700000000.0,K-5,K-5,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.2,2017-01_90250,0,33.91348,-118.38433,90245.0,2017,2.188522,0.016075
88642,6,19768700000000.0,Active,No,Traditional,Intermediate/Middle/Junior High,Public,Richard Henry Dana Middle,Wiseburn Unified,Los Angeles,5504 West 135th Street,Hawthorne,CA,90250-6462,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,19768700000000.0,8-Jun,8-Jun,90250,33.895,-118.37,90250.0,0.268502,0.117679,23.666864,0.293158,2017-01,2017,90250.0,97371.0,48146.0,49225.0,7611.0,4400.0,3211.0,7205.0,3869.0,3336.0,6767.0,3545.0,3222.0,5960.0,3118.0,2842.0,27543.0,90250.0,690250.0,12.2,2017-01_90250,0,33.91348,-118.38433,90245.0,2017,2.188522,0.009262


In [None]:
# NOTE(review): generic template of the "row with the minimum value per group"
# pattern; `df`, "ID" and `date1` are placeholder names that no visible cell defines.
df.loc[df.groupby("ID").date1.idxmin()]

In [198]:
# Per-row copy of each school's minimum distance. The string alias 'min' is used
# because passing the NumPy callable (np.min) to transform is deprecated in pandas.
df_temp.groupby('objectid').distance.transform('min')



0        0.009262
1        0.009262
2        0.009262
3        0.009262
4        0.009262
           ...   
99995    0.009262
99996    0.009262
99997    0.009262
99998    0.009262
99999    0.009262
Name: distance, Length: 100000, dtype: float64

In [None]:
# NOTE(review): generic template of a cross join through a constant key;
# `df1` and `df2` are placeholder names that no visible cell defines.
df1['key'] = 0
df2['key'] = 0

df1.merge(df2, on='key', how='outer')

In [202]:
100000 / 1466

68.21282401091405

Unnamed: 0,checked_lat,checked_lon,zip_code,report_year,PM25_emissions_TPY
5689,33.359400,-117.111400,92059.0,2017,0.784458
5690,37.493617,-121.935869,0.0,2017,0.784750
5691,32.821000,-117.144000,92123.0,2017,0.785131
5692,38.734600,-121.954430,95695.0,2017,0.786000
5693,35.352150,-119.240640,93312.0,2017,0.790105
...,...,...,...,...,...
7150,38.025100,-122.063900,94553.0,2017,265.824083
7151,34.622200,-117.100100,92307.0,2017,494.738668
7152,34.231230,-116.056220,92778.0,2017,534.734811
7153,37.938779,-122.396453,94802.0,2017,566.232588


In [162]:
df_schools

Unnamed: 0,objectid,cdscode,status,charter,ed_type,level,pb_prv_bie,school,district,county,street,city,state,zip,lat,long,stacked,stack_cnt,source,cds_uniq,gradesoffered,gradesserved,zip5
0,1,1.97687E+13,Active,No,Traditional,Elementary,Public,138th Street,Wiseburn Unified,Los Angeles,5403 West 138th Street,Hawthorne,CA,90250-6431,33.908583,-118.376468,Yes,3,Los Angeles CO Parcels,1.97687E+13,5-Mar,5-Mar,90250
1,2,1.97687E+13,Active,Yes,Traditional,High School,Public,Da Vinci Science,Wiseburn Unified,Los Angeles,201 N. Douglas Street,El Segundo,CA,90245-4637,33.918876,-118.384333,Yes,2,Los Angeles CO Parcels 2020,1.97687E+13,12-Sep,12-Sep,90245
2,3,1.97687E+13,Active,Yes,Traditional,High School,Public,Da Vinci Design,Wiseburn Unified,Los Angeles,12501 Isis Avenue,Hawthorne,CA,90250-4149,33.919227,-118.375501,Yes,2,Los Angeles CO Parcels 2020,1.97687E+13,12-Sep,12-Sep,90250
3,4,1.97687E+13,Active,Yes,Traditional,Elementary - Intermediate/Middle/Junior High C...,Public,Da Vinci Connect,Wiseburn Unified,Los Angeles,12501 Isis Avenue,Hawthorne,CA,90250-6462,33.919231,-118.375507,Yes,2,Los Angeles CO Parcels 2020,1.97687E+13,K-12,K-8,90250
4,5,1.97687E+13,Active,No,Traditional,Elementary,Public,Juan De Anza Elementary,Wiseburn Unified,Los Angeles,12110 Hindry Avenue,Hawthorne,CA,90250-3403,33.922632,-118.371115,No,1,Los Angeles CO Parcels,1.97687E+13,K-5,K-5,90250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10064,10065,1.96473E+13,Active,Yes,Traditional,Elementary,Public,STEM Preparatory Elementary,Los Angeles Unified,Los Angeles,1374 West 35th Street,Los Angeles,CA,90007-3410,34.024337,-118.299810,No,1,Los Angeles CO Parcels 2020,1.96473E+13,K-5,K-4,90007
10065,10066,3.36698E+13,Active,No,Special Education School,Elementary,Public,Summit Elementary,Alvord Unified,Riverside,10368 Campbell Avenue,Riverside,CA,92505-1308,33.935586,-117.481338,Yes,3,Riverside CO Parcels 2020,3.36698E+13,K-5,No Data,92505
10066,10067,3.36698E+13,Active,No,Continuation School,High School,Public,Alvord Alternative Continuation High,Alvord Unified,Riverside,10368 Campbell Avenue,Riverside,CA,92505-1308,33.935586,-117.481338,Yes,3,Riverside CO Parcels 2020,3.36698E+13,12-Sep,12-Nov,92505
10067,10068,5.61056E+13,Active,No,Special Education School,Elementary - High Combination,Public,Triton Academy,Ventura County Office of Education,Ventura,5250 Adolfo Road,Camarillo,CA,93012-4801,34.214990,-118.998155,No,1,Ventura CO Parcels 2019,5.61056E+13,12-Mar,12-Mar,93012


In [161]:
df_schools[['lat', 'long']]

Unnamed: 0,lat,long
0,33.908583,-118.376468
1,33.918876,-118.384333
2,33.919227,-118.375501
3,33.919231,-118.375507
4,33.922632,-118.371115
...,...,...
10064,34.024337,-118.299810
10065,33.935586,-117.481338
10066,33.935586,-117.481338
10067,34.214990,-118.998155


In [159]:
df_point_sources_2017[['checked_lat', 'checked_lon']]

Unnamed: 0,checked_lat,checked_lon
5689,33.359400,-117.111400
5690,37.493617,-121.935869
5691,32.821000,-117.144000
5692,38.734600,-121.954430
5693,35.352150,-119.240640
...,...,...
7150,38.025100,-122.063900
7151,34.622200,-117.100100
7152,34.231230,-116.056220
7153,37.938779,-122.396453


In [153]:
df_point_sources['report_year'].value_counts()

2017    1466
2011    1323
2014    1189
2008    1137
2005    1122
2002     918
Name: report_year, dtype: int64

Methodology of interpolating

2002 represents 2000 - 2002

2005 represents 2003 - 2005

...

2017 represents 2015 - 2019

In [156]:
# The 2002 report also stands in for 2000 and 2001 (see the methodology above).
# The previous chained assignment (`a = frame['report_year'] = 2000`) bound the
# plain integers 2000/2001 to the new names and overwrote report_year in the
# 2002 frame itself; .assign returns a relabelled copy and leaves the 2002 rows intact.
df_point_sources_2002 = df_point_sources[df_point_sources['report_year'] == 2002].copy()
df_point_sources_2000 = df_point_sources_2002.assign(report_year=2000)
df_point_sources_2001 = df_point_sources_2002.assign(report_year=2001)



Unnamed: 0,checked_lat,checked_lon,zip_code,report_year,PM25_emissions_TPY
0,33.811466,-117.915550,92803.0,2002,1.787854
1,34.088242,-117.470116,92335.0,2002,1.789200
2,33.911602,-118.281799,93420.0,2002,1.791300
3,37.944618,-121.325859,95203.0,2002,1.797500
4,39.221817,-121.054955,95945.0,2002,1.801540
...,...,...,...,...,...
913,38.025414,-122.113251,94553.0,2002,307.379504
914,34.603149,-117.338593,92368.0,2002,312.212247
915,38.370838,-120.808243,95654.0,2002,413.799008
916,32.837570,-117.152794,92111.0,2002,473.915287
