#### Notebook to engineer a feature for every county: the percentage of time PM10 was above the EPA standard

In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 15)

In [2]:
## CAMS 2015 single-level file (34 variables); the path is relative to this notebook
PATH_1 = os.path.join(
    '..', '..', '..', '..',
    'Weather_Data', 'CAMS', '2015',
    '2015_single_level_34_variables.nc',
)
sl_34_variables = xr.open_dataset(PATH_1)  ## single level 34 variables

## keep only the PM10 variable; it is the last expression, so it is what the cell displays
pm10 = sl_34_variables['pm10']
pm10

In [3]:
## flatten the PM10 DataArray into a DataFrame indexed by (valid_time, latitude, longitude) for a quick look
pm10.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pm10
valid_time,latitude,longitude,Unnamed: 3_level_1
2015-01-01 00:00:00,49.25,-124.849999999999994,0.000000002774579
2015-01-01 00:00:00,49.25,-124.099999999999994,0.000000006283324
2015-01-01 00:00:00,49.25,-123.349999999999994,0.000000013315210
2015-01-01 00:00:00,49.25,-122.599999999999994,0.000000012260204
2015-01-01 00:00:00,49.25,-121.849999999999994,0.000000004254852
...,...,...,...
2015-12-31 21:00:00,24.50,-70.099999999999994,0.000000010635038
2015-12-31 21:00:00,24.50,-69.349999999999994,0.000000010833762
2015-12-31 21:00:00,24.50,-68.599999999999994,0.000000011063644
2015-12-31 21:00:00,24.50,-67.849999999999994,0.000000011569963


In [4]:
## EPA PM10 standard: 150 micrograms per cubic metre, written as 150e-9 so that it can be
## compared with the pm10 values directly
## (assumes pm10 is stored in kg m^-3 -- TODO confirm against the dataset attributes).
## A float literal is used because 150*10**(-9) evaluates to 1.5000000000000002e-07.
## NOTE(review): the EPA standard is defined on a 24-hour average; below it is applied to
## every individual time step -- confirm this is the intended feature definition.
pm_threshold=150e-9 ## epa standard
pm_threshold

1.5000000000000002e-07

In [5]:
## 2015 US county shapefile; the path is relative to this notebook
SHAPE_PATH = os.path.join(
    '..', '..', '..', '..',
    'Shapefiles', 'county_shapefiles', '2015_county_shapefile',
    'cb_2015_us_county_500k.shp',
)

## one row per county, with its polygon in the 'geometry' column
county_gdf = gpd.read_file(SHAPE_PATH)
county_gdf

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,01,005,00161528,0500000US01005,01005,Barbour,06,2291820706,50864677,"POLYGON ((-85.74803 31.61918, -85.74544 31.618..."
1,01,023,00161537,0500000US01023,01023,Choctaw,06,2365954971,19059247,"POLYGON ((-88.47323 31.89386, -88.46888 31.930..."
2,01,035,00161543,0500000US01035,01035,Conecuh,06,2201896058,6643480,"POLYGON ((-87.42720 31.26436, -87.42551 31.268..."
3,01,051,00161551,0500000US01051,01051,Elmore,06,1601876535,99850740,"POLYGON ((-86.41333 32.75059, -86.37115 32.750..."
4,01,065,00161558,0500000US01065,01065,Hale,06,1667804583,32525874,"POLYGON ((-87.87046 32.76244, -87.86818 32.765..."
...,...,...,...,...,...,...,...,...,...,...
3228,45,019,01252740,0500000US45019,45019,Charleston,06,2372842394,1144346152,"MULTIPOLYGON (((-79.50795 33.02008, -79.50713 ..."
3229,45,077,01248015,0500000US45077,45077,Pickens,06,1285536060,40612589,"MULTIPOLYGON (((-82.86687 34.61742, -82.86451 ..."
3230,46,123,01265784,0500000US46123,46123,Tripp,06,4176233698,13272785,"POLYGON ((-100.23091 43.49989, -100.23044 43.5..."
3231,47,073,01639752,0500000US47073,47073,Hawkins,06,1261443215,32545400,"POLYGON ((-83.28890 36.37879, -83.28250 36.382..."


In [6]:
## keep only the county shapes (POLYGON / MULTIPOLYGON), one entry per row of county_gdf
geometry_column=county_gdf['geometry']
geometry_column

0       POLYGON ((-85.74803 31.61918, -85.74544 31.618...
1       POLYGON ((-88.47323 31.89386, -88.46888 31.930...
2       POLYGON ((-87.42720 31.26436, -87.42551 31.268...
3       POLYGON ((-86.41333 32.75059, -86.37115 32.750...
4       POLYGON ((-87.87046 32.76244, -87.86818 32.765...
                              ...                        
3228    MULTIPOLYGON (((-79.50795 33.02008, -79.50713 ...
3229    MULTIPOLYGON (((-82.86687 34.61742, -82.86451 ...
3230    POLYGON ((-100.23091 43.49989, -100.23044 43.5...
3231    POLYGON ((-83.28890 36.37879, -83.28250 36.382...
3232    POLYGON ((-101.62940 34.75006, -101.62806 34.8...
Name: geometry, Length: 3233, dtype: geometry

In [7]:
## list every boundary vertex of every county: x is the longitude, y is the latitude.
## index_parts=True gives a two-level index whose first level is the county's row index in county_gdf,
## so lat_lon.loc[i] returns all vertices of county i
lat_lon=geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-85.748031999999995,31.619180999999998
0,1,-85.745435000000001,31.618897999999998
0,2,-85.742650999999995,31.621258999999998
0,3,-85.741739999999993,31.619402999999998
0,4,-85.739812999999998,31.621810000000000
...,...,...,...
3232,29,-101.385870999999995,34.748376999999998
3232,30,-101.434904000000003,34.747419999999998
3232,31,-101.471562000000006,34.747461999999999
3232,32,-101.629256999999996,34.747648999999996


#### Fraction of time PM10 was above EPA threshold

In [8]:
def single_county(var, max_points=100):

    ''' Percentage of time steps in which the county-average PM10 is above the EPA standard.

        Up to ``max_points`` boundary vertices of the county (the first ones listed in
        ``lat_lon``) are used as sample locations. PM10 is interpolated to each of them,
        averaged over the sample locations for every time step, and the averaged series is
        compared with ``pm_threshold``.

        Reads the notebook-level objects ``lat_lon``, ``pm10`` and ``pm_threshold``.

        NOTE(review): the sample locations are the first vertices of the county outline, so
        they lie on the border and may cover only part of it -- confirm this is an acceptable
        approximation of the county.

        Args:
        --------
             var (int): The index of the county in the shapefile.
             max_points (int): Maximum number of boundary vertices to sample (default 100).

        Returns:
        --------
            pm_above_df: One-row dataframe with the column 'PM10 above threshold', holding the
                         percentage (0-100) of time steps in which PM10 was above the threshold.
    '''

    ## boundary vertices of this county (x = longitude, y = latitude), at most max_points of them
    longitude = lat_lon.loc[(var), 'x'].iloc[:max_points]
    latitude = lat_lon.loc[(var), 'y'].iloc[:max_points]

    ## giving both coordinates the same new dimension makes interp evaluate the (lat, lon)
    ## pairs one by one, instead of on the full latitude x longitude grid followed by
    ## picking out the diagonal
    lon_points = xr.DataArray(longitude.to_numpy(), dims='points')
    lat_points = xr.DataArray(latitude.to_numpy(), dims='points')
    county_pm10 = pm10.interp(longitude=lon_points, latitude=lat_points)

    ## average over everything except time -> one value per time step (NaNs are skipped)
    other_dims = [dim for dim in county_pm10.dims if dim != 'valid_time']
    county_mean = county_pm10.mean(dim=other_dims)

    ## count the time steps above the standard; a NaN average compares as False ("not above")
    total_rows = county_mean.sizes['valid_time']
    above_count = int((county_mean > pm_threshold).sum())

    ## expressed as a percentage of all time steps
    frac_time = (above_count/total_rows)*100
    pm_above_df = pd.DataFrame({'PM10 above threshold': [frac_time]})

    return pm_above_df

In [9]:
## number of counties (rows) in the shapefile
len(county_gdf)

3233

In [10]:
%%time

## get the values for all counties

df_list_below=[]

for i in range(0,len(county_gdf)): ## loop for the all list of counties
    try:
        df_list_below.append(single_county(i))
    except:
        pass

CPU times: total: 41min 32s
Wall time: 41min 56s


In [11]:
## stack the one-row per-county results into a single frame and renumber the rows 0..n-1;
## from here on the row position is the only link between a value and its county
concatenate_df=pd.concat(df_list_below)
final_df=concatenate_df.reset_index(drop=True)
final_df

Unnamed: 0,PM10 above threshold
0,0.376712328767123
1,0.171232876712329
2,0.410958904109589
3,0.342465753424658
4,0.000000000000000
...,...
3228,0.034246575342466
3229,0.034246575342466
3230,0.102739726027397
3231,0.000000000000000


In [12]:
## lets first create a fips column in the original shape file
## (guarded so that the cell can be run again after STATEFP / COUNTYFP have been dropped)
if {'STATEFP','COUNTYFP'}.issubset(county_gdf.columns):
    ## state code + county code, converted to an integer
    ## (leading zeros are lost, e.g. '01' + '005' becomes 1005)
    county_gdf['fips']=(county_gdf['STATEFP'] + county_gdf['COUNTYFP']).astype(str).astype(int)

## keep only NAME, geometry and fips
county_gdf = county_gdf.drop(columns=['STATEFP','COUNTYFP','COUNTYNS','AFFGEOID','GEOID','LSAD','ALAND','AWATER'], errors='ignore')
county_gdf

Unnamed: 0,NAME,geometry,fips
0,Barbour,"POLYGON ((-85.74803 31.61918, -85.74544 31.618...",1005
1,Choctaw,"POLYGON ((-88.47323 31.89386, -88.46888 31.930...",1023
2,Conecuh,"POLYGON ((-87.42720 31.26436, -87.42551 31.268...",1035
3,Elmore,"POLYGON ((-86.41333 32.75059, -86.37115 32.750...",1051
4,Hale,"POLYGON ((-87.87046 32.76244, -87.86818 32.765...",1065
...,...,...,...
3228,Charleston,"MULTIPOLYGON (((-79.50795 33.02008, -79.50713 ...",45019
3229,Pickens,"MULTIPOLYGON (((-82.86687 34.61742, -82.86451 ...",45077
3230,Tripp,"POLYGON ((-100.23091 43.49989, -100.23044 43.5...",46123
3231,Hawkins,"POLYGON ((-83.28890 36.37879, -83.28250 36.382...",47073


In [13]:
## attach the PM10 result to each county by row position (index-to-index join).
## NOTE(review): the values only land on the right counties if final_df has exactly one row
## per county, in the same order as county_gdf -- confirm that no county was skipped upstream
merged_df=pd.merge(county_gdf, final_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,NAME,geometry,fips,PM10 above threshold
0,Barbour,"POLYGON ((-85.74803 31.61918, -85.74544 31.618...",1005,0.376712328767123
1,Choctaw,"POLYGON ((-88.47323 31.89386, -88.46888 31.930...",1023,0.171232876712329
2,Conecuh,"POLYGON ((-87.42720 31.26436, -87.42551 31.268...",1035,0.410958904109589
3,Elmore,"POLYGON ((-86.41333 32.75059, -86.37115 32.750...",1051,0.342465753424658
4,Hale,"POLYGON ((-87.87046 32.76244, -87.86818 32.765...",1065,0.000000000000000
...,...,...,...,...
3228,Charleston,"MULTIPOLYGON (((-79.50795 33.02008, -79.50713 ...",45019,0.034246575342466
3229,Pickens,"MULTIPOLYGON (((-82.86687 34.61742, -82.86451 ...",45077,0.034246575342466
3230,Tripp,"POLYGON ((-100.23091 43.49989, -100.23044 43.5...",46123,0.102739726027397
3231,Hawkins,"POLYGON ((-83.28890 36.37879, -83.28250 36.382...",47073,0.000000000000000


In [14]:
## drop every row that has a missing value in any column
county_df=merged_df.dropna()
county_df.head()

Unnamed: 0,NAME,geometry,fips,PM10 above threshold
0,Barbour,"POLYGON ((-85.74803 31.61918, -85.74544 31.618...",1005,0.376712328767123
1,Choctaw,"POLYGON ((-88.47323 31.89386, -88.46888 31.930...",1023,0.171232876712329
2,Conecuh,"POLYGON ((-87.42720 31.26436, -87.42551 31.268...",1035,0.410958904109589
3,Elmore,"POLYGON ((-86.41333 32.75059, -86.37115 32.750...",1051,0.342465753424658
4,Hale,"POLYGON ((-87.87046 32.76244, -87.86818 32.765...",1065,0.0


In [15]:
## convert into pandas dataframe without the geometry column
## NOTE: float_format is a global display option, so it also changes how every later DataFrame is shown
pd.options.display.float_format = '{:.15f}'.format ## see 15 decimal places of the numbers
county_var=pd.DataFrame(county_df.drop(columns='geometry')) 
county_var

Unnamed: 0,NAME,fips,PM10 above threshold
0,Barbour,1005,0.376712328767123
1,Choctaw,1023,0.171232876712329
2,Conecuh,1035,0.410958904109589
3,Elmore,1051,0.342465753424658
4,Hale,1065,0.000000000000000
...,...,...,...
3228,Charleston,45019,0.034246575342466
3229,Pickens,45077,0.034246575342466
3230,Tripp,46123,0.102739726027397
3231,Hawkins,47073,0.000000000000000


In [16]:
## save the per-county table (NAME, fips, PM10 above threshold) next to this notebook
county_var.to_pickle('PM10_above_threshold.pkl')