## Notebook to engineer a county-level feature: the percentage of 2003 time steps in which PM2.5 was above the EPA standard, for all counties

In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
## notebook display settings: never hide columns, print floats with 15 digits of precision
pd.options.display.max_columns = None
pd.options.display.precision = 15

In [2]:
## locate the 2003 CAMS single-level file relative to this notebook
cams_root = os.path.join('..','..','..','Weather_Data','CAMS')
PATH_1 = os.path.join(cams_root, '2003', '2003_single_level_34_variables.nc')

## open the 34-variable single-level dataset and keep only the PM2.5 variable
sl_34_variables = xr.open_dataset(PATH_1)
pm2_5 = sl_34_variables['pm2p5']
pm2_5

In [3]:
## flatten the DataArray into a dataframe: one pm2p5 value per (valid_time, latitude, longitude)
pm2_5.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pm2p5
valid_time,latitude,longitude,Unnamed: 3_level_1
2003-01-01 00:00:00,49.25,-124.849999999999994,0.000000004423940
2003-01-01 00:00:00,49.25,-124.099999999999994,0.000000009259248
2003-01-01 00:00:00,49.25,-123.349999999999994,0.000000014751317
2003-01-01 00:00:00,49.25,-122.599999999999994,0.000000012178498
2003-01-01 00:00:00,49.25,-121.849999999999994,0.000000007155878
...,...,...,...
2003-12-31 21:00:00,24.50,-70.099999999999994,0.000000003350070
2003-12-31 21:00:00,24.50,-69.349999999999994,0.000000003731365
2003-12-31 21:00:00,24.50,-68.599999999999994,0.000000004375821
2003-12-31 21:00:00,24.50,-67.849999999999994,0.000000005139164


In [4]:
## EPA standard, expressed in the same units as the pm2p5 values shown above
## NOTE(review): presumably 9 micrograms/m^3 written as 9e-9 kg/m^3 -- confirm the units of pm2p5
## a float literal is used because 9*10**(-9) evaluates to 9.000000000000001e-09
pm_threshold=9e-9 ## epa standard
pm_threshold

9.000000000000001e-09

In [5]:
## use county shapefile from 2008 as the shapefile before 2008 could not be found
shapefile_dir = os.path.join('..','..','..','Shapefiles','county_shapefiles','2008_county_shapefile')
SHAPE_PATH = os.path.join(shapefile_dir, 'tl_2008_us_county.shp')

## one row per county, with the boundary stored in the geometry column
county_gdf = gpd.read_file(SHAPE_PATH)
county_gdf

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,CNTYIDFP,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,geometry
0,19,175,00465276,19175,Union,Union County,06,H1,G4020,,,,A,"POLYGON ((-94.47051 40.97504, -94.47050 40.975..."
1,19,177,00465277,19177,Van Buren,Van Buren County,06,H1,G4020,,,,A,"POLYGON ((-91.96059 40.90070, -91.95925 40.900..."
2,20,097,00485013,20097,Kiowa,Kiowa County,06,H1,G4020,,,,A,"POLYGON ((-99.01471 37.67895, -99.01473 37.678..."
3,20,109,00485019,20109,Logan,Logan County,06,H1,G4020,,,,A,"POLYGON ((-100.89562 39.13329, -100.89404 39.1..."
4,20,003,00484971,20003,Anderson,Anderson County,06,H1,G4020,,,,A,"POLYGON ((-95.51741 38.20619, -95.51741 38.206..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,35,015,00936829,35015,Eddy,Eddy County,06,H1,G4020,,16100,,A,"POLYGON ((-104.09554 32.00000, -104.09586 32.0..."
3229,40,015,01101795,40015,Caddo,Caddo County,06,H1,G4020,,,,A,"POLYGON ((-98.36984 35.55133, -98.36964 35.551..."
3230,40,025,01101800,40025,Cimarron,Cimarron County,06,H1,G4020,,,,A,"POLYGON ((-102.14252 36.50032, -102.14380 36.5..."
3231,40,069,01101822,40069,Johnston,Johnston County,06,H1,G4020,,,,A,"POLYGON ((-96.74415 34.17223, -96.74416 34.172..."


In [6]:
## pull out the geometry column of the shapefile table (one geometry per county row)
geometry_column=county_gdf['geometry']
geometry_column

0       POLYGON ((-94.47051 40.97504, -94.47050 40.975...
1       POLYGON ((-91.96059 40.90070, -91.95925 40.900...
2       POLYGON ((-99.01471 37.67895, -99.01473 37.678...
3       POLYGON ((-100.89562 39.13329, -100.89404 39.1...
4       POLYGON ((-95.51741 38.20619, -95.51741 38.206...
                              ...                        
3228    POLYGON ((-104.09554 32.00000, -104.09586 32.0...
3229    POLYGON ((-98.36984 35.55133, -98.36964 35.551...
3230    POLYGON ((-102.14252 36.50032, -102.14380 36.5...
3231    POLYGON ((-96.74415 34.17223, -96.74416 34.172...
3232    POLYGON ((-84.27514 39.28921, -84.27467 39.289...
Name: geometry, Length: 3233, dtype: geometry

In [7]:
## flatten every geometry into its vertices: one row per vertex with columns x and y,
## indexed by (county row, vertex number) because index_parts=True
## NOTE: despite the name lat_lon, x holds the longitude and y the latitude (see the values below)
lat_lon=geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-94.470506000000000,40.975042999999999
0,1,-94.470500999999999,40.975550999999996
0,2,-94.470511999999999,40.978164000000000
0,3,-94.470497999999992,40.978803999999997
0,4,-94.470501999999996,40.979332999999997
...,...,...,...
3232,3894,-84.275637000000003,39.289242000000002
3232,3895,-84.275549999999996,39.289235999999995
3232,3896,-84.275515999999996,39.289234000000000
3232,3897,-84.275508000000002,39.289232999999996


### Fraction of time PM2.5 was above EPA threshold

In [8]:
def single_county(var, max_points=100):

    ''' Percentage of time steps in which a county's mean PM2.5 was above the EPA standard.

        The first `max_points` boundary vertices of the county polygon (all of them when the
        polygon has fewer) are used as sample points. PM2.5 is interpolated to each
        (longitude, latitude) pair, the sample points are averaged for every time step, and
        the share of time steps whose average exceeds `pm_threshold` is returned.

        Uses the notebook-level objects `lat_lon` (vertex coordinates per county index),
        `pm2_5` (the PM2.5 DataArray) and `pm_threshold`.

        Args:
        --------
             var (int): The index of the county in the shapefile.
             max_points (int): Maximum number of boundary vertices to sample. Defaults to 100.

        Returns:
        --------
            pm_above_df: One-row dataframe with the column 'PM2.5 above threshold'. The value
                         is a percentage (0-100), or NaN when none of the sample points has
                         data at any time step (e.g. the county lies outside the data grid).
    '''

    ## boundary vertices of this county, capped at max_points
    ## NOTE(review): these are the *first* vertices of the boundary, so for large polygons they
    ## cover only one stretch of the border -- confirm this sampling is intended
    lon_list = lat_lon.loc[var, 'x'].iloc[:max_points].tolist()
    lat_list = lat_lon.loc[var, 'y'].iloc[:max_points].tolist()

    ## pointwise interpolation: because both coordinate arrays share the dimension "points",
    ## lon_list[i] is paired with lat_list[i] instead of building the full lat x lon grid
    point_dim = 'points'
    at_points = pm2_5.interp(
        longitude=xr.DataArray(lon_list, dims=point_dim),
        latitude=xr.DataArray(lat_list, dims=point_dim),
    )

    ## county average for every time step; points without data (NaN) are skipped by the mean
    county_mean = at_points.mean(dim=point_dim).values

    total_steps = county_mean.size
    has_data = ~np.isnan(county_mean)

    if total_steps == 0 or not has_data.any():
        ## nothing to compare against the threshold: report "unknown" rather than 0 %
        frac_time = np.nan
    else:
        ## NaN time steps compare as False, i.e. they count as "not above" in the total
        n_above = int((county_mean > pm_threshold).sum())
        ## convert the count to a percentage of all time steps
        frac_time = (n_above / total_steps) * 100

    pm_above_df = pd.DataFrame({'PM2.5 above threshold': [frac_time]})

    return pm_above_df

In [9]:
## number of county rows in the shapefile
len(county_gdf)

3233

In [10]:
%%time

## get the values for all counties

df_list_below=[]
failed_counties={}  ## county index -> error, so failures can be inspected after the run

for i in range(0,len(county_gdf)): ## loop for the all list of counties
    try:
        df_list_below.append(single_county(i))
    except Exception as err:
        ## keep one row per county: skipping a county would shift every later row and the
        ## index-based merge with county_gdf would attach values to the wrong counties
        failed_counties[i]=repr(err)
        df_list_below.append(pd.DataFrame({'PM2.5 above threshold':[np.nan]}))

CPU times: total: 26min 39s
Wall time: 27min 9s


In [12]:
## stack the one-row frames collected in the loop and renumber the rows 0..n-1
concatenate_df=pd.concat(df_list_below)
final_df=concatenate_df.reset_index(drop=True)
final_df

Unnamed: 0,PM2.5 above threshold
0,70.308219178082183
1,75.513698630136986
2,43.595890410958901
3,36.541095890410958
4,71.061643835616437
...,...
3228,58.698630136986296
3229,53.904109589041092
3230,29.452054794520549
3231,71.198630136986296


In [13]:
## lets first create a fips column in the original shape file

## guarded so the cell can be re-run: STATEFP and COUNTYFP are dropped below and would be
## missing on a second execution
if 'fips' not in county_gdf.columns:
    ## state code + county code, then convert the data type to integer
    ## NOTE(review): the integer cast removes any leading zero of the combined code --
    ## confirm that downstream joins expect integer fips
    county_gdf['fips']=(county_gdf['STATEFP'] + county_gdf['COUNTYFP']).astype(str).astype(int)

## keep only NAME, geometry and fips; errors='ignore' tolerates columns already removed
county_gdf = county_gdf.drop(columns=[ 'STATEFP','COUNTYFP','COUNTYNS','CNTYIDFP','NAMELSAD','LSAD','CLASSFP','MTFCC',
                                 'CSAFP','CBSAFP','METDIVFP','FUNCSTAT'], errors='ignore')
county_gdf

Unnamed: 0,NAME,geometry,fips
0,Union,"POLYGON ((-94.47051 40.97504, -94.47050 40.975...",19175
1,Van Buren,"POLYGON ((-91.96059 40.90070, -91.95925 40.900...",19177
2,Kiowa,"POLYGON ((-99.01471 37.67895, -99.01473 37.678...",20097
3,Logan,"POLYGON ((-100.89562 39.13329, -100.89404 39.1...",20109
4,Anderson,"POLYGON ((-95.51741 38.20619, -95.51741 38.206...",20003
...,...,...,...
3228,Eddy,"POLYGON ((-104.09554 32.00000, -104.09586 32.0...",35015
3229,Caddo,"POLYGON ((-98.36984 35.55133, -98.36964 35.551...",40015
3230,Cimarron,"POLYGON ((-102.14252 36.50032, -102.14380 36.5...",40025
3231,Johnston,"POLYGON ((-96.74415 34.17223, -96.74416 34.172...",40069


In [14]:
## the join is positional: row i of final_df must belong to row i of county_gdf.
## A different row count means some county has no result row and values would be
## attached to the wrong counties, so stop instead of merging silently.
if len(final_df) != len(county_gdf):
    raise ValueError(
        f"final_df has {len(final_df)} rows but county_gdf has {len(county_gdf)}; "
        "an index-based merge would misalign counties"
    )

merged_df=pd.merge(county_gdf, final_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,NAME,geometry,fips,PM2.5 above threshold
0,Union,"POLYGON ((-94.47051 40.97504, -94.47050 40.975...",19175,70.308219178082183
1,Van Buren,"POLYGON ((-91.96059 40.90070, -91.95925 40.900...",19177,75.513698630136986
2,Kiowa,"POLYGON ((-99.01471 37.67895, -99.01473 37.678...",20097,43.595890410958901
3,Logan,"POLYGON ((-100.89562 39.13329, -100.89404 39.1...",20109,36.541095890410958
4,Anderson,"POLYGON ((-95.51741 38.20619, -95.51741 38.206...",20003,71.061643835616437
...,...,...,...,...
3228,Eddy,"POLYGON ((-104.09554 32.00000, -104.09586 32.0...",35015,58.698630136986296
3229,Caddo,"POLYGON ((-98.36984 35.55133, -98.36964 35.551...",40015,53.904109589041092
3230,Cimarron,"POLYGON ((-102.14252 36.50032, -102.14380 36.5...",40025,29.452054794520549
3231,Johnston,"POLYGON ((-96.74415 34.17223, -96.74416 34.172...",40069,71.198630136986296


In [15]:
## drop every row that has a missing value in any column
county_df=merged_df.dropna()
county_df.head()

Unnamed: 0,NAME,geometry,fips,PM2.5 above threshold
0,Union,"POLYGON ((-94.47051 40.97504, -94.47050 40.975...",19175,70.30821917808218
1,Van Buren,"POLYGON ((-91.96059 40.90070, -91.95925 40.900...",19177,75.51369863013697
2,Kiowa,"POLYGON ((-99.01471 37.67895, -99.01473 37.678...",20097,43.595890410958894
3,Logan,"POLYGON ((-100.89562 39.13329, -100.89404 39.1...",20109,36.54109589041096
4,Anderson,"POLYGON ((-95.51741 38.20619, -95.51741 38.206...",20003,71.06164383561644


In [16]:
## show 15 decimal places of the numbers
pd.set_option('display.float_format', '{:.15f}'.format)

## remove the geometry column and wrap the rest in a plain pandas dataframe
attribute_columns = county_df.drop(columns=['geometry'])
county_var = pd.DataFrame(attribute_columns)
county_var

Unnamed: 0,NAME,fips,PM2.5 above threshold
0,Union,19175,70.308219178082183
1,Van Buren,19177,75.513698630136986
2,Kiowa,20097,43.595890410958901
3,Logan,20109,36.541095890410958
4,Anderson,20003,71.061643835616437
...,...,...,...
3228,Eddy,35015,58.698630136986296
3229,Caddo,40015,53.904109589041092
3230,Cimarron,40025,29.452054794520549
3231,Johnston,40069,71.198630136986296


In [26]:
## save the county table (NAME, fips, PM2.5 above threshold) as a pickle in the working directory
county_var.to_pickle('FE_PM25_2003.pkl')

#### OPTIONAL: step-by-step walkthrough of the feature engineering for a single county (index 0)

In [17]:
## longitude of every boundary vertex of the county at index 0
longitude = lat_lon.loc[(0), 'x']

## how many vertices does this county's shape have?
extract_val = len(longitude)
print(extract_val)

## use at most 100 lat-lon pairs (all of them when the shape has fewer than 100)
extract_val = min(extract_val, 100)

## keep the first extract_val longitudes
longitude = longitude[:extract_val]
longitude

1021


0    -94.470506000000000
1    -94.470500999999999
2    -94.470511999999999
3    -94.470497999999992
4    -94.470501999999996
             ...        
95   -94.470536999999993
96   -94.470563999999996
97   -94.470573999999999
98   -94.470585999999997
99   -94.470624000000001
Name: x, Length: 100, dtype: float64

In [18]:
## latitude of the same vertices: first extract_val values of the y column for county 0
latitude = lat_lon.loc[(0), 'y'][:extract_val]
latitude

0    40.975042999999999
1    40.975550999999996
2    40.978164000000000
3    40.978803999999997
4    40.979332999999997
            ...        
95   41.077373000000001
96   41.078908999999996
97   41.080002999999998
98   41.081210999999996
99   41.085254999999997
Name: y, Length: 100, dtype: float64

In [19]:
## plain Python lists of the sampled coordinates
lat_list, lon_list = latitude.tolist(), longitude.tolist()

In [20]:
## find the corresponding values of the variables in the finer grid
## NOTE: passing two plain lists interpolates at every latitude combined with every longitude
## (see the dataframe in the next cell), not only at the matching lat-lon pairs

year_avg_finer= pm2_5.interp(longitude=lon_list, latitude=lat_list)
year_avg_finer

In [21]:
## inspect the interpolated values as a dataframe indexed by (valid_time, latitude, longitude)
year_avg_finer.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pm2p5
valid_time,latitude,longitude,Unnamed: 3_level_1
2003-01-01 00:00:00,40.975042999999999,-94.470506000000000,0.000000013182011
2003-01-01 00:00:00,40.975042999999999,-94.470500999999999,0.000000013182017
2003-01-01 00:00:00,40.975042999999999,-94.470511999999999,0.000000013182005
2003-01-01 00:00:00,40.975042999999999,-94.470497999999992,0.000000013182020
2003-01-01 00:00:00,40.975042999999999,-94.470501999999996,0.000000013182016
...,...,...,...
2003-12-31 21:00:00,41.085254999999997,-94.470536999999993,0.000000001076289
2003-12-31 21:00:00,41.085254999999997,-94.470563999999996,0.000000001076289
2003-12-31 21:00:00,41.085254999999997,-94.470573999999999,0.000000001076289
2003-12-31 21:00:00,41.085254999999997,-94.470585999999997,0.000000001076289


In [22]:
## keep only the matching pairs: position i of the latitude axis together with position i of the
## longitude axis, each converted to its own dataframe (one row per time step)
get_vals=[
    year_avg_finer.isel(latitude=[i], longitude=[i]).to_dataframe()
    for i in range(extract_val)
]

In [23]:
## stack the per-pair dataframes into one table indexed by (valid_time, latitude, longitude)
initial_df=pd.concat(get_vals)
initial_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pm2p5
valid_time,latitude,longitude,Unnamed: 3_level_1
2003-01-01 00:00:00,40.975042999999999,-94.470506000000000,0.000000013182011
2003-01-01 03:00:00,40.975042999999999,-94.470506000000000,0.000000015240844
2003-01-01 06:00:00,40.975042999999999,-94.470506000000000,0.000000010789637
2003-01-01 09:00:00,40.975042999999999,-94.470506000000000,0.000000009904943
2003-01-01 12:00:00,40.975042999999999,-94.470506000000000,0.000000009982001
...,...,...,...
2003-12-31 09:00:00,41.085254999999997,-94.470624000000001,0.000000012284730
2003-12-31 12:00:00,41.085254999999997,-94.470624000000001,0.000000000000000
2003-12-31 15:00:00,41.085254999999997,-94.470624000000001,0.000000000076877
2003-12-31 18:00:00,41.085254999999997,-94.470624000000001,0.000000000592509


In [24]:
## average the sampled points for every time step
## pandas GroupBy.mean has no dimension argument (its first parameter is numeric_only),
## so it is called without arguments; grouping by valid_time already averages over the points
summary = initial_df.groupby("valid_time").mean()
summary

Unnamed: 0_level_0,pm2p5
valid_time,Unnamed: 1_level_1
2003-01-01 00:00:00,0.000000013211159
2003-01-01 03:00:00,0.000000015471542
2003-01-01 06:00:00,0.000000010898754
2003-01-01 09:00:00,0.000000009966438
2003-01-01 12:00:00,0.000000010134134
...,...
2003-12-31 09:00:00,0.000000012611013
2003-12-31 12:00:00,0.000000000000226
2003-12-31 15:00:00,0.000000000023390
2003-12-31 18:00:00,0.000000000396424


In [25]:
## confirm that the grouped result is a pandas dataframe
type(summary)

pandas.core.frame.DataFrame