In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 15)

In [2]:
PATH_1=os.path.join('..','..','..','Weather_Data','CAMS','2016','2016_single_level_34_variables.nc')
sl_34_variables = xr.open_dataset(PATH_1) ## single level 34 variables
pm2_5=sl_34_variables['pm2p5'] ## get only pm2.5 variable
pm2_5

In [3]:
pm2_5.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pm2p5
valid_time,latitude,longitude,Unnamed: 3_level_1
2016-01-31 00:00:00,49.25,-124.849999999999994,0.000000004839642
2016-01-31 00:00:00,49.25,-124.099999999999994,0.000000005734204
2016-01-31 00:00:00,49.25,-123.349999999999994,0.000000007260955
2016-01-31 00:00:00,49.25,-122.599999999999994,0.000000007010009
2016-01-31 00:00:00,49.25,-121.849999999999994,0.000000005048928
...,...,...,...
2016-12-31 21:00:00,24.50,-70.099999999999994,0.000000014629448
2016-12-31 21:00:00,24.50,-69.349999999999994,0.000000015538001
2016-12-31 21:00:00,24.50,-68.599999999999994,0.000000016646471
2016-12-31 21:00:00,24.50,-67.849999999999994,0.000000017259229


In [4]:
pm_threshold=9*10**(-9) ## epa standard
pm_threshold

9.000000000000001e-09

In [5]:
SHAPE_PATH=os.path.join('..','..','..','Shapefiles','county_shapefiles','2016_county_shapefile','cb_2016_us_county_500k.shp')
county_gdf = gpd.read_file(SHAPE_PATH)
county_gdf

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,19,107,00465242,0500000US19107,19107,Keokuk,06,1500067253,1929323,"POLYGON ((-92.41199 41.50955, -92.35539 41.509..."
1,19,189,00465283,0500000US19189,19189,Winnebago,06,1037261946,3182052,"POLYGON ((-93.97076 43.49960, -93.88843 43.499..."
2,20,093,00485011,0500000US20093,20093,Kearny,06,2254696689,1133601,"POLYGON ((-101.54192 37.91457, -101.54186 37.9..."
3,20,123,00485026,0500000US20123,20123,Mitchell,06,1817632928,44979981,"POLYGON ((-98.49007 39.24167, -98.49005 39.263..."
4,20,187,00485055,0500000US20187,20187,Stanton,06,1762104518,178555,"POLYGON ((-102.04190 37.54119, -102.04189 37.5..."
...,...,...,...,...,...,...,...,...,...,...
3228,72,123,01804542,0500000US72123,72123,Salinas,13,179660999,115910809,"POLYGON ((-66.34257 17.99366, -66.34053 17.995..."
3229,47,017,01639729,0500000US47017,47017,Carroll,06,1547925982,6308712,"POLYGON ((-88.70600 35.79604, -88.70667 35.798..."
3230,51,183,01690257,0500000US51183,51183,Sussex,06,1269664389,6755317,"POLYGON ((-77.61620 36.87920, -77.58713 36.894..."
3231,55,013,01581066,0500000US55013,55013,Burnett,06,2127856271,151932123,"POLYGON ((-92.88571 45.64602, -92.88442 45.652..."


In [6]:
geometry_column=county_gdf['geometry']
geometry_column

0       POLYGON ((-92.41199 41.50955, -92.35539 41.509...
1       POLYGON ((-93.97076 43.49960, -93.88843 43.499...
2       POLYGON ((-101.54192 37.91457, -101.54186 37.9...
3       POLYGON ((-98.49007 39.24167, -98.49005 39.263...
4       POLYGON ((-102.04190 37.54119, -102.04189 37.5...
                              ...                        
3228    POLYGON ((-66.34257 17.99366, -66.34053 17.995...
3229    POLYGON ((-88.70600 35.79604, -88.70667 35.798...
3230    POLYGON ((-77.61620 36.87920, -77.58713 36.894...
3231    POLYGON ((-92.88571 45.64602, -92.88442 45.652...
3232    POLYGON ((-89.04779 45.98234, -89.03148 45.982...
Name: geometry, Length: 3233, dtype: geometry

In [7]:
lat_lon=geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-92.411994999999990,41.509547999999995
0,1,-92.355389000000002,41.509645999999996
0,2,-92.345286999999999,41.509676999999996
0,3,-92.297494000000000,41.509789999999995
0,4,-92.287641999999991,41.509827999999999
...,...,...,...
3232,174,-89.047474999999991,45.796779999999998
3232,175,-89.047601999999998,45.881585999999999
3232,176,-89.047601000000000,45.895354999999995
3232,177,-89.047753000000000,45.924363000000000


#### Fraction of time PM2.5 was above EPA threshold

In [8]:
def single_county(var):
    
    ''' Function to interpolate the values in a single county by taking a 100 or less latitude and longitude pair.
        And then to find the fraction of time, PM2.5 was above EPA standard
        Args:
        --------
             var (int): The index of the county in the shapefile.

        Returns:
        --------
            pm_above_df: Dataframe consisting the values of the variables interpolated in the county.
    '''   
    
    
    longitude= lat_lon.loc[(var), 'x']  # get the longitude
    extract_val=len(longitude) ## find number of longitude that a shapfile has

    ## if number of longitude or latitude is less than 100, that many lat-lon pairs will be extracted, if not, a 100 values
    if extract_val < 100:
        extract_val=extract_val
    else:
        extract_val=100

    longitude=longitude[:extract_val]  ## extract first 100 values or values less than 100
    latitude=lat_lon.loc[(var),'y']    ## get the latitude values
    latitude=latitude[:extract_val]    ## extract first 100 values or values less than 100

    lat_list=latitude.tolist()
    lon_list=longitude.tolist()

    ## find the corresponding values of the variables in the finer grid

    year_avg_finer= pm2_5.interp(longitude=lon_list, latitude=lat_list)
    
## get only the 100 (or less) pair of latitude and longitude from the lat and lon list from the 100,000 (or less) rows
    get_vals=[]

    for i in range(0, extract_val):
        row=year_avg_finer.isel(latitude=[i], longitude=[i])
        row_df=row.to_dataframe()
        get_vals.append(row_df)
    
    initial_df=pd.concat(get_vals)
    summary = initial_df.groupby("valid_time").mean(["latitude", "longitude"])
    df=summary

    total_rows=len(df)

    c=0
    for i in range(0, total_rows):
        if df['pm2p5'].values[i] > pm_threshold:
            c=c+1

## convert number to a fraction
    frac_time=(c/total_rows)*100
    pm_above_df=pd.DataFrame({'PM2.5 above threshold':[frac_time]})


    return pm_above_df

In [9]:
len(county_gdf)

3233

In [10]:
%%time

## get the values for all counties

df_list_below=[]

for i in range(0,len(county_gdf)): ## loop for the all list of counties
    try:
        df_list_below.append(single_county(i))
    except:
        pass

CPU times: total: 45min 55s
Wall time: 46min 23s


In [11]:
concatenate_df=pd.concat(df_list_below)
final_df=concatenate_df.reset_index(drop=True)
final_df

Unnamed: 0,PM2.5 above threshold
0,55.059523809523810
1,49.255952380952387
2,28.013392857142854
3,37.239583333333329
4,27.827380952380953
...,...
3228,0.000000000000000
3229,59.226190476190474
3230,60.639880952380956
3231,43.080357142857146


In [12]:
## lets first create a fips column in the original shape file

county_gdf['fips']=county_gdf['STATEFP'] + county_gdf['COUNTYFP']
county_gdf.dtypes
## convert the data type
county_gdf['fips']=county_gdf['fips'].astype(str).astype(int)
county_gdf.dtypes

county_gdf = county_gdf.drop(['STATEFP','COUNTYFP','COUNTYNS','AFFGEOID','GEOID','LSAD','ALAND','AWATER'], axis=1)
county_gdf

Unnamed: 0,NAME,geometry,fips
0,Keokuk,"POLYGON ((-92.41199 41.50955, -92.35539 41.509...",19107
1,Winnebago,"POLYGON ((-93.97076 43.49960, -93.88843 43.499...",19189
2,Kearny,"POLYGON ((-101.54192 37.91457, -101.54186 37.9...",20093
3,Mitchell,"POLYGON ((-98.49007 39.24167, -98.49005 39.263...",20123
4,Stanton,"POLYGON ((-102.04190 37.54119, -102.04189 37.5...",20187
...,...,...,...
3228,Salinas,"POLYGON ((-66.34257 17.99366, -66.34053 17.995...",72123
3229,Carroll,"POLYGON ((-88.70600 35.79604, -88.70667 35.798...",47017
3230,Sussex,"POLYGON ((-77.61620 36.87920, -77.58713 36.894...",51183
3231,Burnett,"POLYGON ((-92.88571 45.64602, -92.88442 45.652...",55013


In [13]:
merged_df=pd.merge(county_gdf, final_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,NAME,geometry,fips,PM2.5 above threshold
0,Keokuk,"POLYGON ((-92.41199 41.50955, -92.35539 41.509...",19107,55.059523809523810
1,Winnebago,"POLYGON ((-93.97076 43.49960, -93.88843 43.499...",19189,49.255952380952387
2,Kearny,"POLYGON ((-101.54192 37.91457, -101.54186 37.9...",20093,28.013392857142854
3,Mitchell,"POLYGON ((-98.49007 39.24167, -98.49005 39.263...",20123,37.239583333333329
4,Stanton,"POLYGON ((-102.04190 37.54119, -102.04189 37.5...",20187,27.827380952380953
...,...,...,...,...
3228,Salinas,"POLYGON ((-66.34257 17.99366, -66.34053 17.995...",72123,0.000000000000000
3229,Carroll,"POLYGON ((-88.70600 35.79604, -88.70667 35.798...",47017,59.226190476190474
3230,Sussex,"POLYGON ((-77.61620 36.87920, -77.58713 36.894...",51183,60.639880952380956
3231,Burnett,"POLYGON ((-92.88571 45.64602, -92.88442 45.652...",55013,43.080357142857146


In [14]:
county_df=merged_df.dropna()
county_df.head()

Unnamed: 0,NAME,geometry,fips,PM2.5 above threshold
0,Keokuk,"POLYGON ((-92.41199 41.50955, -92.35539 41.509...",19107,55.05952380952381
1,Winnebago,"POLYGON ((-93.97076 43.49960, -93.88843 43.499...",19189,49.255952380952394
2,Kearny,"POLYGON ((-101.54192 37.91457, -101.54186 37.9...",20093,28.01339285714285
3,Mitchell,"POLYGON ((-98.49007 39.24167, -98.49005 39.263...",20123,37.23958333333333
4,Stanton,"POLYGON ((-102.04190 37.54119, -102.04189 37.5...",20187,27.827380952380956


In [15]:
## convert into pandas dataframe without the geometry column
pd.options.display.float_format = '{:.15f}'.format ## see 15 decimal places of the numbers
county_var=pd.DataFrame(county_df.drop(columns='geometry')) 
county_var

Unnamed: 0,NAME,fips,PM2.5 above threshold
0,Keokuk,19107,55.059523809523810
1,Winnebago,19189,49.255952380952387
2,Kearny,20093,28.013392857142854
3,Mitchell,20123,37.239583333333329
4,Stanton,20187,27.827380952380953
...,...,...,...
3228,Salinas,72123,0.000000000000000
3229,Carroll,47017,59.226190476190474
3230,Sussex,51183,60.639880952380956
3231,Burnett,55013,43.080357142857146


In [16]:
county_var.to_pickle('FE_PM25_2016.pkl')