In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 15)

In [2]:
PATH_1=os.path.join('..','..','..','Weather_Data','CAMS','2014','2014_single_level_34_variables.nc')
sl_34_variables = xr.open_dataset(PATH_1) ## single level 34 variables
pm2_5=sl_34_variables['pm2p5'] ## get only pm2.5 variable
pm2_5

In [3]:
pm2_5.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pm2p5
valid_time,latitude,longitude,Unnamed: 3_level_1
2014-01-01 00:00:00,49.25,-124.849999999999994,0.000000001772327
2014-01-01 00:00:00,49.25,-124.099999999999994,0.000000002833490
2014-01-01 00:00:00,49.25,-123.349999999999994,0.000000008520340
2014-01-01 00:00:00,49.25,-122.599999999999994,0.000000009762358
2014-01-01 00:00:00,49.25,-121.849999999999994,0.000000006094498
...,...,...,...
2014-12-31 21:00:00,24.50,-70.099999999999994,0.000000002952799
2014-12-31 21:00:00,24.50,-69.349999999999994,0.000000003235151
2014-12-31 21:00:00,24.50,-68.599999999999994,0.000000003529909
2014-12-31 21:00:00,24.50,-67.849999999999994,0.000000003683237


In [4]:
pm_threshold=9*10**(-9) ## epa standard
pm_threshold

9.000000000000001e-09

In [5]:
SHAPE_PATH=os.path.join('..','..','..','Shapefiles','county_shapefiles','2014_county_shapefile','cb_2014_us_county_500k.shp')
county_gdf = gpd.read_file(SHAPE_PATH)
county_gdf

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,01,001,00161526,0500000US01001,01001,Autauga,06,1539584444,25773561,"POLYGON Z ((-86.92120 32.65754 0.00000, -86.92..."
1,01,005,00161528,0500000US01005,01005,Barbour,06,2291820953,50864677,"POLYGON Z ((-85.74803 31.61918 0.00000, -85.74..."
2,01,023,00161537,0500000US01023,01023,Choctaw,06,2365954803,19059247,"POLYGON Z ((-88.47323 31.89386 0.00000, -88.46..."
3,01,033,00161542,0500000US01033,01033,Colbert,06,1534878355,80029923,"POLYGON Z ((-88.13925 34.58779 0.00000, -88.13..."
4,01,047,00161549,0500000US01047,01047,Dallas,06,2534807336,39134779,"POLYGON Z ((-87.47308 32.30761 0.00000, -87.42..."
...,...,...,...,...,...,...,...,...,...,...
3228,17,015,00424209,0500000US17015,17015,Carroll,06,1152707810,55841451,"POLYGON Z ((-90.31795 42.19391 0.00000, -90.22..."
3229,31,131,00835887,0500000US31131,31131,Otoe,06,1594475777,8775373,"POLYGON Z ((-96.46384 40.69696 0.00000, -96.46..."
3230,33,011,00873179,0500000US33011,33011,Hillsborough,06,2269220216,41604851,"POLYGON Z ((-72.06124 42.96584 0.00000, -72.05..."
3231,39,027,01074026,0500000US39027,39027,Clinton,06,1058488036,9350863,"POLYGON Z ((-84.00373 39.28847 0.00000, -84.00..."


In [6]:
geometry_column=county_gdf['geometry']
geometry_column

0       POLYGON Z ((-86.92120 32.65754 0.00000, -86.92...
1       POLYGON Z ((-85.74803 31.61918 0.00000, -85.74...
2       POLYGON Z ((-88.47323 31.89386 0.00000, -88.46...
3       POLYGON Z ((-88.13925 34.58779 0.00000, -88.13...
4       POLYGON Z ((-87.47308 32.30761 0.00000, -87.42...
                              ...                        
3228    POLYGON Z ((-90.31795 42.19391 0.00000, -90.22...
3229    POLYGON Z ((-96.46384 40.69696 0.00000, -96.46...
3230    POLYGON Z ((-72.06124 42.96584 0.00000, -72.05...
3231    POLYGON Z ((-84.00373 39.28847 0.00000, -84.00...
3232    POLYGON Z ((-83.26448 39.54067 0.00000, -83.26...
Name: geometry, Length: 3233, dtype: geometry

In [7]:
lat_lon=geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-86.921195999999995,32.657541999999999
0,1,-86.920351999999994,32.658563000000001
0,2,-86.920411000000001,32.660077000000001
0,3,-86.917594999999992,32.664169000000001
0,4,-86.914609999999996,32.664355000000000
...,...,...,...
3232,120,-83.155472000000003,39.511775999999998
3232,121,-83.165610999999998,39.512189999999997
3232,122,-83.224839000000003,39.514578999999998
3232,123,-83.266736999999992,39.516248999999995


#### Fraction of time PM2.5 was above EPA threshold

In [8]:
def single_county(var):
    
    ''' Function to interpolate the values in a single county by taking a 100 or less latitude and longitude pair.
        And then to find the fraction of time, PM2.5 was above EPA standard
        Args:
        --------
             var (int): The index of the county in the shapefile.

        Returns:
        --------
            pm_above_df: Dataframe consisting the values of the variables interpolated in the county.
    '''   
    
    
    longitude= lat_lon.loc[(var), 'x']  # get the longitude
    extract_val=len(longitude) ## find number of longitude that a shapfile has

    ## if number of longitude or latitude is less than 100, that many lat-lon pairs will be extracted, if not, a 100 values
    if extract_val < 100:
        extract_val=extract_val
    else:
        extract_val=100

    longitude=longitude[:extract_val]  ## extract first 100 values or values less than 100
    latitude=lat_lon.loc[(var),'y']    ## get the latitude values
    latitude=latitude[:extract_val]    ## extract first 100 values or values less than 100

    lat_list=latitude.tolist()
    lon_list=longitude.tolist()

    ## find the corresponding values of the variables in the finer grid

    year_avg_finer= pm2_5.interp(longitude=lon_list, latitude=lat_list)
    
## get only the 100 (or less) pair of latitude and longitude from the lat and lon list from the 100,000 (or less) rows
    get_vals=[]

    for i in range(0, extract_val):
        row=year_avg_finer.isel(latitude=[i], longitude=[i])
        row_df=row.to_dataframe()
        get_vals.append(row_df)
    
    initial_df=pd.concat(get_vals)
    summary = initial_df.groupby("valid_time").mean(["latitude", "longitude"])
    df=summary

    total_rows=len(df)

    c=0
    for i in range(0, total_rows):
        if df['pm2p5'].values[i] > pm_threshold:
            c=c+1

## convert number to a fraction
    frac_time=(c/total_rows)*100
    pm_above_df=pd.DataFrame({'PM2.5 above threshold':[frac_time]})


    return pm_above_df

In [9]:
len(county_gdf)

3233

In [10]:
%%time

## get the values for all counties

df_list_below=[]

for i in range(0,len(county_gdf)): ## loop for the all list of counties
    try:
        df_list_below.append(single_county(i))
    except:
        pass

CPU times: total: 48min 26s
Wall time: 48min 55s


In [11]:
concatenate_df=pd.concat(df_list_below)
final_df=concatenate_df.reset_index(drop=True)
final_df

Unnamed: 0,PM2.5 above threshold
0,65.684931506849324
1,66.061643835616437
2,65.924657534246577
3,61.267123287671232
4,65.547945205479451
...,...
3228,67.294520547945197
3229,59.520547945205479
3230,61.815068493150683
3231,72.328767123287676


In [12]:
## lets first create a fips column in the original shape file

county_gdf['fips']=county_gdf['STATEFP'] + county_gdf['COUNTYFP']
county_gdf.dtypes
## convert the data type
county_gdf['fips']=county_gdf['fips'].astype(str).astype(int)
county_gdf.dtypes

county_gdf = county_gdf.drop(['STATEFP','COUNTYFP','COUNTYNS','AFFGEOID','GEOID','LSAD','ALAND','AWATER'], axis=1)
county_gdf

Unnamed: 0,NAME,geometry,fips
0,Autauga,"POLYGON Z ((-86.92120 32.65754 0.00000, -86.92...",1001
1,Barbour,"POLYGON Z ((-85.74803 31.61918 0.00000, -85.74...",1005
2,Choctaw,"POLYGON Z ((-88.47323 31.89386 0.00000, -88.46...",1023
3,Colbert,"POLYGON Z ((-88.13925 34.58779 0.00000, -88.13...",1033
4,Dallas,"POLYGON Z ((-87.47308 32.30761 0.00000, -87.42...",1047
...,...,...,...
3228,Carroll,"POLYGON Z ((-90.31795 42.19391 0.00000, -90.22...",17015
3229,Otoe,"POLYGON Z ((-96.46384 40.69696 0.00000, -96.46...",31131
3230,Hillsborough,"POLYGON Z ((-72.06124 42.96584 0.00000, -72.05...",33011
3231,Clinton,"POLYGON Z ((-84.00373 39.28847 0.00000, -84.00...",39027


In [13]:
merged_df=pd.merge(county_gdf, final_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,NAME,geometry,fips,PM2.5 above threshold
0,Autauga,"POLYGON Z ((-86.92120 32.65754 0.00000, -86.92...",1001,65.684931506849324
1,Barbour,"POLYGON Z ((-85.74803 31.61918 0.00000, -85.74...",1005,66.061643835616437
2,Choctaw,"POLYGON Z ((-88.47323 31.89386 0.00000, -88.46...",1023,65.924657534246577
3,Colbert,"POLYGON Z ((-88.13925 34.58779 0.00000, -88.13...",1033,61.267123287671232
4,Dallas,"POLYGON Z ((-87.47308 32.30761 0.00000, -87.42...",1047,65.547945205479451
...,...,...,...,...
3228,Carroll,"POLYGON Z ((-90.31795 42.19391 0.00000, -90.22...",17015,67.294520547945197
3229,Otoe,"POLYGON Z ((-96.46384 40.69696 0.00000, -96.46...",31131,59.520547945205479
3230,Hillsborough,"POLYGON Z ((-72.06124 42.96584 0.00000, -72.05...",33011,61.815068493150683
3231,Clinton,"POLYGON Z ((-84.00373 39.28847 0.00000, -84.00...",39027,72.328767123287676


In [14]:
county_df=merged_df.dropna()
county_df.head()

Unnamed: 0,NAME,geometry,fips,PM2.5 above threshold
0,Autauga,"POLYGON Z ((-86.92120 32.65754 0.00000, -86.92...",1001,65.68493150684932
1,Barbour,"POLYGON Z ((-85.74803 31.61918 0.00000, -85.74...",1005,66.06164383561644
2,Choctaw,"POLYGON Z ((-88.47323 31.89386 0.00000, -88.46...",1023,65.92465753424658
3,Colbert,"POLYGON Z ((-88.13925 34.58779 0.00000, -88.13...",1033,61.26712328767123
4,Dallas,"POLYGON Z ((-87.47308 32.30761 0.00000, -87.42...",1047,65.54794520547945


In [15]:
## convert into pandas dataframe without the geometry column
pd.options.display.float_format = '{:.15f}'.format ## see 15 decimal places of the numbers
county_var=pd.DataFrame(county_df.drop(columns='geometry')) 
county_var

Unnamed: 0,NAME,fips,PM2.5 above threshold
0,Autauga,1001,65.684931506849324
1,Barbour,1005,66.061643835616437
2,Choctaw,1023,65.924657534246577
3,Colbert,1033,61.267123287671232
4,Dallas,1047,65.547945205479451
...,...,...,...
3228,Carroll,17015,67.294520547945197
3229,Otoe,31131,59.520547945205479
3230,Hillsborough,33011,61.815068493150683
3231,Clinton,39027,72.328767123287676


In [16]:
county_var.to_pickle('FE_PM25_2014.pkl')