### Notebook to engineer features: (a) fraction of time temperature was above 90 °F, (b) fraction of time temperature was below 0 °C, and (c) fraction of time PM2.5 was above EPA standards

In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr

In [2]:
## Location of the 2010 CAMS multi-level file (26 variables)
PATH_1 = os.path.join(
    '../Weather_Data',
    'CAMS',
    '2010',
    '2010_multi_level_26_variables.nc',
)
ml_26_variables = xr.open_dataset(PATH_1)

## Keep only the temperature variable 't', collapse its 'model_level' dimension
## (squeeze only succeeds when that dimension has length 1), then discard the
## leftover 'model_level' coordinate.
temperature_squeezed = ml_26_variables['t'].squeeze('model_level')
temperature = temperature_squeezed.drop_vars('model_level')
temperature

In [3]:
## Convert longitudes from the 0..360 convention to the -180..180 convention.
## Only values above 180 are shifted, so running this cell a second time is a
## no-op instead of subtracting another 360 degrees, and values that are
## already within range are never pushed below -180.
lon_values = temperature['longitude'].values
temperature = temperature.assign_coords(
    longitude=np.where(lon_values > 180, lon_values - 360, lon_values)
)
temperature

In [4]:
## Flatten the temperature DataArray into a long table (one row per valid_time / latitude / longitude) for inspection
temperature.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,t
valid_time,latitude,longitude,Unnamed: 3_level_1
2010-01-01 00:00:00,49.25,-124.85,275.571350
2010-01-01 00:00:00,49.25,-124.10,275.530334
2010-01-01 00:00:00,49.25,-123.35,275.405334
2010-01-01 00:00:00,49.25,-122.60,274.099670
2010-01-01 00:00:00,49.25,-121.85,271.778381
...,...,...,...
2010-12-31 21:00:00,24.50,-70.10,295.699036
2010-12-31 21:00:00,24.50,-69.35,295.426575
2010-12-31 21:00:00,24.50,-68.60,295.209778
2010-12-31 21:00:00,24.50,-67.85,295.077942


In [5]:
## Load the 2010 county shapefile into a GeoDataFrame (one row per county)
SHAPE_PATH = os.path.join(
    '../Shapefiles',
    'county_shapefiles',
    '2010_county_shapefile',
    'gz_2010_us_050_00_500k.shp',
)
county_gdf = gpd.read_file(SHAPE_PATH)

## Preview the first five counties
county_gdf.head(5)

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry
0,0500000US01029,1,29,Cleburne,County,560.1,"POLYGON ((-85.38872 33.91304, -85.38088 33.873..."
1,0500000US01031,1,31,Coffee,County,678.972,"POLYGON ((-86.03044 31.61894, -86.00408 31.619..."
2,0500000US01037,1,37,Coosa,County,650.926,"POLYGON ((-86.00928 33.10164, -86.00917 33.090..."
3,0500000US01039,1,39,Covington,County,1030.456,"POLYGON ((-86.34851 30.99434, -86.35023 30.994..."
4,0500000US01041,1,41,Crenshaw,County,608.84,"POLYGON ((-86.14699 31.68045, -86.14711 31.663..."


In [6]:
## Pull out the county outlines (the 'geometry' column of the shapefile)
geometry_column = county_gdf.geometry
geometry_column

0       POLYGON ((-85.38872 33.91304, -85.38088 33.873...
1       POLYGON ((-86.03044 31.61894, -86.00408 31.619...
2       POLYGON ((-86.00928 33.10164, -86.00917 33.090...
3       POLYGON ((-86.34851 30.99434, -86.35023 30.994...
4       POLYGON ((-86.14699 31.68045, -86.14711 31.663...
                              ...                        
3216    POLYGON ((-66.90748 18.25314, -66.90739 18.253...
3217    POLYGON ((-66.37968 17.94398, -66.38029 17.943...
3218    MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ...
3219    POLYGON ((-66.02917 18.37590, -66.02828 18.376...
3220    POLYGON ((-66.85229 17.95500, -66.85280 17.955...
Name: geometry, Length: 3221, dtype: geometry

In [7]:
## Break every county outline into its vertices: x is the longitude, y is the latitude.
## index_parts=True yields a two-level index (county row, vertex position), so all
## vertices of one county can be selected with its row number.
lat_lon = geometry_column.get_coordinates(
    include_z=False,
    ignore_index=False,
    index_parts=True,
)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-85.388717,33.913044
0,1,-85.380885,33.873508
0,2,-85.379455,33.866291
0,3,-85.377426,33.856047
0,4,-85.376403,33.850656
...,...,...,...
3220,202,-66.833718,17.989763
3220,203,-66.835282,17.988274
3220,204,-66.835429,17.986323
3220,205,-66.836682,17.965971


In [8]:
max_temp=305.372  ## upper threshold in Kelvin: 90 degrees Fahrenheit = (90 - 32) * 5/9 + 273.15
min_temp=273.15 ## lower threshold in Kelvin: 0 degrees Celsius

In [9]:
def single_county(var, threshold=None, n_samples=10, random_state=0):
    
    ''' Percentage of time steps in which a county's mean temperature exceeds a threshold.

        Up to ``n_samples`` longitudes and ``n_samples`` latitudes are drawn at random from
        the county's boundary vertices. xarray interpolates the temperature field on the
        grid spanned by those values (10 x 10 = 100 points with the defaults), the grid is
        averaged at every time step, and the share of time steps whose average is above
        ``threshold`` is reported.

        Args:
        --------
             var (int): The index of the county in the shapefile.
             threshold (float, optional): Temperature threshold in the same unit as the
                 temperature data. Defaults to the notebook-level ``max_temp``.
             n_samples (int, optional): Number of longitudes and of latitudes to draw.
                 Counties with fewer boundary vertices use all of their vertices.
             random_state (int or None, optional): Seed for the vertex sampling. The default
                 of 0 makes repeated runs give the same answer; pass None to draw a fresh
                 random sample on every call.

        Returns:
        --------
            county_df: One-row dataframe with column 'Temp above threshold' holding the
                 value as a percentage (0-100), not a 0-1 fraction. The value is NaN when
                 the county has no valid interpolated temperature at any time step (for
                 example when it lies outside the temperature grid).
    '''    
    
    if threshold is None:
        threshold = max_temp

    longitude = lat_lon.loc[var, 'x']  # boundary longitudes of this county
    latitude = lat_lon.loc[var, 'y']   # boundary latitudes of this county

    # One shared generator: both draws are reproducible yet still independent of each other.
    rng = np.random.RandomState(random_state)

    # Never ask for more points than the county has; sampling without replacement would raise.
    lon_list = longitude.sample(n=min(n_samples, len(longitude)), random_state=rng).tolist()
    lat_list = latitude.sample(n=min(n_samples, len(latitude)), random_state=rng).tolist()

    county_temps = temperature.interp(longitude=lon_list, latitude=lat_list)

    # Average over the sampled grid, leaving one value per time step.
    spatial_mean = county_temps.mean(["latitude", "longitude"])
    values = np.asarray(spatial_mean.values, dtype=float).ravel()

    # Time steps with no valid grid point are NaN; leave them out instead of
    # silently counting them as "not above the threshold".
    valid = values[~np.isnan(values)]

    if valid.size == 0:
        frac_time = np.nan
    else:
        frac_time = float((valid > threshold).mean() * 100)

    county_df = pd.DataFrame({'Temp above threshold': [frac_time]})

    return county_df

In [10]:
## Try the function on the county at index 0 of the shapefile; the value returned is a percentage
single_county(0)

Unnamed: 0,Temp above threshold
0,2.979452
