#### Notebook to feature-engineer the fraction of time the temperature was above 90 °F

In [None]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 15)

In [None]:
## location of the 2003 CAMS multi-level file, four directories above this notebook
PATH_1 = os.path.join(
    *[".."] * 4,
    "Weather_Data", "CAMS", "2003", "2003_multi_level_26_variables.nc",
)

## open the dataset, collapse the model_level dimension and discard the
## scalar coordinate that squeezing leaves behind
ml_26_variables = (
    xr.open_dataset(PATH_1)
    .squeeze(dim="model_level")
    .drop_vars("model_level")
)

## shift the longitudes by -360, as required for the multi-level data
ml_26_variables["longitude"] = ml_26_variables["longitude"] - 360

## keep only the temperature variable
temp = ml_26_variables["t"]
temp

In [None]:
## show the temperature DataArray as a table; the result is only displayed, not stored
temp.to_dataframe()

In [None]:
## pickled table with all variables, stored one directory above this notebook
PATH_2 = os.path.join("..", "CAMS_79_variables_2003.pkl")
get_data = pd.read_pickle(filepath_or_buffer=PATH_2)

## preview the first five rows
get_data.head(5)

In [None]:
## Threshold of 90 F converted to kelvin (about 305.372 K).
## 273.15 K on its own is 0 C / 32 F (freezing), not 90 F.
## NOTE(review): assumes the temperature variable `t` is stored in kelvin -- confirm
## against the dataset's `units` attribute.
threshold = (90 - 32) * 5 / 9 + 273.15

In [None]:
## The 2008 county shapefile is used because no shapefile from before 2008 could be found.
SHAPE_PATH = os.path.join(
    *[".."] * 4,
    "Shapefiles", "county_shapefiles", "2008_county_shapefile", "tl_2008_us_county.shp",
)

## read the county polygons into a GeoDataFrame and display it
county_gdf = gpd.read_file(SHAPE_PATH)
county_gdf

In [None]:
## pull out the geometry column of the county shapefile
geometry_column = county_gdf["geometry"]

## table of vertex coordinates (columns 'x' and 'y'); index_parts=True keeps the
## geometry-part index as an extra index level next to the county index
lat_lon = geometry_column.get_coordinates(index_parts=True)
lat_lon

In [None]:
def single_county(var):
    ''' Compute the percentage of time steps in which a county's mean temperature
        is above the global ``threshold``.

        The temperature field ``temp`` is interpolated at (at most) the first 100
        vertices that ``lat_lon`` lists for the county, the interpolated values are
        averaged per ``valid_time``, and the share of time steps whose mean is
        strictly greater than ``threshold`` is returned as a percentage.

        Relies on the notebook globals ``lat_lon``, ``temp`` and ``threshold``.

        Args:
        --------
             var (int): The index of the county in the shapefile.

        Returns:
        --------
            pm_above_df: One-row DataFrame with the single column
                'Temp above threshold', holding the percentage (0-100) of time
                steps whose county-mean temperature exceeds ``threshold``.

        Raises:
        --------
            ValueError: If the interpolated data contain no time steps.
    '''
    max_points = 100  ## upper bound on the number of vertices sampled per county

    ## first 100 vertices of the county (or all of them if there are fewer)
    longitude = lat_lon.loc[var, 'x'].iloc[:max_points]
    latitude = lat_lon.loc[var, 'y'].iloc[:max_points]

    ## Interpolate point-wise: both indexers share the "vertex" dimension, so
    ## xarray evaluates only the (lon_i, lat_i) pairs instead of building the full
    ## lon x lat grid and picking its diagonal afterwards.
    lon_points = xr.DataArray(longitude.to_numpy(), dims="vertex")
    lat_points = xr.DataArray(latitude.to_numpy(), dims="vertex")
    at_vertices = temp.interp(longitude=lon_points, latitude=lat_points)

    ## mean over the sampled vertices for every time step
    per_time_mean = at_vertices.to_dataframe().groupby("valid_time")["t"].mean()

    total_rows = len(per_time_mean)
    if total_rows == 0:
        raise ValueError(f"no time steps available for county index {var}")

    ## share of time steps above the threshold, expressed in percent
    frac_time = float((per_time_mean > threshold).sum()) / total_rows * 100
    pm_above_df = pd.DataFrame({'Temp above threshold': [frac_time]})

    return pm_above_df

In [None]:
## number of rows in the county shapefile table
len(county_gdf)

In [None]:
%%time

## get the values for all counties

df_list_below = []
failed_counties = []  ## (county index, exception) for every county that could not be processed

for i in range(len(county_gdf)):  ## loop over all counties in the shapefile
    try:
        df_list_below.append(single_county(i))
    except Exception as err:
        ## Keep one row per county even on failure: the results are later matched
        ## to the shapefile by row position, so dropping a county here would shift
        ## every following value onto the wrong county. The NaN placeholder keeps
        ## the positions aligned.
        failed_counties.append((i, err))
        df_list_below.append(pd.DataFrame({'Temp above threshold': [np.nan]}))

if failed_counties:
    print(f"{len(failed_counties)} of {len(county_gdf)} counties could not be processed")

In [None]:
## stack the per-county one-row frames on top of each other
concatenate_df = pd.concat(objs=df_list_below, axis=0)

## replace the repeated per-frame index with a fresh 0..n-1 index
final_df = concatenate_df.reset_index(drop=True)
final_df

In [None]:
## Build an integer fips column in the shapefile table by concatenating the
## state and county codes and converting the result to int.
county_gdf["fips"] = (
    (county_gdf["STATEFP"] + county_gdf["COUNTYFP"]).astype(str).astype(int)
)

## columns of the shapefile that are not needed any further
unused_columns = [
    "STATEFP", "COUNTYFP", "COUNTYNS", "CNTYIDFP", "NAMELSAD", "LSAD",
    "CLASSFP", "MTFCC", "CSAFP", "CBSAFP", "METDIVFP", "FUNCSTAT",
]
county_gdf = county_gdf.drop(columns=unused_columns)
county_gdf

In [None]:
## Inner join of the shapefile table and the per-county results on their row index.
## NOTE(review): this is only correct if final_df holds exactly one row per county,
## in shapefile order -- verify before trusting the merged values.
merged_df = pd.merge(
    left=county_gdf,
    right=final_df,
    how="inner",
    left_index=True,
    right_index=True,
)
merged_df

In [None]:
## discard every row that has a missing value in any column, then preview the result
county_df = merged_df.dropna(axis=0, how="any")
county_df.head(5)

In [None]:
## show floats with 15 decimal places
pd.set_option("display.float_format", "{:.15f}".format)

## plain pandas DataFrame holding every column except the geometry
without_geometry = county_df.drop(columns="geometry")
county_var = pd.DataFrame(without_geometry)
county_var

In [None]:
## save the per-county table as a pickle file (relative path, so it is written to the current working directory)
county_var.to_pickle('temp_above_threshold.pkl')