In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr

### This notebook calculates the fraction of time (expressed as a percentage) during which the 2m temperature was above a certain threshold. Let's first do it for a single county.

In [2]:
## data_sfc.nc holds 3-hourly values for the entire year 2010 of the
## variables 2m temperature (t2m) and 2m dew point temperature (d2m)
data = xr.open_dataset("data_sfc.nc")
data

In [3]:
## Flatten the dataset into a table indexed by (valid_time, latitude, longitude) for a quick look
data.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01 00:00:00,48.75,-125.00,277.681854,279.335510
2010-01-01 00:00:00,48.75,-124.25,276.716034,277.768127
2010-01-01 00:00:00,48.75,-123.50,276.760956,277.172424
2010-01-01 00:00:00,48.75,-122.75,276.804901,276.965393
2010-01-01 00:00:00,48.75,-122.00,274.138885,274.655823
...,...,...,...,...
2010-12-31 21:00:00,24.00,-69.50,287.732727,295.728790
2010-12-31 21:00:00,24.00,-68.75,287.549133,295.571564
2010-12-31 21:00:00,24.00,-68.00,287.462219,295.528595
2010-12-31 21:00:00,24.00,-67.25,287.574524,295.526642


In [4]:
## Let's load the county shapefile (one row per county, with its boundary in the 'geometry' column)

SHAPE_PATH = os.path.join("County_shapefile",'gz_2010_us_050_00_500k.shp')
county_gdf = gpd.read_file(SHAPE_PATH)

county_gdf.head()

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry
0,0500000US01029,1,29,Cleburne,County,560.1,"POLYGON ((-85.38872 33.91304, -85.38088 33.873..."
1,0500000US01031,1,31,Coffee,County,678.972,"POLYGON ((-86.03044 31.61894, -86.00408 31.619..."
2,0500000US01037,1,37,Coosa,County,650.926,"POLYGON ((-86.00928 33.10164, -86.00917 33.090..."
3,0500000US01039,1,39,Covington,County,1030.456,"POLYGON ((-86.34851 30.99434, -86.35023 30.994..."
4,0500000US01041,1,41,Crenshaw,County,608.84,"POLYGON ((-86.14699 31.68045, -86.14711 31.663..."


In [5]:
## Keep only the geometry column; its row labels are the county positions in the shapefile
geometry_column=county_gdf['geometry']
geometry_column

0       POLYGON ((-85.38872 33.91304, -85.38088 33.873...
1       POLYGON ((-86.03044 31.61894, -86.00408 31.619...
2       POLYGON ((-86.00928 33.10164, -86.00917 33.090...
3       POLYGON ((-86.34851 30.99434, -86.35023 30.994...
4       POLYGON ((-86.14699 31.68045, -86.14711 31.663...
                              ...                        
3216    POLYGON ((-66.90748 18.25314, -66.90739 18.253...
3217    POLYGON ((-66.37968 17.94398, -66.38029 17.943...
3218    MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ...
3219    POLYGON ((-66.02917 18.37590, -66.02828 18.376...
3220    POLYGON ((-66.85229 17.95500, -66.85280 17.955...
Name: geometry, Length: 3221, dtype: geometry

In [6]:
## Check the container type: selecting the geometry column gives a GeoSeries
type(geometry_column)

geopandas.geoseries.GeoSeries

In [7]:
## Extract the boundary vertices of every geometry: x is the longitude, y is the latitude.
## With index_parts=True the result has a two-level index: the first level is the county's
## row in the shapefile, the second level counts the coordinates within that geometry.
lat_lon=geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-85.388717,33.913044
0,1,-85.380885,33.873508
0,2,-85.379455,33.866291
0,3,-85.377426,33.856047
0,4,-85.376403,33.850656
...,...,...,...
3220,202,-66.833718,17.989763
3220,203,-66.835282,17.988274
3220,204,-66.835429,17.986323
3220,205,-66.836682,17.965971


In [8]:
## The coordinates come back as a plain pandas DataFrame
type(lat_lon)

pandas.core.frame.DataFrame

## Find the values of the variables in the county with index 0 in the shapefile

In [9]:
## Draw 10 random boundary longitudes and, independently, 10 random boundary
## latitudes of county 0. No seed is set, so the values change on every run.
longitude = lat_lon.loc[0, 'x'].sample(n=10)   # x column = longitude
latitude = lat_lon.loc[0, 'y'].sample(n=10)    # y column = latitude

## plain Python lists are what the interpolation step consumes
lat_list = latitude.tolist()
lon_list = longitude.tolist()

print(lat_list)
print(lon_list)

[33.648413, 33.491644, 33.846495, 33.875101, 33.627254, 33.874865, 33.773508, 33.875273, 33.649158, 33.788445541993795]
[-85.36459509570929, -85.62783499999999, -85.324856, -85.344923, -85.585201, -85.316028, -85.63858599999999, -85.72364999999999, -85.35531499999999, -85.3427217288762]


In [10]:
## Find the corresponding values of the variables at the sampled county coordinates.
## The latitude and longitude lists are passed as separate dimensions, so every
## latitude/longitude combination is evaluated (10 x 10 points) for each time step.
## NOTE(review): despite its name, year_avg_finer is not an average yet; the spatial
## averaging only happens in a following cell.

year_avg_finer= data.interp(longitude=lon_list, latitude=lat_list)
year_avg_finer

In [11]:
## Tabular view of the interpolated values, indexed by (valid_time, latitude, longitude)
year_avg_finer.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01 00:00:00,33.648413,-85.364595,281.138717,281.473242
2010-01-01 00:00:00,33.648413,-85.627835,280.976463,281.433412
2010-01-01 00:00:00,33.648413,-85.324856,281.163211,281.479255
2010-01-01 00:00:00,33.648413,-85.344923,281.150842,281.476219
2010-01-01 00:00:00,33.648413,-85.585201,281.002741,281.439863
...,...,...,...,...
2010-12-31 21:00:00,33.788446,-85.316028,284.347838,293.184543
2010-12-31 21:00:00,33.788446,-85.638586,284.787294,293.469182
2010-12-31 21:00:00,33.788446,-85.723650,284.903186,293.544246
2010-12-31 21:00:00,33.788446,-85.355315,284.401363,293.219212


In [12]:
## Group by the valid_time dimension and average over latitude and longitude,
## which leaves one county-mean value of each variable per time step

summary = year_avg_finer.groupby("valid_time").mean(["latitude", "longitude"])
summary.to_dataframe()

Unnamed: 0_level_0,d2m,t2m
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 00:00:00,280.938616,281.315625
2010-01-01 03:00:00,279.473883,280.432857
2010-01-01 06:00:00,278.320478,279.613605
2010-01-01 09:00:00,277.418087,278.795451
2010-01-01 12:00:00,275.527593,277.620671
...,...,...
2010-12-31 09:00:00,277.606315,282.957717
2010-12-31 12:00:00,277.669115,282.870283
2010-12-31 15:00:00,279.233399,285.183035
2010-12-31 18:00:00,282.843364,291.324004


In [13]:
## The averaged result is still an xarray Dataset
type(summary)

xarray.core.dataset.Dataset

In [14]:
## Convert the per-time-step county means to a pandas DataFrame indexed by valid_time
df=summary.to_dataframe()
df

Unnamed: 0_level_0,d2m,t2m
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 00:00:00,280.938616,281.315625
2010-01-01 03:00:00,279.473883,280.432857
2010-01-01 06:00:00,278.320478,279.613605
2010-01-01 09:00:00,277.418087,278.795451
2010-01-01 12:00:00,275.527593,277.620671
...,...,...
2010-12-31 09:00:00,277.606315,282.957717
2010-12-31 12:00:00,277.669115,282.870283
2010-12-31 15:00:00,279.233399,285.183035
2010-12-31 18:00:00,282.843364,291.324004


In [15]:
## Number of time steps in the year (one row per valid_time)
total_rows=len(df)
total_rows

2920

In [16]:
## Threshold temperature in Kelvin (290 K = 16.85 degrees Celsius).
## single_county() also reads this notebook-level value.
threshold_temp=290

In [17]:
## Number of time steps whose county-mean 2m temperature is strictly above the threshold.
## The comparison is done on the whole column at once; summing the booleans counts the hits.
is_above = df['t2m'].values > threshold_temp
c = int(is_above.sum())
print(c)

1528


In [18]:
## Convert the count into a share of all time steps.
## Note: because of the factor 100, frac_time is a percentage (0-100), not a 0-1 fraction.

frac_time=(c/total_rows)*100
frac_time

52.32876712328767

## Let's do it for all the counties

In [19]:
def single_county(var, threshold=None, n_points=10, random_state=None):

    ''' Percentage of time steps in which a county's mean 2m temperature exceeds a threshold.

         Up to ``n_points`` longitudes and, independently, up to ``n_points`` latitudes are
         drawn at random from the boundary vertices of the county stored in the notebook-level
         ``lat_lon`` table. ``data.interp`` evaluates the gridded variables on every
         latitude/longitude combination of the two samples (10 x 10 = 100 points with the
         defaults). These points are averaged for each ``valid_time`` step (365 X 8 = 2920
         time steps for the 3-hourly 2010 data), and the share of time steps whose mean
         ``t2m`` is strictly above the threshold is returned as a percentage.

         Time steps whose county mean is NaN are left out of the calculation. If every time
         step is NaN (for example a county outside the area covered by ``data``), the result
         is NaN so that it can be removed later with ``dropna``.

        Args:
        --------
             var (int): The index of the county in the shapefile.
             threshold (float, optional): Threshold in the units of ``t2m`` (Kelvin).
                 Defaults to the notebook-level ``threshold_temp``.
             n_points (int, optional): Number of longitudes and of latitudes to sample.
                 Capped at the number of vertices available for the county. Defaults to 10.
             random_state (optional): Passed to ``Series.sample`` for both draws to make the
                 sampling reproducible. The default (None) draws different points on each call.

        Returns:
        --------
            county_df: One-row dataframe whose column 'Temp above threshold' holds the
                percentage (0-100) of time steps above the threshold, or NaN.
    '''

    if threshold is None:
        # Resolve the default at call time so a changed notebook setting is picked up.
        threshold = threshold_temp

    county_lon = lat_lon.loc[var, 'x']  # longitudes of the county's vertices
    county_lat = lat_lon.loc[var, 'y']  # latitudes of the county's vertices

    # Series.sample raises when asked for more rows than exist, so cap the sample
    # size for geometries that have fewer than n_points vertices.
    lon_list = county_lon.sample(n=min(n_points, len(county_lon)),
                                 random_state=random_state).tolist()
    lat_list = county_lat.sample(n=min(n_points, len(county_lat)),
                                 random_state=random_state).tolist()

    county_points = data.interp(longitude=lon_list, latitude=lat_list)

    # One county-mean value per time step.
    summary = county_points.groupby("valid_time").mean(["latitude", "longitude"])
    t2m = summary.to_dataframe()['t2m'].to_numpy(dtype=float)

    # NaN compares as "not above", which would silently turn missing data into 0 %.
    has_value = ~np.isnan(t2m)
    n_valid = int(has_value.sum())

    if n_valid == 0:
        frac_time = np.nan
    else:
        n_above = int((t2m[has_value] > threshold).sum())
        ## convert the number to a percentage of the valid time steps
        frac_time = (n_above / n_valid) * 100

    county_df = pd.DataFrame({'Temp above threshold': [frac_time]})

    return county_df

In [20]:
## Try the function on county 0. The value differs slightly from the step-by-step
## result above because a new random set of sample points is drawn on each call.
single_county(0)

Unnamed: 0,Temp above threshold
0,52.60274


In [21]:
%%time

## Compute the percentage of time above the threshold for each county.
## A total of 3221 counties. index in shape file starts from 0 and ends in 3220;
## only the first 50 are processed here.

df_list=[]
failed_counties={}  ## county index -> error that prevented the calculation

for i in range(0,50):
    try:
        df_list.append(single_county(i))
    except Exception as err:
        ## The results are matched to the shapefile by row position further down
        ## (reset_index followed by an index merge). Skipping a failed county would
        ## shift every following county onto the wrong row, so keep a NaN placeholder
        ## instead; it is removed by the later dropna.
        failed_counties[i]=repr(err)
        df_list.append(pd.DataFrame({'Temp above threshold':[np.nan]}))

if failed_counties:
    print(f"{len(failed_counties)} of {len(df_list)} counties failed: {failed_counties}")

CPU times: total: 1min 35s
Wall time: 1min 39s


In [22]:
## Stack the one-row results of all processed counties; every row still carries index 0 here
initial_df=pd.concat(df_list)

#### Note that there are NaN values because the data, 'data_sfc.nc', covers the continental USA, whereas the county shapefile also contains other locations such as Alaska, Hawaii, Guam, Puerto Rico, etc.

In [23]:
## Renumber the rows 0, 1, 2, ... in processing order. The merge with the shapefile
## below relies on row i here belonging to county i of the shapefile.
final_df=initial_df.reset_index(drop=True)
final_df

Unnamed: 0,Temp above threshold
0,52.705479
1,58.732877
2,54.623288
3,58.69863
4,58.458904
5,58.732877
6,51.130137
7,60.342466
8,53.356164
9,59.212329


### We will now need to merge the dataframe with the original shapefile.

In [24]:
## Add a 'fips' column to the shapefile table.

## The FIPS code is the STATE code followed by the COUNTY code. Both columns are
## strings, so '+' concatenates them.
## Note: not a numerical sum
## This step is necessary because there can be several counties with identical names.
fips_text = county_gdf['STATE'] + county_gdf['COUNTY']

## Store the code as an integer. Leading zeros are lost in the conversion
## (e.g. state '01' + county '029' becomes 1029).
county_gdf['fips'] = fips_text.astype(str).astype(int)
county_gdf.dtypes

GEO_ID          object
STATE           object
COUNTY          object
NAME            object
LSAD            object
CENSUSAREA     float64
geometry      geometry
fips             int32
dtype: object

In [25]:
## Keep only the county name, geometry and fips code
unused_columns = ['GEO_ID', 'CENSUSAREA', 'STATE', 'COUNTY', 'LSAD']
county_gdf = county_gdf.drop(columns=unused_columns)
county_gdf

Unnamed: 0,NAME,geometry,fips
0,Cleburne,"POLYGON ((-85.38872 33.91304, -85.38088 33.873...",1029
1,Coffee,"POLYGON ((-86.03044 31.61894, -86.00408 31.619...",1031
2,Coosa,"POLYGON ((-86.00928 33.10164, -86.00917 33.090...",1037
3,Covington,"POLYGON ((-86.34851 30.99434, -86.35023 30.994...",1039
4,Crenshaw,"POLYGON ((-86.14699 31.68045, -86.14711 31.663...",1041
...,...,...,...
3216,San Sebastián,"POLYGON ((-66.90748 18.25314, -66.90739 18.253...",72131
3217,Santa Isabel,"POLYGON ((-66.37968 17.94398, -66.38029 17.943...",72133
3218,Toa Baja,"MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ...",72137
3219,Trujillo Alto,"POLYGON ((-66.02917 18.37590, -66.02828 18.376...",72139


In [26]:
## Attach the results to the counties by matching row index to row index.
## Only index values present in both tables are kept, and the match is purely positional:
## row i of final_df must be the result for row i of county_gdf.
merged_df=pd.merge(county_gdf, final_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,NAME,geometry,fips,Temp above threshold
0,Cleburne,"POLYGON ((-85.38872 33.91304, -85.38088 33.873...",1029,52.705479
1,Coffee,"POLYGON ((-86.03044 31.61894, -86.00408 31.619...",1031,58.732877
2,Coosa,"POLYGON ((-86.00928 33.10164, -86.00917 33.090...",1037,54.623288
3,Covington,"POLYGON ((-86.34851 30.99434, -86.35023 30.994...",1039,58.69863
4,Crenshaw,"POLYGON ((-86.14699 31.68045, -86.14711 31.663...",1041,58.458904
5,Dale,"POLYGON ((-85.79043 31.32027, -85.79033 31.323...",1045,58.732877
6,DeKalb,"POLYGON ((-85.57593 34.82373, -85.56142 34.750...",1049,51.130137
7,Escambia,"POLYGON ((-87.16308 30.99904, -87.16408 30.999...",1053,60.342466
8,Fayette,"POLYGON ((-87.63593 33.87874, -87.63604 33.872...",1057,53.356164
9,Geneva,"POLYGON ((-85.77267 30.99462, -85.77966 30.994...",1061,59.212329


In [27]:
## Drop the rows that contain a missing value in any column
county_df=merged_df.dropna()
county_df.head()

Unnamed: 0,NAME,geometry,fips,Temp above threshold
0,Cleburne,"POLYGON ((-85.38872 33.91304, -85.38088 33.873...",1029,52.705479
1,Coffee,"POLYGON ((-86.03044 31.61894, -86.00408 31.619...",1031,58.732877
2,Coosa,"POLYGON ((-86.00928 33.10164, -86.00917 33.090...",1037,54.623288
3,Covington,"POLYGON ((-86.34851 30.99434, -86.35023 30.994...",1039,58.69863
4,Crenshaw,"POLYGON ((-86.14699 31.68045, -86.14711 31.663...",1041,58.458904


In [28]:
## The merged table is still a GeoDataFrame
type(county_df)

geopandas.geodataframe.GeoDataFrame

In [29]:
## convert into pandas dataframe without the geometry column

county_var=pd.DataFrame(county_df.drop(columns='geometry')) 
county_var

Unnamed: 0,NAME,fips,Temp above threshold
0,Cleburne,1029,52.705479
1,Coffee,1031,58.732877
2,Coosa,1037,54.623288
3,Covington,1039,58.69863
4,Crenshaw,1041,58.458904
5,Dale,1045,58.732877
6,DeKalb,1049,51.130137
7,Escambia,1053,60.342466
8,Fayette,1057,53.356164
9,Geneva,1061,59.212329


In [None]:
#county_var.to_pickle('2m_temp_feature_engineering.pkl')
