In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr

### This notebook calculates the fraction of time the 2m temperature was above a certain threshold. Let's do it for a single county first

In [2]:
## 3-hourly values for the entire year of 2010 of two variables:
## 2m temperature (t2m) and 2m dew point temperature (d2m)
data = xr.open_dataset("data_sfc.nc")
data

In [3]:
## tabular view of the raw data, indexed by (valid_time, latitude, longitude)
data.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01 00:00:00,48.75,-125.00,277.681854,279.335510
2010-01-01 00:00:00,48.75,-124.25,276.716034,277.768127
2010-01-01 00:00:00,48.75,-123.50,276.760956,277.172424
2010-01-01 00:00:00,48.75,-122.75,276.804901,276.965393
2010-01-01 00:00:00,48.75,-122.00,274.138885,274.655823
...,...,...,...,...
2010-12-31 21:00:00,24.00,-69.50,287.732727,295.728790
2010-12-31 21:00:00,24.00,-68.75,287.549133,295.571564
2010-12-31 21:00:00,24.00,-68.00,287.462219,295.528595
2010-12-31 21:00:00,24.00,-67.25,287.574524,295.526642


In [4]:
## Let's load the county shapefile: one row per county, with the county
## boundary stored in the `geometry` column

SHAPE_PATH = os.path.join("County_shapefile",'gz_2010_us_050_00_500k.shp')
county_gdf = gpd.read_file(SHAPE_PATH)

county_gdf.head()

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry
0,0500000US01029,1,29,Cleburne,County,560.1,"POLYGON ((-85.38872 33.91304, -85.38088 33.873..."
1,0500000US01031,1,31,Coffee,County,678.972,"POLYGON ((-86.03044 31.61894, -86.00408 31.619..."
2,0500000US01037,1,37,Coosa,County,650.926,"POLYGON ((-86.00928 33.10164, -86.00917 33.090..."
3,0500000US01039,1,39,Covington,County,1030.456,"POLYGON ((-86.34851 30.99434, -86.35023 30.994..."
4,0500000US01041,1,41,Crenshaw,County,608.84,"POLYGON ((-86.14699 31.68045, -86.14711 31.663..."


In [5]:
## get the geometry column: the boundary (POLYGON or MULTIPOLYGON) of every county
geometry_column=county_gdf['geometry']
geometry_column

0       POLYGON ((-85.38872 33.91304, -85.38088 33.873...
1       POLYGON ((-86.03044 31.61894, -86.00408 31.619...
2       POLYGON ((-86.00928 33.10164, -86.00917 33.090...
3       POLYGON ((-86.34851 30.99434, -86.35023 30.994...
4       POLYGON ((-86.14699 31.68045, -86.14711 31.663...
                              ...                        
3216    POLYGON ((-66.90748 18.25314, -66.90739 18.253...
3217    POLYGON ((-66.37968 17.94398, -66.38029 17.943...
3218    MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ...
3219    POLYGON ((-66.02917 18.37590, -66.02828 18.376...
3220    POLYGON ((-66.85229 17.95500, -66.85280 17.955...
Name: geometry, Length: 3221, dtype: geometry

In [6]:
## check the container type: get_coordinates() used in the next step is called on this object
type(geometry_column)

geopandas.geoseries.GeoSeries

In [7]:
## get lat and lon from the geometry: column x is used as the longitude and column y as the
## latitude below. The first index level is the row of the county in the shapefile; the second
## level numbers the boundary points within that county.
lat_lon=geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-85.388717,33.913044
0,1,-85.380885,33.873508
0,2,-85.379455,33.866291
0,3,-85.377426,33.856047
0,4,-85.376403,33.850656
...,...,...,...
3220,202,-66.833718,17.989763
3220,203,-66.835282,17.988274
3220,204,-66.835429,17.986323
3220,205,-66.836682,17.965971


In [8]:
## the coordinates come back as a plain pandas DataFrame, so .loc can be used to select a county
type(lat_lon)

pandas.core.frame.DataFrame

## Find value of the variables in the county with index 0 in the shape file

In [9]:
## Draw 10 random boundary longitudes and, independently, 10 random boundary latitudes
## of county 0. The longitudes are drawn first, then the latitudes.
## NOTE: no random_state is passed, so the sampled values change from run to run
## (unless the global NumPy seed is fixed somewhere else).
longitude = lat_lon.loc[0, 'x'].sample(n=10)
latitude = lat_lon.loc[0, 'y'].sample(n=10)

## plain Python lists are what gets handed to the interpolation below
lat_list = latitude.tolist()
lon_list = longitude.tolist()
print(lat_list)
print(lon_list)

[33.498593, 33.495371, 33.488358999999996, 33.675952845587, 33.535897999999996, 33.476757, 33.496623, 33.476768, 33.627254, 33.927068064286296]
[-85.308211, -85.313999, -85.49582, -85.36459509570929, -85.532482, -85.781244, -85.723074, -85.765427, -85.636882, -85.314852]


In [10]:
## interpolate the gridded variables onto the sampled coordinates. Every latitude/longitude
## combination is evaluated, so 10 latitudes x 10 longitudes give 100 points per time step.
## (Despite its name, `year_avg_finer` is not averaged yet: it still holds every time step.)

year_avg_finer= data.interp(longitude=lon_list, latitude=lat_list)
year_avg_finer

In [11]:
## tabular view of the interpolated values, indexed by (valid_time, latitude, longitude)
year_avg_finer.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01 00:00:00,33.498593,-85.308211,281.416343,281.735901
2010-01-01 00:00:00,33.498593,-85.313999,281.412271,281.734689
2010-01-01 00:00:00,33.498593,-85.495820,281.284358,281.696632
2010-01-01 00:00:00,33.498593,-85.364595,281.376676,281.724099
2010-01-01 00:00:00,33.498593,-85.532482,281.258566,281.688958
...,...,...,...,...
2010-12-31 21:00:00,33.927068,-85.781244,284.886402,293.381570
2010-12-31 21:00:00,33.927068,-85.723074,284.805148,293.328951
2010-12-31 21:00:00,33.927068,-85.765427,284.864308,293.367263
2010-12-31 21:00:00,33.927068,-85.636882,284.684753,293.250983


In [12]:
## group by the valid time dimension, and take the average over the sampled latitudes and
## longitudes: this leaves one county-mean value per variable for every time step

summary = year_avg_finer.groupby("valid_time").mean(["latitude", "longitude"])
summary.to_dataframe()

Unnamed: 0_level_0,d2m,t2m
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 00:00:00,281.160694,281.580796
2010-01-01 03:00:00,279.604649,280.592181
2010-01-01 06:00:00,278.397518,279.757538
2010-01-01 09:00:00,277.552315,278.999388
2010-01-01 12:00:00,275.736988,277.813634
...,...,...
2010-12-31 09:00:00,278.118978,283.009587
2010-12-31 12:00:00,278.152192,282.964479
2010-12-31 15:00:00,279.609578,285.489597
2010-12-31 18:00:00,283.216541,291.791872


In [13]:
## the averaged result is still an xarray Dataset, so it is converted to a DataFrame next
type(summary)

xarray.core.dataset.Dataset

In [14]:
## county-mean time series as a pandas DataFrame: indexed by valid_time, one column per variable
df=summary.to_dataframe()
df

Unnamed: 0_level_0,d2m,t2m
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 00:00:00,281.160694,281.580796
2010-01-01 03:00:00,279.604649,280.592181
2010-01-01 06:00:00,278.397518,279.757538
2010-01-01 09:00:00,277.552315,278.999388
2010-01-01 12:00:00,275.736988,277.813634
...,...,...
2010-12-31 09:00:00,278.118978,283.009587
2010-12-31 12:00:00,278.152192,282.964479
2010-12-31 15:00:00,279.609578,285.489597
2010-12-31 18:00:00,283.216541,291.791872


In [15]:
## number of time steps in the series: 365 days x 8 three-hourly values per day = 2920
total_rows=len(df)
total_rows

2920

In [16]:
## threshold for the 2m temperature. The temperature is in Kelvin (290 K is about 16.85 deg C)
threshold_temp=290

In [17]:
## number of time steps whose county-mean 2m temperature is above the threshold:
## compare the whole column at once and count the True entries
c = int((df['t2m'].values > threshold_temp).sum())
print(c)

1543


In [18]:
## convert the count to a share of all time steps.
## NOTE: because of the factor 100 the value is a percentage (0-100), not a 0-1 fraction.

frac_time=(c/total_rows)*100
frac_time

52.84246575342466

## Let's do it for all the counties

In [19]:
def single_county(var, n_points=10, random_state=None):
    ''' Percentage of time steps in which the mean 2m temperature of a county is above the threshold.

         A random sample of `n_points` longitudes and `n_points` latitudes is taken from the
         boundary points of the county. The interpolation done by xarray evaluates every
         latitude/longitude combination, which gives a set of n_points X n_points points
         (100 with the default of 10). These points are averaged, so a single county
         consists of one value per time step (365 X 8 = 2920 time steps for the 3-hourly
         data of 2010). From this series we find the percentage of time steps in which the
         2m temperature (`t2m`) was above the notebook-level `threshold_temp`.

        Args:
        --------
             var (int): The index of the county in the shapefile.
             n_points (int): Number of longitudes and number of latitudes to sample.
                 Capped at the number of boundary points of the county, because sampling
                 more points than are available would raise an error.
             random_state: Passed unchanged to both pandas `sample` calls. The default
                 (None) keeps the unseeded sampling; pass an int for repeatable results.

        Returns:
        --------
            frac_time (float): Percentage (0-100) of the time steps with a valid mean 2m
                temperature that are above `threshold_temp`. NaN when no time step has a
                valid value -- presumably the case when every sampled point lies outside
                the grid of `data` (to be confirmed against the xarray `interp` behaviour).
    '''

    longitude = lat_lon.loc[var, 'x']  # boundary longitudes of the county
    latitude = lat_lon.loc[var, 'y']   # boundary latitudes of the county

    # never ask for more points than the county boundary actually has
    n_sample = min(n_points, len(longitude))

    # longitudes are drawn first, then latitudes, so the use of the random number
    # generator is the same as in the single-county walkthrough
    lon_list = longitude.sample(n=n_sample, random_state=random_state).tolist()
    lat_list = latitude.sample(n=n_sample, random_state=random_state).tolist()

    # `data` is the dataset loaded at the top of the notebook; the name `year_avg`
    # used here before is not defined anywhere in the notebook
    county_points = data.interp(longitude=lon_list, latitude=lat_list)

    # one county-mean value per variable and time step
    summary = county_points.groupby("valid_time").mean(["latitude", "longitude"])
    t2m = summary.to_dataframe()['t2m'].to_numpy()

    # a NaN compares as "not above the threshold", which would silently report 0 %
    # for a county without data; count only the time steps that have a value
    valid_t2m = t2m[~np.isnan(t2m)]
    if valid_t2m.size == 0:
        return float('nan')

    n_above = int((valid_t2m > threshold_temp).sum())

    ## convert the count to a percentage of the valid time steps
    frac_time = (n_above / valid_t2m.size) * 100

    return frac_time