## This notebook calculates the annual average of each weather variable (d2m, t2m) for every state in the continental USA.

In [1]:
## conda env Weather_Prediction

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import geopandas as gpd
import xarray as xr
import rioxarray ## what we need from rioxarray can be done using rio acessor
import regionmask

In [2]:
## open the NetCDF file holding the weather variables as an xarray Dataset

DATA_PATH = "data_sfc.nc"
data = xr.open_dataset(DATA_PATH)
data

In [3]:
## flatten the Dataset into a table indexed by (valid_time, latitude, longitude)
## so the raw d2m / t2m values can be inspected row by row
df=data.to_dataframe()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01 00:00:00,48.75,-125.00,277.681854,279.335510
2010-01-01 00:00:00,48.75,-124.25,276.716034,277.768127
2010-01-01 00:00:00,48.75,-123.50,276.760956,277.172424
2010-01-01 00:00:00,48.75,-122.75,276.804901,276.965393
2010-01-01 00:00:00,48.75,-122.00,274.138885,274.655823
...,...,...,...,...
2010-12-31 21:00:00,24.00,-69.50,287.732727,295.728790
2010-12-31 21:00:00,24.00,-68.75,287.549133,295.571564
2010-12-31 21:00:00,24.00,-68.00,287.462219,295.528595
2010-12-31 21:00:00,24.00,-67.25,287.574524,295.526642


In [4]:
## resample the time axis into 1-year bins ('1YE' = year-end frequency)
## and average inside each bin, i.e. one annual mean per grid cell
yearly_bins = data.resample(valid_time='1YE')
year_avg = yearly_bins.mean(dim='valid_time')
year_avg

In [5]:
## tabular view of the annual means: one row per (valid_time, latitude, longitude)
## NOTE: this rebinds `df`, replacing the raw-data table built earlier
df=year_avg.to_dataframe()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-31,48.75,-125.00,280.158722,282.717682
2010-12-31,48.75,-124.25,279.476868,282.623596
2010-12-31,48.75,-123.50,279.457245,283.005524
2010-12-31,48.75,-122.75,279.182251,282.910004
2010-12-31,48.75,-122.00,276.495148,280.681885
2010-12-31,...,...,...,...
2010-12-31,24.00,-69.50,293.573669,297.577881
2010-12-31,24.00,-68.75,293.532562,297.538727
2010-12-31,24.00,-68.00,293.503845,297.490417
2010-12-31,24.00,-67.25,293.475891,297.458191


In [6]:
## read the state-boundary shapefile into a GeoDataFrame

SHAPE_DIR = "State_shapefile"
SHAPE_FILE = 'cb_2018_us_state_500k.shp'
SHAPE_PATH = os.path.join(SHAPE_DIR, SHAPE_FILE)

states_gdf = gpd.read_file(SHAPE_PATH)
states_gdf.head()

Unnamed: 0,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry
0,28,1779790,0400000US28,28,MS,Mississippi,0,121533519481,3926919758,"MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ..."
1,37,1027616,0400000US37,37,NC,North Carolina,0,125923656064,13466071395,"MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ..."
2,40,1102857,0400000US40,40,OK,Oklahoma,0,177662925723,3374587997,"POLYGON ((-103.00257 36.52659, -103.00219 36.6..."
3,51,1779803,0400000US51,51,VA,Virginia,0,102257717110,8528531774,"MULTIPOLYGON (((-75.74241 37.80835, -75.74151 ..."
4,54,1779805,0400000US54,54,WV,West Virginia,0,62266474513,489028543,"POLYGON ((-82.64320 38.16909, -82.64300 38.169..."


In [7]:
## remove Alaska and the other non-continental rows, addressed by their row labels in states_gdf
## (author's note: this step is not strictly necessary, because after the later merge these
## territories would only carry NaN values, which are dropped anyway)
## NOTE(review): the labels below depend on the row order of this particular shapefile --
## confirm they still point at the intended rows if the file is ever replaced

NON_CONTINENTAL_ROWS = [13, 27, 36, 37, 38, 42, 44, 45]
state_gdf = states_gdf.drop(index=NON_CONTINENTAL_ROWS)

In [8]:
## names of the retained states, in the row order of state_gdf
state_list = state_gdf['NAME'].tolist()

## show how many states are left and the first name as a quick check
print(len(state_list), state_list[0], sep="\n")

48
Mississippi


In [9]:
def single_state(var):
    
    ''' Average every weather variable in ``year_avg`` over the grid cells of one state.

         Args:
         --------
             var (str): The name of the state, matched exactly against the ``NAME``
                        column of ``state_gdf`` (e.g. an entry of ``state_list``).

         Returns:
         --------
            final_df: Plain pandas DataFrame consisting of the shapefile columns of the
                      selected state and the corresponding averaged variables, which in
                      this case are d2m and t2m. The frame has no rows when no average
                      could be attached to the state or the merged row contains a NaN.
    '''
    
    # Row(s) of the requested state; the original row label is preserved.
    states = state_gdf[state_gdf.NAME.isin([var])]

    # Mask of the grid cells belonging to the state, then blank out everything else.
    state_mask = regionmask.mask_3D_geopandas(states, year_avg.longitude, year_avg.latitude)
    masked_avg = year_avg.where(state_mask)

    # Collapse space and time so that a single value per variable and region remains.
    summary = masked_avg.groupby("region").mean(["latitude", "longitude", "valid_time"])
    state_means = summary.to_dataframe()

    # Join on the index: the "region" labels line up with the shapefile row labels.
    # Only the selected row(s) are merged (previously the whole, unfiltered
    # `states_gdf` table was joined on every call and then trimmed by dropna()).
    attributes = pd.DataFrame(states)
    merged = pd.merge(attributes, state_means, left_index=True, right_index=True, how='left')

    # Keep the original contract: rows containing any NaN are discarded.
    final_df = merged.dropna()
    return final_df

In [10]:
## quick check: run the function on the first state in state_list
single_state(state_list[0])

Unnamed: 0,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry,d2m,t2m
0,28,1779790,0400000US28,28,MS,Mississippi,0,121533519481,3926919758,"MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ...",284.287445,291.118561


In [13]:
## apply single_state to every name in state_list and collect the per-state frames
df_list = [single_state(state_name) for state_name in state_list]

df_list[:4] ## view first 4 in the list

[  STATEFP   STATENS     AFFGEOID GEOID STUSPS         NAME LSAD         ALAND  \
 0      28  01779790  0400000US28    28     MS  Mississippi   00  121533519481   
 
        AWATER                                           geometry         d2m  \
 0  3926919758  MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ...  284.287445   
 
           t2m  
 0  291.118561  ,
   STATEFP   STATENS     AFFGEOID GEOID STUSPS            NAME LSAD  \
 1      37  01027616  0400000US37    37     NC  North Carolina   00   
 
           ALAND       AWATER  \
 1  125923656064  13466071395   
 
                                             geometry         d2m         t2m  
 1  MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ...  281.780121  288.697479  ,
   STATEFP   STATENS     AFFGEOID GEOID STUSPS      NAME LSAD         ALAND  \
 2      40  01102857  0400000US40    40     OK  Oklahoma   00  177662925723   
 
        AWATER                                           geometry         d2m  \
 2  3374587997  POLYG

In [14]:
## spot-check one of the constructed dataframes (the entry at position 1 of df_list)

df_list[1]

Unnamed: 0,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry,d2m,t2m
1,37,1027616,0400000US37,37,NC,North Carolina,0,125923656064,13466071395,"MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ...",281.780121,288.697479


In [15]:
## concatenate all the per-state dataframes in the list into one table (rows stacked)

final_df = pd.concat(df_list, axis=0)
final_df

Unnamed: 0,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,geometry,d2m,t2m
0,28,1779790,0400000US28,28,MS,Mississippi,0,121533519481,3926919758,"MULTIPOLYGON (((-88.50297 30.21523, -88.49176 ...",284.287445,291.118561
1,37,1027616,0400000US37,37,NC,North Carolina,0,125923656064,13466071395,"MULTIPOLYGON (((-75.72681 35.93584, -75.71827 ...",281.780121,288.697479
2,40,1102857,0400000US40,40,OK,Oklahoma,0,177662925723,3374587997,"POLYGON ((-103.00257 36.52659, -103.00219 36.6...",280.990356,289.267578
3,51,1779803,0400000US51,51,VA,Virginia,0,102257717110,8528531774,"MULTIPOLYGON (((-75.74241 37.80835, -75.74151 ...",279.872681,287.02005
4,54,1779805,0400000US54,54,WV,West Virginia,0,62266474513,489028543,"POLYGON ((-82.64320 38.16909, -82.64300 38.169...",278.584015,284.734833
5,22,1629543,0400000US22,22,LA,Louisiana,0,111897594374,23753621895,"MULTIPOLYGON (((-88.86770 29.86155, -88.86566 ...",286.155823,292.650696
6,26,1779789,0400000US26,26,MI,Michigan,0,146600952990,103885855702,"MULTIPOLYGON (((-83.19159 42.03537, -83.18993 ...",276.965485,282.09848
7,25,606926,0400000US25,25,MA,Massachusetts,0,20205125364,7129925486,"MULTIPOLYGON (((-70.23405 41.28565, -70.22361 ...",277.961884,284.236633
8,16,1779783,0400000US16,16,ID,Idaho,0,214049787659,2391722557,"POLYGON ((-117.24267 44.39655, -117.23484 44.3...",271.65448,279.552246
9,12,294478,0400000US12,12,FL,Florida,0,138949136250,31361101223,"MULTIPOLYGON (((-80.17628 25.52505, -80.17395 ...",288.242004,293.948853
