### This notebook computes county-level values from gridded data. For each county, 10 latitudes and 10 longitudes are sampled from the coordinates of its shapefile geometry, which gives a set of 100 latitude/longitude points. The values of the variables in the NetCDF file are then interpolated from the grid onto this set of points and averaged over the county.

In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr

In [2]:
## one month average of the variables
DATA_PATH = "data.nc"
data = xr.open_dataset(DATA_PATH)
data

In [3]:
## Step 1: collapse the time axis into one mean per calendar year
yearly_mean = data.resample(valid_time='1YE').mean(dim='valid_time')

## Step 2: drop every variable that still contains at least one NaN
vars_with_nan = [
    name
    for name, values in yearly_mean.data_vars.items()
    if values.isnull().any()
]
year_avg = yearly_mean.drop_vars(vars_with_nan)
year_avg

In [4]:
## Note: the time coordinate shows a single date (2010-12-31 in the output below)
## because the yearly resampling labels each year by its last day. Each row is the
## average over that whole year, not a value for that particular day.
year_avg.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m,bcaod550,chnk,duaod550,lai_hv,lai_lv,msl,omaod550,pm2p5,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2010-12-31,49.5,-126.00,278.348907,281.556824,0.006512,0.014148,0.001253,3.752065,2.928701,101390.726562,0.055563,5.288607e-09,...,5.697568e-07,1.359443e-07,2.107467e-06,5.559153e-07,0.000009,6.538563e-07,0.000050,0.000015,0.000003,6.057056e-07
2010-12-31,49.5,-125.25,279.088837,282.437836,0.006916,0.013934,0.001246,4.228424,1.811178,101402.250000,0.060980,6.618710e-09,...,5.552510e-07,1.506114e-07,2.379295e-06,5.843853e-07,0.000010,6.358721e-07,0.000048,0.000014,0.000003,7.764698e-07
2010-12-31,49.5,-124.50,279.470001,282.923370,0.007213,0.013523,0.001238,2.194234,0.692861,101427.570312,0.063776,7.792362e-09,...,5.443322e-07,1.616488e-07,2.564180e-06,6.047914e-07,0.000011,6.118582e-07,0.000046,0.000013,0.000003,9.925919e-07
2010-12-31,49.5,-123.75,279.041290,282.562347,0.007478,0.013581,0.001204,0.905955,0.624898,101459.429688,0.067136,1.080445e-08,...,5.231616e-07,1.798799e-07,2.806439e-06,6.146706e-07,0.000011,5.744274e-07,0.000043,0.000011,0.000003,1.408917e-06
2010-12-31,49.5,-123.00,277.851837,281.525085,0.008017,0.014813,0.001140,3.628265,2.724324,101485.656250,0.076543,1.931933e-08,...,4.871432e-07,2.252995e-07,3.389544e-06,6.264335e-07,0.000012,5.279849e-07,0.000039,0.000010,0.000004,2.356623e-06
2010-12-31,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-31,24.0,-69.00,293.523407,297.542999,0.004872,0.012293,0.024328,0.000000,0.000000,101571.789062,0.046011,9.519893e-09,...,1.086019e-05,3.103107e-08,5.009322e-07,4.866120e-07,0.000009,7.770465e-07,0.000062,0.000016,0.000005,1.695241e-07
2010-12-31,24.0,-68.25,293.495728,297.488953,0.004856,0.012303,0.024477,0.000000,0.000000,101578.000000,0.045661,9.399162e-09,...,1.102058e-05,3.011268e-08,4.867323e-07,4.857583e-07,0.000009,7.737776e-07,0.000062,0.000015,0.000005,1.793884e-07
2010-12-31,24.0,-67.50,293.465454,297.455261,0.004833,0.012302,0.024507,0.000000,0.000000,101583.164062,0.045540,9.288230e-09,...,1.103588e-05,2.886006e-08,4.733052e-07,4.846137e-07,0.000009,7.727766e-07,0.000061,0.000015,0.000005,1.564795e-07
2010-12-31,24.0,-66.75,293.443970,297.427094,0.004824,0.012318,0.024585,0.000000,0.000000,101590.195312,0.045691,9.213097e-09,...,1.106749e-05,2.775320e-08,4.601205e-07,4.847639e-07,0.000009,7.745404e-07,0.000062,0.000015,0.000005,1.387977e-07


In [5]:
## Load the county shapefile from its folder next to the notebook

SHAPE_DIR = "County_shapefile"
SHAPE_FILE = 'gz_2010_us_050_00_500k.shp'
SHAPE_PATH = os.path.join(SHAPE_DIR, SHAPE_FILE)

county_gdf = gpd.read_file(SHAPE_PATH)
county_gdf

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry
0,0500000US01029,01,029,Cleburne,County,560.100,"POLYGON ((-85.38872 33.91304, -85.38088 33.873..."
1,0500000US01031,01,031,Coffee,County,678.972,"POLYGON ((-86.03044 31.61894, -86.00408 31.619..."
2,0500000US01037,01,037,Coosa,County,650.926,"POLYGON ((-86.00928 33.10164, -86.00917 33.090..."
3,0500000US01039,01,039,Covington,County,1030.456,"POLYGON ((-86.34851 30.99434, -86.35023 30.994..."
4,0500000US01041,01,041,Crenshaw,County,608.840,"POLYGON ((-86.14699 31.68045, -86.14711 31.663..."
...,...,...,...,...,...,...,...
3216,0500000US72131,72,131,San Sebastián,Muno,70.423,"POLYGON ((-66.90748 18.25314, -66.90739 18.253..."
3217,0500000US72133,72,133,Santa Isabel,Muno,34.023,"POLYGON ((-66.37968 17.94398, -66.38029 17.943..."
3218,0500000US72137,72,137,Toa Baja,Muno,23.241,"MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ..."
3219,0500000US72139,72,139,Trujillo Alto,Muno,20.764,"POLYGON ((-66.02917 18.37590, -66.02828 18.376..."


In [6]:
## Pull out the geometry column: one POLYGON / MULTIPOLYGON per county row
geometry_column=county_gdf['geometry']
geometry_column

0       POLYGON ((-85.38872 33.91304, -85.38088 33.873...
1       POLYGON ((-86.03044 31.61894, -86.00408 31.619...
2       POLYGON ((-86.00928 33.10164, -86.00917 33.090...
3       POLYGON ((-86.34851 30.99434, -86.35023 30.994...
4       POLYGON ((-86.14699 31.68045, -86.14711 31.663...
                              ...                        
3216    POLYGON ((-66.90748 18.25314, -66.90739 18.253...
3217    POLYGON ((-66.37968 17.94398, -66.38029 17.943...
3218    MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ...
3219    POLYGON ((-66.02917 18.37590, -66.02828 18.376...
3220    POLYGON ((-66.85229 17.95500, -66.85280 17.955...
Name: geometry, Length: 3221, dtype: geometry

In [7]:
## Sanity check: the column is a GeoSeries, so get_coordinates() can be called on it
type(geometry_column)

geopandas.geoseries.GeoSeries

In [8]:
## Flatten every county geometry into its coordinates: x is the longitude, y is the latitude.
## With index_parts=True the index has two levels: the county's row in the shapefile
## and the running number of the coordinate pair inside that county's geometry.
lat_lon=geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-85.388717,33.913044
0,1,-85.380885,33.873508
0,2,-85.379455,33.866291
0,3,-85.377426,33.856047
0,4,-85.376403,33.850656
...,...,...,...
3220,202,-66.833718,17.989763
3220,203,-66.835282,17.988274
3220,204,-66.835429,17.986323
3220,205,-66.836682,17.965971


In [9]:
def single_county(var, n_points=10, random_state=None):

    ''' Interpolate the yearly averages onto sample points of a single county and average them.

        ``n_points`` longitudes and ``n_points`` latitudes are drawn independently from the
        coordinates of the county geometry stored in ``lat_lon``. xarray interpolates ``year_avg``
        onto every combination of the two lists (10 x 10 = 100 points by default), and the
        result is averaged over those points for each ``valid_time``.

        Args:
        --------
             var (int): The index of the county in the shapefile.
             n_points (int): How many longitudes and how many latitudes to draw. Defaults to 10.
                 If the geometry has fewer coordinates than this, all of them are used instead
                 of raising an error.
             random_state (int, numpy.random.Generator or None): Seed (or generator) for the
                 random draws. Pass an int to make the result reproducible. The default, None,
                 draws a different sample on every call.

        Returns:
        --------
            summary_df: Dataframe consisting the values of the variables interpolated in the county,
                indexed by ``valid_time``.

        Raises:
        --------
            KeyError: If ``var`` is not a county index present in ``lat_lon``.
    '''

    # One generator shared by both draws, so a single seed fixes the whole sample.
    rng = np.random.default_rng(random_state)

    longitude = lat_lon.loc[var, 'x']  # all longitudes of the county geometry
    latitude = lat_lon.loc[var, 'y']   # all latitudes of the county geometry

    # Sampling without replacement fails when n is larger than the population,
    # so cap the sample size at the number of available coordinates.
    sample_size = min(n_points, len(longitude))

    lon_list = longitude.sample(n=sample_size, random_state=rng).tolist()
    lat_list = latitude.sample(n=sample_size, random_state=rng).tolist()

    # Interpolating with two lists evaluates the grid at every (latitude, longitude) combination.
    year_avg_finer = year_avg.interp(longitude=lon_list, latitude=lat_list)

    # Average over the sampled points, keeping one row per valid_time.
    summary = year_avg_finer.groupby("valid_time").mean(["latitude", "longitude"])
    summary_df = summary.to_dataframe()

    return summary_df

In [10]:
%%time

## get the average for all of the counties

df_list=[]

for i in range(0,3221): ## A total of 3221 counties. index in shape file starts from 0 and ends in 3220
    try:
        df_list.append(single_county(i))
    except:
        pass

CPU times: total: 2min 15s
Wall time: 2min 19s


In [11]:
## Stack the per-county summaries into one dataframe, in the order they were appended to df_list.
## Every row keeps valid_time as its index, so the index values repeat.
initial_df=pd.concat(df_list)
initial_df

Unnamed: 0_level_0,d2m,t2m,bcaod550,chnk,duaod550,lai_hv,lai_lv,msl,omaod550,pm2p5,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-31,282.615197,289.807924,0.006226,0.018000,0.007990,3.549790,2.739781,101708.405246,0.077264,1.671023e-08,...,0.000002,1.374248e-07,0.000003,5.240474e-07,0.000013,2.005139e-07,0.000013,7.403231e-07,0.000009,3.686137e-06
2010-12-31,284.441159,291.411693,0.006488,0.017673,0.011682,3.998708,2.346034,101689.631369,0.073586,1.516928e-08,...,0.000004,1.424448e-07,0.000002,5.468301e-07,0.000013,2.536508e-07,0.000018,1.307695e-06,0.000008,9.859617e-07
2010-12-31,283.213424,290.532010,0.006218,0.018000,0.009540,3.816695,2.585873,101700.475982,0.076082,1.542476e-08,...,0.000003,1.307955e-07,0.000003,5.297963e-07,0.000013,2.172036e-07,0.000015,8.857673e-07,0.000008,2.597143e-06
2010-12-31,284.306617,291.559378,0.006542,0.017846,0.012079,3.941387,2.558229,101690.568853,0.074415,1.539579e-08,...,0.000004,1.427734e-07,0.000002,5.522925e-07,0.000013,2.565877e-07,0.000018,1.342617e-06,0.000008,9.653151e-07
2010-12-31,283.955372,291.362576,0.006415,0.017985,0.011524,4.009065,2.479845,101691.410692,0.074276,1.477971e-08,...,0.000004,1.363801e-07,0.000002,5.451337e-07,0.000013,2.476614e-07,0.000017,1.225512e-06,0.000008,1.051246e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-31,,,,,,,,,,,...,,,,,,,,,,
2010-12-31,,,,,,,,,,,...,,,,,,,,,,
2010-12-31,,,,,,,,,,,...,,,,,,,,,,
2010-12-31,,,,,,,,,,,...,,,,,,,,,,


## Note that there are NaN values because the data in 'data.nc' covers only the continental USA, whereas the county shapefile also includes other locations such as Alaska, Hawaii, Guam, Puerto Rico, etc.

In [12]:
## Replace the valid_time index with a plain running row number; valid_time is not kept as a column.
## NOTE(review): after this step the row position is the only link back to county_gdf. It matches
## the shapefile index only if df_list holds exactly one row per county, in shapefile order —
## confirm this before merging.
final_df=initial_df.reset_index(drop=True)
final_df

Unnamed: 0,d2m,t2m,bcaod550,chnk,duaod550,lai_hv,lai_lv,msl,omaod550,pm2p5,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
0,282.615197,289.807924,0.006226,0.018000,0.007990,3.549790,2.739781,101708.405246,0.077264,1.671023e-08,...,0.000002,1.374248e-07,0.000003,5.240474e-07,0.000013,2.005139e-07,0.000013,7.403231e-07,0.000009,3.686137e-06
1,284.441159,291.411693,0.006488,0.017673,0.011682,3.998708,2.346034,101689.631369,0.073586,1.516928e-08,...,0.000004,1.424448e-07,0.000002,5.468301e-07,0.000013,2.536508e-07,0.000018,1.307695e-06,0.000008,9.859617e-07
2,283.213424,290.532010,0.006218,0.018000,0.009540,3.816695,2.585873,101700.475982,0.076082,1.542476e-08,...,0.000003,1.307955e-07,0.000003,5.297963e-07,0.000013,2.172036e-07,0.000015,8.857673e-07,0.000008,2.597143e-06
3,284.306617,291.559378,0.006542,0.017846,0.012079,3.941387,2.558229,101690.568853,0.074415,1.539579e-08,...,0.000004,1.427734e-07,0.000002,5.522925e-07,0.000013,2.565877e-07,0.000018,1.342617e-06,0.000008,9.653151e-07
4,283.955372,291.362576,0.006415,0.017985,0.011524,4.009065,2.479845,101691.410692,0.074276,1.477971e-08,...,0.000004,1.363801e-07,0.000002,5.451337e-07,0.000013,2.476614e-07,0.000017,1.225512e-06,0.000008,1.051246e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3216,,,,,,,,,,,...,,,,,,,,,,
3217,,,,,,,,,,,...,,,,,,,,,,
3218,,,,,,,,,,,...,,,,,,,,,,
3219,,,,,,,,,,,...,,,,,,,,,,


## Next, we need to merge this dataframe with the original county shapefile.