In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import geopandas as gpd
import xarray as xr
import regionmask

In [2]:
## Open the NetCDF file; per the author's note it holds one-month averages of the variables.
data = xr.open_dataset("data.nc")
data

In [3]:
## Resample the monthly data to one mean value per year.
## The '1YE' bins are labelled with the year-end date, which is why the
## resulting valid_time reads 2010-12-31 even though each value is an annual mean.

year_avg = data.resample(valid_time='1YE').mean(dim='valid_time')
year_avg

In [4]:
## Drop every data variable that still contains at least one NaN
## after the yearly averaging.

vars_with_nan = [
    name
    for name in year_avg.data_vars
    if bool(year_avg[name].isnull().any())
]
year_avg = year_avg.drop_vars(vars_with_nan)
year_avg

In [5]:
## Note: although the time dimension shows 2010-12-31, each row is
## actually the yearly average, not a value for that single day.
year_avg.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m,bcaod550,chnk,duaod550,lai_hv,lai_lv,msl,omaod550,pm2p5,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2010-12-31,49.5,-126.00,278.348907,281.556824,0.006512,0.014148,0.001253,3.752065,2.928701,101390.726562,0.055563,5.288607e-09,...,5.697568e-07,1.359443e-07,2.107467e-06,5.559153e-07,0.000009,6.538563e-07,0.000050,0.000015,0.000003,6.057056e-07
2010-12-31,49.5,-125.25,279.088837,282.437836,0.006916,0.013934,0.001246,4.228424,1.811178,101402.250000,0.060980,6.618710e-09,...,5.552510e-07,1.506114e-07,2.379295e-06,5.843853e-07,0.000010,6.358721e-07,0.000048,0.000014,0.000003,7.764698e-07
2010-12-31,49.5,-124.50,279.470001,282.923370,0.007213,0.013523,0.001238,2.194234,0.692861,101427.570312,0.063776,7.792362e-09,...,5.443322e-07,1.616488e-07,2.564180e-06,6.047914e-07,0.000011,6.118582e-07,0.000046,0.000013,0.000003,9.925919e-07
2010-12-31,49.5,-123.75,279.041290,282.562347,0.007478,0.013581,0.001204,0.905955,0.624898,101459.429688,0.067136,1.080445e-08,...,5.231616e-07,1.798799e-07,2.806439e-06,6.146706e-07,0.000011,5.744274e-07,0.000043,0.000011,0.000003,1.408917e-06
2010-12-31,49.5,-123.00,277.851837,281.525085,0.008017,0.014813,0.001140,3.628265,2.724324,101485.656250,0.076543,1.931933e-08,...,4.871432e-07,2.252995e-07,3.389544e-06,6.264335e-07,0.000012,5.279849e-07,0.000039,0.000010,0.000004,2.356623e-06
2010-12-31,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-31,24.0,-69.00,293.523407,297.542999,0.004872,0.012293,0.024328,0.000000,0.000000,101571.789062,0.046011,9.519893e-09,...,1.086019e-05,3.103107e-08,5.009322e-07,4.866120e-07,0.000009,7.770465e-07,0.000062,0.000016,0.000005,1.695241e-07
2010-12-31,24.0,-68.25,293.495728,297.488953,0.004856,0.012303,0.024477,0.000000,0.000000,101578.000000,0.045661,9.399162e-09,...,1.102058e-05,3.011268e-08,4.867323e-07,4.857583e-07,0.000009,7.737776e-07,0.000062,0.000015,0.000005,1.793884e-07
2010-12-31,24.0,-67.50,293.465454,297.455261,0.004833,0.012302,0.024507,0.000000,0.000000,101583.164062,0.045540,9.288230e-09,...,1.103588e-05,2.886006e-08,4.733052e-07,4.846137e-07,0.000009,7.727766e-07,0.000061,0.000015,0.000005,1.564795e-07
2010-12-31,24.0,-66.75,293.443970,297.427094,0.004824,0.012318,0.024585,0.000000,0.000000,101590.195312,0.045691,9.213097e-09,...,1.106749e-05,2.775320e-08,4.601205e-07,4.847639e-07,0.000009,7.745404e-07,0.000062,0.000015,0.000005,1.387977e-07


In [10]:
## Let's load the county shapefile.

SHAPE_PATH = os.path.join("County_shapefile", "gz_2010_us_050_00_500k.shp")
county_gdf = gpd.read_file(SHAPE_PATH)

county_gdf.head()

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry
0,0500000US01029,1,29,Cleburne,County,560.1,"POLYGON ((-85.38872 33.91304, -85.38088 33.873..."
1,0500000US01031,1,31,Coffee,County,678.972,"POLYGON ((-86.03044 31.61894, -86.00408 31.619..."
2,0500000US01037,1,37,Coosa,County,650.926,"POLYGON ((-86.00928 33.10164, -86.00917 33.090..."
3,0500000US01039,1,39,Covington,County,1030.456,"POLYGON ((-86.34851 30.99434, -86.35023 30.994..."
4,0500000US01041,1,41,Crenshaw,County,608.84,"POLYGON ((-86.14699 31.68045, -86.14711 31.663..."


In [12]:
## Keep only the geometry column (one POLYGON / MULTIPOLYGON entry per county row).
geometry_column = county_gdf["geometry"]
geometry_column

0       POLYGON ((-85.38872 33.91304, -85.38088 33.873...
1       POLYGON ((-86.03044 31.61894, -86.00408 31.619...
2       POLYGON ((-86.00928 33.10164, -86.00917 33.090...
3       POLYGON ((-86.34851 30.99434, -86.35023 30.994...
4       POLYGON ((-86.14699 31.68045, -86.14711 31.663...
                              ...                        
3216    POLYGON ((-66.90748 18.25314, -66.90739 18.253...
3217    POLYGON ((-66.37968 17.94398, -66.38029 17.943...
3218    MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ...
3219    POLYGON ((-66.02917 18.37590, -66.02828 18.376...
3220    POLYGON ((-66.85229 17.95500, -66.85280 17.955...
Name: geometry, Length: 3221, dtype: geometry

In [26]:
## Inspect what kind of object the geometry column is.
type(geometry_column)

geopandas.geoseries.GeoSeries

In [21]:
## Flatten every geometry into its coordinate pairs. With index_parts=True the
## displayed result has a two-level index (row of county_gdf, running number of
## the coordinate within that geometry) and the columns x and y.
## NOTE(review): despite the name lat_lon, the columns are x / y — presumably
## longitude / latitude in that order; confirm against the shapefile CRS.
lat_lon=geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-85.388717,33.913044
0,1,-85.380885,33.873508
0,2,-85.379455,33.866291
0,3,-85.377426,33.856047
0,4,-85.376403,33.850656
...,...,...,...
3220,202,-66.833718,17.989763
3220,203,-66.835282,17.988274
3220,204,-66.835429,17.986323
3220,205,-66.836682,17.965971


In [22]:
## Inspect what kind of object get_coordinates returned.
type(lat_lon)

pandas.core.frame.DataFrame

In [90]:
## x coordinates of the geometry at outer index 0, truncated to its
## first 10 coordinate pairs.
longitude = lat_lon.loc[0, "x"].iloc[:10]
longitude

0   -85.388717
1   -85.380885
2   -85.379455
3   -85.377426
4   -85.376403
5   -85.364595
6   -85.361844
7   -85.360491
8   -85.357402
9   -85.355252
Name: x, dtype: float64

In [91]:
## Inspect the type of the coordinate slice extracted in the previous cell.
## (This used to reference `value`, which is only defined in a scratch cell
## further down, so the cell failed with NameError on a fresh kernel.)
type(longitude)

pandas.core.series.Series

In [92]:
## y coordinates of the geometry at outer index 0, truncated to its
## first 10 coordinate pairs.
latitude = lat_lon.loc[0, "y"].iloc[:10]
latitude

0    33.913044
1    33.873508
2    33.866291
3    33.856047
4    33.850656
5    33.788446
6    33.773951
7    33.767958
8    33.750104
9    33.739245
Name: y, dtype: float64

In [93]:
## Convert the latitude Series to a plain Python list for use as interpolation targets.
lat_list=latitude.tolist()
lat_list

[33.9130442466707,
 33.8735079039461,
 33.8662906454797,
 33.856047,
 33.8506557456616,
 33.788445541993795,
 33.773951,
 33.767958,
 33.750104,
 33.739245277912495]

In [94]:
## Convert the longitude Series to a plain Python list for use as interpolation targets.
lon_list=longitude.tolist()
lon_list

[-85.3887171312565,
 -85.38088499793109,
 -85.3794552619804,
 -85.377426,
 -85.3764027281585,
 -85.36459509570929,
 -85.36184399999999,
 -85.360491,
 -85.357402,
 -85.35525221574429]

In [95]:
## Interpolate the yearly-average variables onto the county-boundary coordinates,
## which are finer than the source grid.
## NOTE(review): the frame displayed for year_avg_finer pairs a single latitude
## with several different longitudes, i.e. passing two plain lists evaluates every
## latitude x longitude combination rather than only the paired (lon, lat)
## boundary points — confirm this is the intended sampling.

year_avg_finer= year_avg.interp(longitude=lon_list, latitude=lat_list)
year_avg_finer

In [109]:
## Show the interpolated values as a table indexed by (valid_time, latitude, longitude).
year_avg_finer.to_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d2m,t2m,bcaod550,chnk,duaod550,lai_hv,lai_lv,msl,omaod550,pm2p5,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
valid_time,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2010-12-31,33.913044,-85.388717,282.366088,289.507646,0.006246,0.018,0.007574,3.500011,2.744643,101710.212300,0.078224,1.694797e-08,...,0.000002,1.390953e-07,0.000003,5.244997e-07,0.000013,1.949974e-07,0.000013,6.903684e-07,0.000009,0.000004
2010-12-31,33.913044,-85.380885,282.365056,289.505433,0.006248,0.018,0.007570,3.504259,2.742813,101710.142255,0.078250,1.695802e-08,...,0.000002,1.392644e-07,0.000003,5.246019e-07,0.000013,1.949734e-07,0.000013,6.902337e-07,0.000009,0.000004
2010-12-31,33.913044,-85.379455,282.364867,289.505029,0.006249,0.018,0.007570,3.505034,2.742479,101710.129468,0.078255,1.695986e-08,...,0.000002,1.392953e-07,0.000003,5.246206e-07,0.000013,1.949690e-07,0.000013,6.902091e-07,0.000009,0.000004
2010-12-31,33.913044,-85.377426,282.364600,289.504456,0.006250,0.018,0.007569,3.506134,2.742005,101710.111320,0.078262,1.696246e-08,...,0.000002,1.393391e-07,0.000003,5.246471e-07,0.000013,1.949628e-07,0.000013,6.901742e-07,0.000009,0.000004
2010-12-31,33.913044,-85.376403,282.364465,289.504167,0.006250,0.018,0.007568,3.506689,2.741766,101710.102168,0.078266,1.696378e-08,...,0.000002,1.393612e-07,0.000003,5.246604e-07,0.000013,1.949596e-07,0.000013,6.901567e-07,0.000009,0.000004
2010-12-31,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-31,33.739245,-85.364595,282.422081,289.666950,0.006275,0.018,0.007671,3.591714,2.729816,101708.716148,0.078084,1.726047e-08,...,0.000002,1.407828e-07,0.000003,5.258568e-07,0.000013,1.973997e-07,0.000013,7.134955e-07,0.000009,0.000004
2010-12-31,33.739245,-85.361844,282.422150,289.666080,0.006276,0.018,0.007669,3.593279,2.729188,101708.688509,0.078094,1.726376e-08,...,0.000002,1.408457e-07,0.000003,5.258913e-07,0.000013,1.973916e-07,0.000013,7.134549e-07,0.000009,0.000004
2010-12-31,33.739245,-85.360491,282.422183,289.665653,0.006276,0.018,0.007669,3.594049,2.728880,101708.674916,0.078099,1.726538e-08,...,0.000002,1.408766e-07,0.000003,5.259083e-07,0.000013,1.973876e-07,0.000013,7.134349e-07,0.000009,0.000004
2010-12-31,33.739245,-85.357402,282.422260,289.664677,0.006277,0.018,0.007667,3.595807,2.728175,101708.643883,0.078111,1.726907e-08,...,0.000002,1.409472e-07,0.000003,5.259470e-07,0.000013,1.973785e-07,0.000013,7.133893e-07,0.000009,0.000004


In [110]:
## Group by valid_time and average over latitude and longitude, leaving one
## row per time step. There is only one valid_time here, so the table has a single row.
## This step is needed before merging with the shapefile.

summary = year_avg_finer.groupby("valid_time").mean(["latitude", "longitude"])
summary.to_dataframe()

Unnamed: 0_level_0,d2m,t2m,bcaod550,chnk,duaod550,lai_hv,lai_lv,msl,omaod550,pm2p5,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-31,282.392891,289.594444,0.006264,0.018,0.007621,3.554148,2.734958,101709.350493,0.078175,1.714163e-08,...,2e-06,1.401843e-07,3e-06,5.253599e-07,1.3e-05,1.962789e-07,1.3e-05,7.027602e-07,9e-06,4e-06


In [111]:
## Keep the spatially averaged values as a DataFrame (one row per valid_time).
county_df = summary.to_dataframe()
county_df

Unnamed: 0_level_0,d2m,t2m,bcaod550,chnk,duaod550,lai_hv,lai_lv,msl,omaod550,pm2p5,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
valid_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-31,282.392891,289.594444,0.006264,0.018,0.007621,3.554148,2.734958,101709.350493,0.078175,1.714163e-08,...,2e-06,1.401843e-07,3e-06,5.253599e-07,1.3e-05,1.962789e-07,1.3e-05,7.027602e-07,9e-06,4e-06


In [71]:
## Scratch example: a small two-level MultiIndex frame used to try out .loc selection.
## The column dict is deliberately NOT called `data`, so that it does not overwrite
## the xarray dataset bound to `data` at the top of the notebook.
index = pd.MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)])
demo_columns = {'values': [10, 20, 30, 40], 'next_val': [1, 2, 3, 4]}
df = pd.DataFrame(demo_columns, index=index)

In [72]:
## Display the scratch MultiIndex frame.
df

Unnamed: 0,Unnamed: 1,values,next_val
0,1,10,1
0,2,20,2
1,1,30,3
1,2,40,4


In [74]:
## Select the 'values' column for every row whose outer index level equals 0;
## the result is a Series indexed by the remaining inner level.
value = df.loc[0, "values"]
value

1    10
2    20
Name: values, dtype: int64

In [75]:
## Inspect the type returned by the partial MultiIndex selection.
type(value)

pandas.core.series.Series

In [76]:
## Convert the selected Series to a plain Python list.
my_list = value.tolist()
my_list

[10, 20]

In [77]:
## Wrap the list back into a one-column DataFrame whose column is named 'lat'.
dfs = pd.DataFrame({"lat": my_list})
dfs

Unnamed: 0,lat
0,10
1,20
