## Notebook to combine all the data: CAMS, ERA5, engineered features, and others

In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import geopandas as gpd
import xarray as xr

## Before we merge, let's find the counties whose shapefile does not exist, and the counties whose shapefile exists but whose life expectancy data does not


In [2]:
## Read the 2010 county shapefile into a GeoDataFrame

SHAPE_PATH = os.path.join('../Shapefiles','county_shapefiles','2010_county_shapefile','gz_2010_us_050_00_500k.shp')
county_gdf = gpd.read_file(SHAPE_PATH)

## Build the `fips` key by joining the STATE and COUNTY codes as strings
## (string concatenation, not a numerical sum), then cast the result to int.
## The code is needed as an identifier because several counties can share
## an identical name.
fips_codes = county_gdf['STATE'] + county_gdf['COUNTY']
county_gdf['fips'] = fips_codes.astype(str).astype(int)

## Drop the GEO_ID, CENSUSAREA, COUNTY and LSAD columns
county_gdf = county_gdf.drop(columns=['GEO_ID', 'CENSUSAREA', 'COUNTY', 'LSAD'])
county_gdf

Unnamed: 0,STATE,NAME,geometry,fips
0,01,Cleburne,"POLYGON ((-85.38872 33.91304, -85.38088 33.873...",1029
1,01,Coffee,"POLYGON ((-86.03044 31.61894, -86.00408 31.619...",1031
2,01,Coosa,"POLYGON ((-86.00928 33.10164, -86.00917 33.090...",1037
3,01,Covington,"POLYGON ((-86.34851 30.99434, -86.35023 30.994...",1039
4,01,Crenshaw,"POLYGON ((-86.14699 31.68045, -86.14711 31.663...",1041
...,...,...,...,...
3216,72,San Sebastián,"POLYGON ((-66.90748 18.25314, -66.90739 18.253...",72131
3217,72,Santa Isabel,"POLYGON ((-66.37968 17.94398, -66.38029 17.943...",72133
3218,72,Toa Baja,"MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ...",72137
3219,72,Trujillo Alto,"POLYGON ((-66.02917 18.37590, -66.02828 18.376...",72139


In [3]:
## Keep only the US states; drop territories such as Guam and Puerto Rico.
## NOTE(review): the cutoff assumes every territory has a state FIPS code >= 60 — confirm.

county_gdf['STATE'] = county_gdf['STATE'].astype(str).astype(int)  ## convert state dtype to int
## .copy() makes the filtered frame independent of the unfiltered one, so that
## later column assignments (including re-running this cell) do not raise
## SettingWithCopyWarning; the other filtering cells in this notebook also copy.
county_gdf = county_gdf.loc[county_gdf['STATE'] < 60].copy()
county_gdf

Unnamed: 0,STATE,NAME,geometry,fips
0,1,Cleburne,"POLYGON ((-85.38872 33.91304, -85.38088 33.873...",1029
1,1,Coffee,"POLYGON ((-86.03044 31.61894, -86.00408 31.619...",1031
2,1,Coosa,"POLYGON ((-86.00928 33.10164, -86.00917 33.090...",1037
3,1,Covington,"POLYGON ((-86.34851 30.99434, -86.35023 30.994...",1039
4,1,Crenshaw,"POLYGON ((-86.14699 31.68045, -86.14711 31.663...",1041
...,...,...,...,...
3138,56,Niobrara,"POLYGON ((-104.05298 42.85955, -104.05286 42.7...",56027
3139,56,Platte,"POLYGON ((-104.77417 42.60996, -104.76422 42.6...",56031
3140,56,Sweetwater,"POLYGON ((-109.05008 41.00066, -109.17368 41.0...",56037
3141,56,Washakie,"POLYGON ((-107.12892 43.99455, -107.12797 43.9...",56043


In [4]:
## Distinct STATE codes present in county_gdf, in order of first appearance
state_col = county_gdf['STATE']
column_values = state_col.unique()
column_values

array([ 1,  2,  4,  5,  6,  8,  9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
       38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56])

In [5]:
## Load the 2010 life expectancy table from CSV

PATH = os.path.join('../life_expectancy_data','LE_2010.csv')
LE_DATA = pd.read_csv(filepath_or_buffer=PATH)
LE_DATA

Unnamed: 0,location_name,fips,year,MeanLifeExpectency
0,Autauga County (Alabama),1001,2010,75.728489
1,Baldwin County (Alabama),1003,2010,77.826608
2,Barbour County (Alabama),1005,2010,75.841973
3,Bibb County (Alabama),1007,2010,73.705432
4,Blount County (Alabama),1009,2010,75.753407
...,...,...,...,...
3122,Sweetwater County (Wyoming),56037,2010,77.582422
3123,Teton County (Wyoming),56039,2010,83.771012
3124,Uinta County (Wyoming),56041,2010,77.836526
3125,Washakie County (Wyoming),56043,2010,78.674180


In [6]:
## Rows of the life expectancy table whose fips code has no match in the county shapefile

has_shape = LE_DATA['fips'].isin(county_gdf['fips'])
county_noshape = LE_DATA.loc[~has_shape].copy()
county_noshape

Unnamed: 0,location_name,fips,year,MeanLifeExpectency
80,Chugach Census Area (Alaska),2063,2010,77.733009
81,Copper River Census Area (Alaska),2066,2010,77.733009
83,Kusilvak Census Area (Alaska),2158,2010,69.723907
85,Kobuk Census Area (Alaska),2140,2010,73.339132
87,Aleutian Islands Census Area (Alaska),2010,2010,82.631153
96,Skagway-Yakutat-Angoon Census Area (Alaska),2231,2010,80.20283
97,Skagway-Hoonah-Angoon Census Area (Alaska),2232,2010,80.20283
102,Prince of Wales-Outer Ketchikan Census Area (A...,2201,2010,78.06861
104,Wrangell-Petersburg Census Area (Alaska),2280,2010,78.06861
338,Dade County (Florida),12025,2010,80.576828


## Note that, among the 3127 total counties, 38 are in Alaska, 5 are in Hawaii and, as shown above, there are 6 counties for which the life expectancy data does not exist. In total, 3127 - 38 - 5 - 6 = 3078 counties.

In [7]:
## Counties present in the shapefile whose fips code has no row in the life expectancy table

lacks_le = ~county_gdf['fips'].isin(LE_DATA['fips'])
le_data_nodata = county_gdf.loc[lacks_le].copy()
le_data_nodata

Unnamed: 0,STATE,NAME,geometry,fips
244,8,Hinsdale,"POLYGON ((-107.21157 37.42296, -107.21787 37.4...",8053
250,8,Mineral,"POLYGON ((-106.71077 37.40423, -106.71077 37.3...",8079
290,8,San Juan,"POLYGON ((-107.92580 37.63979, -107.92613 37.6...",8111
585,16,Clark,"POLYGON ((-112.81416 44.37720, -112.81324 44.3...",16033
1625,30,Petroleum,"POLYGON ((-108.50506 46.74914, -108.63095 46.7...",30069
1640,30,Treasure,"POLYGON ((-107.05209 45.87700, -107.05203 45.8...",30103
1645,30,Golden Valley,"POLYGON ((-109.31590 46.75150, -109.26339 46.7...",30037
1666,31,Grant,"POLYGON ((-101.62042 41.74240, -101.63796 41.7...",31075
1667,31,Hooker,"POLYGON ((-101.41220 42.09211, -101.37545 42.0...",31091
1669,31,Logan,"POLYGON ((-100.71171 41.73976, -100.50060 41.7...",31113


## Let's start combining data

In [8]:
## Load the pickled table of per-county variables (one row per county, keyed by fips)
## NOTE(review): unpickling can execute arbitrary code — only load this file
## if it comes from a trusted source.

county_var = pd.read_pickle(filepath_or_buffer='CAMS_79_variables.pkl')
county_var

Unnamed: 0,NAME,fips,u10,v10,d2m,t2m,bcaod550,duaod550,lsm,msl,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
0,Cleburne,1029,0.434079,-0.122697,282.638367,289.807770,0.006281,0.007998,0.985105,101706.890625,...,0.000002,1.405128e-07,0.000003,5.258444e-07,0.000013,1.997616e-07,0.000013,7.354205e-07,0.000009,3.305535e-06
1,Coffee,1031,0.277523,-0.034499,284.563934,291.594208,0.006543,0.012055,0.992762,101687.679688,...,0.000004,1.444494e-07,0.000002,5.499585e-07,0.000013,2.590336e-07,0.000018,1.376015e-06,0.000008,9.460180e-07
2,Coosa,1037,0.262713,-0.041072,283.402100,290.892761,0.006274,0.010251,0.984186,101696.562500,...,0.000003,1.306877e-07,0.000003,5.350508e-07,0.000013,2.238712e-07,0.000015,9.479429e-07,0.000008,2.127009e-06
3,Covington,1039,0.219254,0.026002,284.972931,291.764221,0.006611,0.012473,0.938187,101689.382812,...,0.000004,1.465366e-07,0.000002,5.553931e-07,0.000013,2.631946e-07,0.000019,1.460832e-06,0.000008,9.821802e-07
4,Crenshaw,1041,0.260638,-0.015809,284.221710,291.456573,0.006468,0.011628,0.980219,101690.453125,...,0.000004,1.404297e-07,0.000002,5.461097e-07,0.000013,2.487252e-07,0.000017,1.250067e-06,0.000008,1.100702e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,Niobrara,56027,1.226940,0.222518,271.670319,282.334412,0.005382,0.007533,0.998026,101400.070312,...,0.000007,1.079335e-07,0.000002,4.633204e-07,0.000010,2.116289e-07,0.000013,8.331535e-07,0.000004,1.016333e-06
3139,Platte,56031,1.541398,0.317140,271.038757,281.676422,0.005196,0.006530,0.997104,101402.078125,...,0.000006,1.059757e-07,0.000002,4.455685e-07,0.000009,2.033576e-07,0.000013,7.784643e-07,0.000004,1.096191e-06
3140,Sweetwater,56037,1.430520,0.814952,269.132935,278.168732,0.004792,0.003537,0.992391,101611.039062,...,0.000003,9.444518e-08,0.000002,4.143257e-07,0.000008,2.047461e-07,0.000013,9.296629e-07,0.000003,9.021953e-07
3141,Washakie,56043,0.710449,0.532009,270.020416,278.829376,0.005376,0.003680,0.997272,101436.078125,...,0.000003,9.973952e-08,0.000002,4.707082e-07,0.000009,2.159900e-07,0.000013,9.542745e-07,0.000003,5.718468e-07


In [9]:
## Load the 2010 life expectancy table from CSV
## (this re-reads the same file that an earlier cell already loaded into LE_DATA)

PATH = os.path.join('../life_expectancy_data','LE_2010.csv')
LE_DATA = pd.read_csv(filepath_or_buffer=PATH)
LE_DATA

Unnamed: 0,location_name,fips,year,MeanLifeExpectency
0,Autauga County (Alabama),1001,2010,75.728489
1,Baldwin County (Alabama),1003,2010,77.826608
2,Barbour County (Alabama),1005,2010,75.841973
3,Bibb County (Alabama),1007,2010,73.705432
4,Blount County (Alabama),1009,2010,75.753407
...,...,...,...,...
3122,Sweetwater County (Wyoming),56037,2010,77.582422
3123,Teton County (Wyoming),56039,2010,83.771012
3124,Uinta County (Wyoming),56041,2010,77.836526
3125,Washakie County (Wyoming),56043,2010,78.674180


In [10]:
## Left-join the county variables onto the life expectancy table by fips,
## then drop every row that contains a missing value. Counties with no match
## in county_var have NaN in the variable columns after the left join, so
## they are removed here as well.

final_df = (
    LE_DATA
    .merge(county_var, how='left', on='fips')
    .dropna()
)
final_df

Unnamed: 0,location_name,fips,year,MeanLifeExpectency,NAME,u10,v10,d2m,t2m,bcaod550,...,aermssdul,aermssbchphil,aermssomhphil,aermssbchphob,aermssomhphob,aermsssss,aermssssm,aermssssl,aermsssu,aermssso2
0,Autauga County (Alabama),1001,2010,75.728489,Autauga,0.244891,-0.022881,283.480743,291.070984,0.006298,...,3.245817e-06,1.298927e-07,0.000003,5.384176e-07,0.000013,2.288023e-07,0.000016,9.983653e-07,0.000008,1.850165e-06
1,Baldwin County (Alabama),1003,2010,77.826608,Baldwin,0.065638,0.113299,286.096985,292.149780,0.006543,...,4.267411e-06,1.304158e-07,0.000002,5.643739e-07,0.000013,2.814684e-07,0.000020,1.889070e-06,0.000008,1.206832e-06
2,Barbour County (Alabama),1005,2010,75.841973,Barbour,0.336253,-0.075642,284.153351,291.171997,0.006456,...,3.332815e-06,1.467424e-07,0.000002,5.382911e-07,0.000013,2.391392e-07,0.000017,1.143327e-06,0.000007,1.193010e-06
3,Bibb County (Alabama),1007,2010,73.705432,Bibb,0.221962,-0.004751,283.564850,290.792206,0.006248,...,3.212181e-06,1.263458e-07,0.000003,5.366624e-07,0.000013,2.228762e-07,0.000015,9.271469e-07,0.000008,1.879509e-06
4,Blount County (Alabama),1009,2010,75.753407,Blount,0.284933,0.005598,282.847870,289.685669,0.006085,...,2.530443e-06,1.275312e-07,0.000003,5.178088e-07,0.000013,1.962538e-07,0.000013,6.851880e-07,0.000008,2.187212e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3122,Sweetwater County (Wyoming),56037,2010,77.582422,Sweetwater,1.430520,0.814952,269.132935,278.168732,0.004792,...,2.973532e-06,9.444518e-08,0.000002,4.143257e-07,0.000008,2.047461e-07,0.000013,9.296629e-07,0.000003,9.021953e-07
3123,Teton County (Wyoming),56039,2010,83.771012,Teton,1.021023,0.729751,268.077728,274.648254,0.004971,...,9.510231e-07,9.750062e-08,0.000002,4.297702e-07,0.000008,2.081166e-07,0.000013,1.032238e-06,0.000003,2.619064e-07
3124,Uinta County (Wyoming),56041,2010,77.836526,Uinta,1.101981,0.647358,270.266602,277.642670,0.005171,...,1.688695e-06,1.069743e-07,0.000002,4.422247e-07,0.000008,2.189009e-07,0.000014,1.091205e-06,0.000003,6.770198e-07
3125,Washakie County (Wyoming),56043,2010,78.674180,Washakie,0.710449,0.532009,270.020416,278.829376,0.005376,...,2.971480e-06,9.973952e-08,0.000002,4.707082e-07,0.000009,2.159900e-07,0.000013,9.542745e-07,0.000003,5.718468e-07
