## Purpose: 

To get 2016 NLCD Land Cover data for the urban counties

#### Import

In [179]:
import rasterstats as rs
import geopandas as gpd
import rasterio as rio

import pandas as pd

import glob
import os

#### Load Urban County Shapefile, NLCD Land Cover GeoTIFF

<img src="https://i.imgur.com/sN5uH08.png">

In [61]:
# Inputs: the urban county polygons (.gpkg) and the NLCD land cover raster.
path_to_urban_counties = '../shapefile/urban_counties_wnv_4326.gpkg'
path_to_nlcd_land_cover = './NLCD.geotiff'

# Stop right away with a readable message if either input file is missing.
if not os.path.exists(path_to_urban_counties):
    raise AssertionError('Failed to find urban county shapefile')
if not os.path.exists(path_to_nlcd_land_cover):
    raise AssertionError('Failed to find land cover image')

#### Zonal Stats: all_touched=True

In [64]:
# Per-county pixel counts for every land cover value (categorical=True),
# requesting rasterstats' all_touched rasterization strategy.
stats = rs.zonal_stats(
    path_to_urban_counties,
    path_to_nlcd_land_cover,
    categorical=True,
    all_touched=True,
)

In [75]:
# One row per county, one column per land cover pixel value found by zonal_stats.
frame = pd.DataFrame(stats)

In [77]:
# Attach county identifiers to the all_touched=True pixel counts.
urban_counties = gpd.read_file(path_to_urban_counties)

# The join below is positional (both sides use a default 0..n-1 index), so make
# sure zonal_stats produced exactly one result per county before relying on it.
assert len(stats) == len(urban_counties), 'zonal_stats result count does not match county count'

# Build from `stats` rather than from the existing `frame` so this cell can be
# re-run safely: joining onto an already-joined `frame` would fail.
frame = (
    pd.DataFrame.from_dict(stats)
    .join(urban_counties[['GEOID', 'NAME']])
    .set_index('GEOID', drop=True)
)

In [80]:
# Display the all_touched=True pixel counts, indexed by county GEOID.
frame

Unnamed: 0_level_0,11,21,22,23,24,31,41,42,43,52,71,81,82,90,95,0,12,NAME
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
31109,30742.0,93549,117726,61618,27818,906,104044.0,294.0,2651.0,876.0,485357,43609.0,1219635.0,6684.0,9730.0,,,Lancaster
46099,31110.0,127943,57238,41687,18492,4529,28025.0,13.0,1438.0,57.0,63475,257074.0,1544300.0,1187.0,41440.0,,,Minnehaha
39003,10949.0,91319,61203,20170,12316,508,84701.0,671.0,774.0,104.0,7452,33051.0,731914.0,3062.0,2628.0,,,Allen
42007,22639.0,124920,69074,33639,18794,2561,601300.0,2387.0,80424.0,3711.0,8500,151124.0,31616.0,4025.0,1191.0,,,Beaver
37037,71936.0,98476,20782,6929,2125,2108,459118.0,328956.0,310211.0,43254.0,69626,266940.0,21959.0,22121.0,1562.0,,,Chatham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42129,27066.0,202212,166130,77265,22318,8819,1186749.0,5645.0,431155.0,7391.0,20274,303965.0,219900.0,3985.0,1113.0,,,Westmoreland
12015,358979.0,176011,109534,36098,7742,3331,4141.0,36196.0,17142.0,73797.0,10036,278643.0,61691.0,523675.0,207939.0,304.0,,Charlotte
44003,48082.0,36260,43297,56206,17368,3657,104502.0,37624.0,90957.0,1891.0,4889,5641.0,622.0,45132.0,2075.0,,,Kent
28089,64148.0,92764,49725,25501,6002,607,121627.0,264340.0,257355.0,33615.0,39291,419411.0,116880.0,233006.0,19787.0,,,Madison


In [81]:
# Persist the all_touched=True pixel counts. Create the output folder first so
# the cell also works when the directory does not exist yet.
os.makedirs('./urban_county_data', exist_ok=True)
frame.to_pickle('./urban_county_data/NLCD_Urban_Counties_all_touched.pkl')

#### Zonal Stats: all_touched = False

In [82]:
# Same per-county categorical pixel count, but with all_touched left at the
# rasterstats default (this section treats that as all_touched=False).
new_stats = rs.zonal_stats(
    path_to_urban_counties,
    path_to_nlcd_land_cover,
    categorical=True,
)



In [83]:
# One row per county, one column per land cover pixel value found by zonal_stats.
new_frame = pd.DataFrame(new_stats)

In [84]:
# Attach county identifiers to the default-rasterization pixel counts.
urban_counties = gpd.read_file(path_to_urban_counties)

# The join below is positional (both sides use a default 0..n-1 index), so make
# sure zonal_stats produced exactly one result per county before relying on it.
assert len(new_stats) == len(urban_counties), 'zonal_stats result count does not match county count'

# Build from `new_stats` rather than from the existing `new_frame` so this cell
# can be re-run safely: joining onto an already-joined frame would fail.
new_frame = (
    pd.DataFrame.from_dict(new_stats)
    .join(urban_counties[['GEOID', 'NAME']])
    .set_index('GEOID', drop=True)
)

In [85]:
# Display the default-rasterization pixel counts, indexed by county GEOID.
new_frame

Unnamed: 0_level_0,11,21,22,23,24,31,41,42,43,52,71,81,82,90,95,0,12,NAME
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
31109,30741.0,91755,117482,61610,27818,906,104008.0,294.0,2651.0,876.0,485017,43572.0,1219200.0,6684.0,9730.0,,,Lancaster
46099,31020.0,126950,57173,41663,18488,4529,28013.0,13.0,1438.0,57.0,63431,256971.0,1543701.0,1187.0,41419.0,,,Minnehaha
39003,10949.0,90418,61040,20142,12303,508,84500.0,671.0,771.0,104.0,7446,33025.0,730799.0,3061.0,2628.0,,,Allen
42007,22610.0,124676,68937,33577,18787,2561,600264.0,2386.0,80197.0,3706.0,8497,150870.0,31527.0,4006.0,1184.0,,,Beaver
37037,70783.0,98291,20740,6897,2122,2108,458254.0,328402.0,309711.0,43196.0,69511,266550.0,21929.0,21995.0,1549.0,,,Chatham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42129,24008.0,201693,165791,77109,22291,8791,1184674.0,5605.0,430823.0,7391.0,20227,303776.0,219859.0,3904.0,1101.0,,,Westmoreland
12015,358244.0,175636,109321,36054,7739,3318,4135.0,36083.0,17112.0,73604.0,9993,277908.0,61573.0,522958.0,207821.0,44.0,,Charlotte
44003,47566.0,36110,43185,56136,17346,3656,104023.0,37528.0,90604.0,1871.0,4884,5628.0,621.0,44838.0,2060.0,,,Kent
28089,62482.0,92559,49657,25486,5998,571,121387.0,264103.0,256998.0,33571.0,39209,419154.0,116859.0,231656.0,19283.0,,,Madison


In [86]:
# Persist the default-rasterization pixel counts. Create the output folder
# first so the cell also works when the directory does not exist yet.
os.makedirs('./urban_county_data', exist_ok=True)
new_frame.to_pickle('./urban_county_data/NLCD_Urban_Counties.pkl')

### Convert Pixel Values to Land Cover Categories (ex. 11 --> Open Water)

In [97]:
# Land cover legend: raster pixel value -> snake_case class name.
# Pixel value 0 is not listed, so a `0` column keeps its numeric label when
# this mapping is used to rename columns.
key = dict([
    (11, 'open_water'),
    (12, 'perennial_ice'),

    (21, 'developed_open'),
    (22, 'developed_low'),
    (23, 'developed_medium'),
    (24, 'developed_high'),

    (31, 'barren_land'),
    (41, 'deciduous_forest'),
    (42, 'evergreen_forest'),
    (43, 'mixed_forest'),

    (51, 'dwarf_scrub'),  # alaska only
    (52, 'shrub'),
    (71, 'grassland'),

    (72, 'sedge'),    # alaska only
    (73, 'lichens'),  # alaska only
    (74, 'moss'),     # alaska only

    (81, 'pasture_hay'),
    (82, 'cultivated_crops'),
    (90, 'woody_wetlands'),
    (95, 'emergent_herbaceous_wetlands'),
])

In [114]:
# Reload the all_touched=True counts, swap the numeric column labels for land
# cover names, save the renamed table, and display it.
all_touched = (
    pd.read_pickle('./urban_county_data/NLCD_Urban_Counties_all_touched.pkl')
    .rename(columns=key)
)
all_touched.to_pickle('./urban_county_data/NLCD_Urban_Counties_all_touched_keyed.pkl')
all_touched

Unnamed: 0_level_0,open_water,developed_open,developed_low,developed_medium,developed_high,barren_land,deciduous_forest,evergreen_forest,mixed_forest,shrub,grassland,pasture_hay,cultivated_crops,woody_wetlands,emergent_herbaceous_wetlands,0,perennial_ice,NAME
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
31109,30742.0,93549,117726,61618,27818,906,104044.0,294.0,2651.0,876.0,485357,43609.0,1219635.0,6684.0,9730.0,,,Lancaster
46099,31110.0,127943,57238,41687,18492,4529,28025.0,13.0,1438.0,57.0,63475,257074.0,1544300.0,1187.0,41440.0,,,Minnehaha
39003,10949.0,91319,61203,20170,12316,508,84701.0,671.0,774.0,104.0,7452,33051.0,731914.0,3062.0,2628.0,,,Allen
42007,22639.0,124920,69074,33639,18794,2561,601300.0,2387.0,80424.0,3711.0,8500,151124.0,31616.0,4025.0,1191.0,,,Beaver
37037,71936.0,98476,20782,6929,2125,2108,459118.0,328956.0,310211.0,43254.0,69626,266940.0,21959.0,22121.0,1562.0,,,Chatham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42129,27066.0,202212,166130,77265,22318,8819,1186749.0,5645.0,431155.0,7391.0,20274,303965.0,219900.0,3985.0,1113.0,,,Westmoreland
12015,358979.0,176011,109534,36098,7742,3331,4141.0,36196.0,17142.0,73797.0,10036,278643.0,61691.0,523675.0,207939.0,304.0,,Charlotte
44003,48082.0,36260,43297,56206,17368,3657,104502.0,37624.0,90957.0,1891.0,4889,5641.0,622.0,45132.0,2075.0,,,Kent
28089,64148.0,92764,49725,25501,6002,607,121627.0,264340.0,257355.0,33615.0,39291,419411.0,116880.0,233006.0,19787.0,,,Madison


In [115]:
# Reload the default-rasterization counts, swap the numeric column labels for
# land cover names, save the renamed table, and display it.
frame = (
    pd.read_pickle('./urban_county_data/NLCD_Urban_Counties.pkl')
    .rename(columns=key)
)
frame.to_pickle('./urban_county_data/NLCD_Urban_Counties_keyed.pkl')
frame

Unnamed: 0_level_0,open_water,developed_open,developed_low,developed_medium,developed_high,barren_land,deciduous_forest,evergreen_forest,mixed_forest,shrub,grassland,pasture_hay,cultivated_crops,woody_wetlands,emergent_herbaceous_wetlands,0,perennial_ice,NAME
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
31109,30741.0,91755,117482,61610,27818,906,104008.0,294.0,2651.0,876.0,485017,43572.0,1219200.0,6684.0,9730.0,,,Lancaster
46099,31020.0,126950,57173,41663,18488,4529,28013.0,13.0,1438.0,57.0,63431,256971.0,1543701.0,1187.0,41419.0,,,Minnehaha
39003,10949.0,90418,61040,20142,12303,508,84500.0,671.0,771.0,104.0,7446,33025.0,730799.0,3061.0,2628.0,,,Allen
42007,22610.0,124676,68937,33577,18787,2561,600264.0,2386.0,80197.0,3706.0,8497,150870.0,31527.0,4006.0,1184.0,,,Beaver
37037,70783.0,98291,20740,6897,2122,2108,458254.0,328402.0,309711.0,43196.0,69511,266550.0,21929.0,21995.0,1549.0,,,Chatham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42129,24008.0,201693,165791,77109,22291,8791,1184674.0,5605.0,430823.0,7391.0,20227,303776.0,219859.0,3904.0,1101.0,,,Westmoreland
12015,358244.0,175636,109321,36054,7739,3318,4135.0,36083.0,17112.0,73604.0,9993,277908.0,61573.0,522958.0,207821.0,44.0,,Charlotte
44003,47566.0,36110,43185,56136,17346,3656,104023.0,37528.0,90604.0,1871.0,4884,5628.0,621.0,44838.0,2060.0,,,Kent
28089,62482.0,92559,49657,25486,5998,571,121387.0,264103.0,256998.0,33571.0,39209,419154.0,116859.0,231656.0,19283.0,,,Madison


### Convert Pixel Counts to Percentages

The following function takes one row of the dataframe (one county, identified by its GEOID) and converts its pixel counts to percentages of that row's total.

In [154]:
def convert_to_percent(row):
    """Convert one row of pixel counts into percentages of the row total.

    Parameters
    ----------
    row : pandas.Series
        Pixel counts for a single county, one entry per land cover column.
        Entries may be NaN for classes that do not occur in the county.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``row`` where every value is
        ``count / row.sum() * 100``. NaN entries remain NaN. Every column
        present in ``row`` contributes to the total.

    Notes
    -----
    The input is left untouched. Mutating the object that
    ``DataFrame.apply`` hands to the function is not supported by pandas, and
    assigning floats element by element into an integer Series is
    dtype-fragile, so the result is computed with one vectorized expression.
    """
    total = row.sum()  # Series.sum ignores NaN by default
    return row / total * 100

In [158]:
# Percentages for the all_touched=True counts.
all_touched = pd.read_pickle('./urban_county_data/NLCD_Urban_Counties_all_touched_keyed.pkl')

# Drop the trailing NAME column, convert each county row to percentages,
# then put NAME back.
data = (
    all_touched.iloc[:, :-1]
    .apply(convert_to_percent, axis='columns')
    .join(all_touched['NAME'])
)

data.to_pickle('./urban_county_data/NLCD_Urban_Counties_all_touched_keyed_percentage.pkl')
data

Unnamed: 0_level_0,open_water,developed_open,developed_low,developed_medium,developed_high,barren_land,deciduous_forest,evergreen_forest,mixed_forest,shrub,grassland,pasture_hay,cultivated_crops,woody_wetlands,emergent_herbaceous_wetlands,0,perennial_ice,NAME
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
31109,1.394044,4.242125,5.338469,2.794164,1.261451,0.041084,4.718037,0.013332,0.120214,0.039724,22.009270,1.977518,55.306250,0.303096,0.441222,,,Lancaster
46099,1.402610,5.768374,2.580604,1.879479,0.833721,0.204192,1.263521,0.000586,0.064833,0.002570,2.861802,11.590310,69.625538,0.053516,1.868343,,,Minnehaha
39003,1.032124,8.608324,5.769394,1.901356,1.160986,0.047887,7.984469,0.063253,0.072962,0.009804,0.702474,3.115603,68.994987,0.288644,0.247732,,,Allen
42007,1.958552,10.807117,5.975751,2.910187,1.625912,0.221558,52.019846,0.206505,6.957665,0.321047,0.735355,13.074085,2.735173,0.348212,0.103036,,,Beaver
37037,4.167538,5.705106,1.203984,0.401424,0.123110,0.122125,26.598529,19.057727,17.971755,2.505876,4.033711,15.464894,1.272172,1.281557,0.090493,,,Chatham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42129,1.008425,7.534016,6.189672,2.878740,0.831524,0.328578,44.215900,0.210321,16.063975,0.275374,0.755369,11.325129,8.193035,0.148473,0.041468,,,Westmoreland
12015,18.841480,9.238167,5.749035,1.894651,0.406349,0.174832,0.217346,1.899794,0.899720,3.873332,0.526753,14.624941,3.237932,27.485764,10.913949,0.015956,,Charlotte
44003,9.651086,7.278158,8.690634,11.281747,3.486129,0.734038,20.975787,7.551942,18.257016,0.379564,0.981327,1.132269,0.124849,9.058958,0.416497,,,Kent
28089,3.678087,5.318857,2.851108,1.462164,0.344140,0.034804,6.973789,15.156597,14.756095,1.927400,2.252848,24.047982,6.701608,13.359984,1.134537,,,Madison


In [159]:
# Percentages for the default-rasterization counts.
frame = pd.read_pickle('./urban_county_data/NLCD_Urban_Counties_keyed.pkl')

# Drop the trailing NAME column, convert each county row to percentages,
# then put NAME back.
data = (
    frame.iloc[:, :-1]
    .apply(convert_to_percent, axis='columns')
    .join(frame['NAME'])
)

data.to_pickle('./urban_county_data/NLCD_Urban_Counties_keyed_percentage.pkl')
data

Unnamed: 0_level_0,open_water,developed_open,developed_low,developed_medium,developed_high,barren_land,deciduous_forest,evergreen_forest,mixed_forest,shrub,grassland,pasture_hay,cultivated_crops,woody_wetlands,emergent_herbaceous_wetlands,0,perennial_ice,NAME
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
31109,1.395831,4.166243,5.334407,2.797474,1.263109,0.041138,4.722605,0.013349,0.120372,0.039776,22.022763,1.978438,55.359199,0.303495,0.441802,,,Lancaster
46099,1.399786,5.728654,2.579947,1.880054,0.834276,0.204372,1.264094,0.000587,0.064890,0.002572,2.862341,11.595887,69.659931,0.053564,1.869044,,,Minnehaha
39003,1.034520,8.543177,5.767386,1.903124,1.162453,0.047999,7.984013,0.063400,0.072848,0.009826,0.703538,3.120379,69.049808,0.289220,0.248308,,,Allen
42007,1.959637,10.805826,5.974857,2.910161,1.628293,0.221965,52.025637,0.206798,6.950775,0.321204,0.736446,13.076093,2.732485,0.347205,0.102619,,,Beaver
37037,4.110420,5.707830,1.204387,0.400514,0.123226,0.122413,26.611143,19.070543,17.985143,2.508423,4.036554,15.478753,1.273433,1.277266,0.089952,,,Chatham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42129,0.896810,7.534171,6.193065,2.880380,0.832672,0.328385,44.253081,0.209373,16.093242,0.276088,0.755572,11.347446,8.212756,0.145833,0.041127,,,Westmoreland
12015,18.839648,9.236499,5.749068,1.896039,0.406985,0.174490,0.217455,1.897564,0.899901,3.870751,0.525521,14.614868,3.238055,27.501771,10.929072,0.002314,,Charlotte
44003,9.588837,7.279420,8.705670,11.316464,3.496783,0.737014,20.970011,7.565275,18.264873,0.377175,0.984566,1.134549,0.125187,9.038899,0.415276,,,Kent
28089,3.593040,5.322624,2.855536,1.465578,0.344916,0.032835,6.980384,15.187297,14.778723,1.930507,2.254722,24.103537,6.720001,13.321426,1.108873,,,Madison


## Data Validation

### Compare all_touched=True to all_touched=False

We need to compare the two rasterization methods to decide whether it is worth keeping both data-sets or whether one of them can be dropped.

#### Mean Difference

In [168]:
# Load both percentage tables without their trailing NAME column; keep NAME
# from the default-rasterization table separately.
frame = pd.read_pickle('./urban_county_data/NLCD_Urban_Counties_keyed_percentage.pkl').iloc[:, :-1]
name = pd.read_pickle('./urban_county_data/NLCD_Urban_Counties_keyed_percentage.pkl').iloc[:, -1]
all_touched = pd.read_pickle('./urban_county_data/NLCD_Urban_Counties_all_touched_keyed_percentage.pkl').iloc[:, :-1]

# Per-county, per-class difference in percentage points (default minus all_touched).
# NOTE: a cell that is NaN in either table stays NaN in the difference, and
# mean/max/min skip NaN by default, so those cells are left out of the summaries.
diff = frame.sub(all_touched)
diff.mean()

open_water                     -0.049410
developed_open                 -0.001718
developed_low                   0.006314
developed_medium                0.007438
developed_high                  0.005430
barren_land                    -0.000169
deciduous_forest                0.003555
evergreen_forest                0.006178
mixed_forest                    0.003779
shrub                           0.001569
grassland                       0.002272
pasture_hay                     0.011245
cultivated_crops                0.018905
woody_wetlands                 -0.010483
emergent_herbaceous_wetlands    0.000013
0                              -0.036400
perennial_ice                  -0.000423
dtype: float64

#### Max Difference

In [169]:
# Largest (default minus all_touched) percentage difference for each land cover column.
diff.max()

open_water                      0.161638
developed_open                  0.514656
developed_low                   0.663477
developed_medium                0.680536
developed_high                  0.291033
barren_land                     0.044649
deciduous_forest                0.258764
evergreen_forest                0.144565
mixed_forest                    0.224322
shrub                           0.047364
grassland                       0.100768
pasture_hay                     0.223514
cultivated_crops                0.270083
woody_wetlands                  0.087645
emergent_herbaceous_wetlands    0.073431
0                              -0.001578
perennial_ice                   0.001849
dtype: float64

#### Min Difference

In [170]:
# Smallest (most negative) default-minus-all_touched percentage difference for each land cover column.
diff.min()

open_water                     -0.854152
developed_open                 -0.273160
developed_low                  -0.333846
developed_medium               -0.241097
developed_high                 -0.225026
barren_land                    -0.029819
deciduous_forest               -0.816647
evergreen_forest               -0.044384
mixed_forest                   -0.232131
shrub                          -0.076921
grassland                      -0.247348
pasture_hay                    -0.448139
cultivated_crops               -0.190795
woody_wetlands                 -0.612995
emergent_herbaceous_wetlands   -0.098358
0                              -0.215276
perennial_ice                  -0.002957
dtype: float64

### Check that percentages sum to 100

#### All Touched Data

In [176]:
# Row totals of the all_touched percentage table (NAME column dropped).
all_touched = pd.read_pickle('./urban_county_data/NLCD_Urban_Counties_all_touched_keyed_percentage.pkl').iloc[:,:-1]
total = all_touched.sum(axis='columns')

# A mean of 100 could hide rows that over- and under-shoot and cancel out,
# so verify every county individually (tolerance covers float rounding).
assert (total - 100).abs().max() < 1e-6, 'all_touched percentages do not sum to 100 for every county'
total.mean()

100.0

#### Regular Data

In [177]:
# Row totals of the default-rasterization percentage table (NAME column dropped).
frame = pd.read_pickle('./urban_county_data/NLCD_Urban_Counties_keyed_percentage.pkl').iloc[:,:-1]
total = frame.sum(axis='columns')

# A mean of 100 could hide rows that over- and under-shoot and cancel out,
# so verify every county individually (tolerance covers float rounding).
assert (total - 100).abs().max() < 1e-6, 'percentages do not sum to 100 for every county'
total.mean()

100.0

## Convert Data to CSV

In [208]:
# Export every pickled table in ./urban_county_data as a CSV file in a
# csv_files sub-folder, with NAME as the first data column and NaN replaced by 0.
files = glob.glob('./urban_county_data/*.pkl')
for file in files:
    frame = pd.read_pickle(file)

    # Move NAME to the front by name instead of assuming it is the last column.
    cols = ['NAME'] + [col for col in frame.columns if col != 'NAME']
    frame = frame[cols]

    # Land cover classes that do not occur in a county are stored as NaN.
    frame = frame.fillna(value=0)

    # os.path handles whatever separator glob returns on this platform;
    # slicing on '/' would mis-split paths that contain backslashes.
    name = os.path.splitext(os.path.basename(file))[0]
    csv_dir = os.path.join(os.path.dirname(file), 'csv_files')
    os.makedirs(csv_dir, exist_ok=True)  # to_csv does not create missing folders

    print(name)
    frame.to_csv(os.path.join(csv_dir, name + '.csv'))

NLCD_Urban_Counties_keyed
NLCD_Urban_Counties_all_touched_keyed_percentage
NLCD_Urban_Counties
NLCD_Urban_Counties_all_touched_keyed
NLCD_Urban_Counties_keyed_percentage
NLCD_Urban_Counties_all_touched


## Conclusion

We have acquired county-level land cover data using two different rasterization strategies: all_touched=False, and all_touched=True. The methods are shown respectively in the figure below.

<br>
<figure>
    <img src="https://github.com/perrygeo/python-raster-stats/raw/master/docs/img/rasterization.png">
    <figcaption>All_touched = False on the left, All_touched = True on the right</figcaption>
</figure>
<br><br>


The all_touched option is typically used for rasterization when the raster pixel size is close to the shapefile's polygon size, but the land cover pixels have 30m resolution, which is very small compared to the county sizes.

Therefore, I suggest using the all_touched=False data-set.

1. NLCD_Urban_Counties: This contains the raw data created by zonal_stats
2. NLCD_Urban_Counties_keyed: Raw data but the columns are converted based on the Land Cover Key (ex. 11 --> Open Water)
3. **NLCD_Urban_Counties_keyed_percentage: Keyed data but the pixel counts have been converted to percentages**

The last one is probably the one we want to use since it is easy to interpret.