## This notebook counts the number of counties in the USA for which the shapefile provides fewer than 100 lat/lon pairs

In [1]:
## conda environment Weather_Prediction

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
pd.set_option('display.max_columns', None)

In [2]:
%%time
for i in range (2008, 2020):
    year=str(i)
    
    PATH = os.path.join('..', 'Shapefiles', 'county_shapefiles', year + '_county_shapefile')
    for filename in os.listdir(PATH):
        if filename.endswith(".shp"):
            file_path = os.path.join(PATH, filename)

    county_gdf = gpd.read_file(file_path)

    if i == 2010:
        county_gdf.rename(columns={'STATE': 'STATEFP'}, inplace=True)  ## rename for the year 2010      
    
    else:
        pass
        
    county_gdf['STATEFP'] = county_gdf['STATEFP'].astype(str).astype(int) ## convert dtype of STATE from str to int   
    ## remove states such as guam, puerto rico
    county_gdf=county_gdf.loc[(county_gdf['STATEFP'] < 60)]
    county_gdf

    ## get rid of hawaii and alaska

    county_gdf=county_gdf.loc[(county_gdf['STATEFP'] !=2 ) & (county_gdf['STATEFP'] !=15 )]
    county_gdf=county_gdf.reset_index(drop=True) ## reset index

    geometry_column = county_gdf['geometry']
    lat_lon = geometry_column.get_coordinates(index_parts=True)

    ## In this step we find the smallest number of latitude (or longitude) coordinates that we can extract
    ## among all counties from the geometry attribute of the county shapefile

    total_num = []
    county_len = len(county_gdf) 
    for j in range(county_len): 
        longitude = lat_lon.loc[(j), 'x'] # get the longitude values. the number of longitudes will be the same as number of latitudes
        num = len(longitude)
        total_num.append(num)  

    ind = min(total_num) 
    index_val = total_num.index(ind)  ## get index of the county that has only 18 number of longitude

    row_index = county_gdf.iloc[index_val] ## get the row values with index index_val 

    num_county = [] ## find the number of longitudes in each of of the county

    for j in range(county_len): 
        longitude = lat_lon.loc[(j), 'x']
        num = len(longitude)
        num_county.append(num)

    count = 0 ## find the number of county with less than 100 points

    for x in num_county:
        if x < 100:
            count = count+1
    BOLD = "\033[1m"
    RESET = "\033[0m"
    print(BOLD+'Total number of counties in ' + year + ' is:::' + RESET, len(county_gdf))
    print('The smallest number of lat/lon pair in ' + year+ ' is:::', ind)
    print('Details of corresponding county', row_index)
    print('Number of counties less than 100 lat/lon pair:::', count)
    print('                       ### \n                       ### \n')
    

[1mTotal number of counties in 2008 is:::[0m 3109
The smallest number of lat/lon pair in 2008 is::: 129
Details of corresponding county STATEFP                                                    25
COUNTYFP                                                  019
COUNTYNS                                             00606936
CNTYIDFP                                                25019
NAME                                                Nantucket
NAMELSAD                                     Nantucket County
LSAD                                                       06
CLASSFP                                                    H4
MTFCC                                                   G4020
CSAFP                                                    None
CBSAFP                                                   None
METDIVFP                                                 None
FUNCSTAT                                                    N
geometry    POLYGON ((-70.43684499999999 41.299054, -70.

## An example with a single year

In [3]:
## directory that holds the 2010 county shapefile, relative to this notebook
shapefile_dir_parts = ('..', 'Shapefiles', 'county_shapefiles', '2010_county_shapefile')
PATH = os.path.join(*shapefile_dir_parts)

In [4]:
## Locate the shapefile (.shp) inside PATH.
## Sorting makes the choice deterministic (os.listdir order is arbitrary). Raising
## when nothing matches prevents a file_path left over from an earlier cell from
## being read by mistake.
shp_files = sorted(name for name in os.listdir(PATH) if name.endswith(".shp"))
if not shp_files:
    raise FileNotFoundError('No .shp file found in ' + PATH)
file_path = os.path.join(PATH, shp_files[-1])

In [5]:
## load the shapefile at file_path with geopandas
county_gdf = gpd.read_file(file_path)

## last expression of the cell: displays the loaded table
county_gdf

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry
0,0500000US01029,01,029,Cleburne,County,560.100,"POLYGON ((-85.38872 33.91304, -85.38088 33.873..."
1,0500000US01031,01,031,Coffee,County,678.972,"POLYGON ((-86.03044 31.61894, -86.00408 31.619..."
2,0500000US01037,01,037,Coosa,County,650.926,"POLYGON ((-86.00928 33.10164, -86.00917 33.090..."
3,0500000US01039,01,039,Covington,County,1030.456,"POLYGON ((-86.34851 30.99434, -86.35023 30.994..."
4,0500000US01041,01,041,Crenshaw,County,608.840,"POLYGON ((-86.14699 31.68045, -86.14711 31.663..."
...,...,...,...,...,...,...,...
3216,0500000US72131,72,131,San Sebastián,Muno,70.423,"POLYGON ((-66.90748 18.25314, -66.90739 18.253..."
3217,0500000US72133,72,133,Santa Isabel,Muno,34.023,"POLYGON ((-66.37968 17.94398, -66.38029 17.943..."
3218,0500000US72137,72,137,Toa Baja,Muno,23.241,"MULTIPOLYGON (((-66.13957 18.46232, -66.13956 ..."
3219,0500000US72139,72,139,Trujillo Alto,Muno,20.764,"POLYGON ((-66.02917 18.37590, -66.02828 18.376..."


In [6]:
## convert dtype of STATE from str to int so the codes can be compared numerically
county_gdf['STATE'] = county_gdf['STATE'].astype(str).astype(int)

## remove states such as guam, puerto rico (codes of 60 and above)
## and get rid of hawaii and alaska (codes 2 and 15)
state_code = county_gdf['STATE']
keep_rows = (state_code < 60) & ~state_code.isin([2, 15])

## apply the filter and reset the index so rows are numbered from 0 again
county_gdf = county_gdf.loc[keep_rows].reset_index(drop=True)
county_gdf

Unnamed: 0,GEO_ID,STATE,COUNTY,NAME,LSAD,CENSUSAREA,geometry
0,0500000US01029,1,029,Cleburne,County,560.100,"POLYGON ((-85.38872 33.91304, -85.38088 33.873..."
1,0500000US01031,1,031,Coffee,County,678.972,"POLYGON ((-86.03044 31.61894, -86.00408 31.619..."
2,0500000US01037,1,037,Coosa,County,650.926,"POLYGON ((-86.00928 33.10164, -86.00917 33.090..."
3,0500000US01039,1,039,Covington,County,1030.456,"POLYGON ((-86.34851 30.99434, -86.35023 30.994..."
4,0500000US01041,1,041,Crenshaw,County,608.840,"POLYGON ((-86.14699 31.68045, -86.14711 31.663..."
...,...,...,...,...,...,...,...
3104,0500000US56027,56,027,Niobrara,County,2626.037,"POLYGON ((-104.05298 42.85955, -104.05286 42.7..."
3105,0500000US56031,56,031,Platte,County,2084.208,"POLYGON ((-104.77417 42.60996, -104.76422 42.6..."
3106,0500000US56037,56,037,Sweetwater,County,10426.649,"POLYGON ((-109.05008 41.00066, -109.17368 41.0..."
3107,0500000US56043,56,043,Washakie,County,2238.549,"POLYGON ((-107.12892 43.99455, -107.12797 43.9..."


In [7]:
## take the geometry column of the filtered county table
geometry_column = county_gdf['geometry']
## extract the coordinates as x / y columns; with index_parts=True the result has a
## two-level index (county row number, then presumably the position of the point
## within that county's geometry -- see the displayed output)
lat_lon = geometry_column.get_coordinates(index_parts=True)
lat_lon

Unnamed: 0,Unnamed: 1,x,y
0,0,-85.388717,33.913044
0,1,-85.380885,33.873508
0,2,-85.379455,33.866291
0,3,-85.377426,33.856047
0,4,-85.376403,33.850656
...,...,...,...
3108,45,-104.054487,44.180381
3108,46,-104.054950,43.938090
3108,47,-104.055077,43.936535
3108,48,-104.055488,43.853477


#### In this step we find the minimum number of latitude (or longitude) coordinates that we can extract among all counties from the geometry attribute of the county shapefile

In [8]:
## number of counties left after the filtering above
len(county_gdf)

3109

In [9]:
## Count the coordinate pairs of every county in a single pass instead of one .loc
## lookup per county. Level 0 of the lat_lon index is the county row number, so the
## group sizes are the per-county point counts (the number of longitudes is the same
## as the number of latitudes). A county without any coordinates is counted as 0.
county_len = len(county_gdf)
total_num = (
    lat_lon.groupby(level=0).size()
    .reindex(range(county_len), fill_value=0)
    .tolist()
)
print(min(total_num))

18


In [10]:
## smallest number of longitude values found for any county
ind = min(total_num)
## position of the first county that has exactly that many longitude values
index_val = next(pos for pos, n_points in enumerate(total_num) if n_points == ind)

## row of county_gdf at position index_val
row_index = county_gdf.iloc[index_val]
print(row_index)

GEO_ID                                           0500000US48173
STATE                                                        48
COUNTY                                                      173
NAME                                                  Glasscock
LSAD                                                     County
CENSUSAREA                                              900.218
geometry      POLYGON ((-101.776085 32.086925, -101.772008 3...
Name: 2603, dtype: object


In [11]:
## Number of longitude values in each county, computed in a single pass: level 0 of
## the lat_lon index is the county row number, so the group sizes are the per-county
## point counts. A county without any coordinates is counted as 0.
num_county = (
    lat_lon.groupby(level=0).size()
    .reindex(range(len(county_gdf)), fill_value=0)
    .tolist()
)

## number of counties with fewer than 100 points
count = sum(1 for n_points in num_county if n_points < 100)

In [12]:
## number of counties with fewer than 100 lat/lon pairs
print(count)

796
