In [2]:
import pandas as pd
import numpy as np
import json
import io
from io import StringIO
import os

from shapely.geometry import Point, LineString
from shapely.geometry.polygon import Polygon
from shapely import wkt

from math import sin, cos, sqrt, atan2, radians


## Data captured from
https://gis-txdot.opendata.arcgis.com/datasets/TXDOT::texas-county-boundaries-detailed/about


In [4]:
# Load the county-boundary GeoJSON file into a dictionary.
# The context manager closes the file handle as soon as parsing is done
# (the original left the handle open for the life of the kernel).
with open('.geojson') as f:
    data = json.load(f)

In [154]:
# Build one record per GeoJSON feature: the county name and the WKT of its centroid.
county_detail = []
for feature in data['features']:
    # NOTE(review): [0][0] picks the first ring of the first polygon, which
    # presumes MultiPolygon-style nesting, and the centroid is computed for
    # that ring as a line rather than for the enclosed area - confirm intended.
    first_ring = feature['geometry']['coordinates'][0][0]
    county_detail.append({
        'county': feature['properties']['CNTY_NM'],
        'centroid': LineString(first_ring).centroid.wkt,
    })

In [155]:
# Turn the list of {'county', 'centroid'} records into a DataFrame.
# Note: this rebinds `county_detail` from a list to a DataFrame.
county_detail = pd.DataFrame(county_detail)

In [156]:
# Preview the raw county / WKT-centroid records
county_detail.head()

Unnamed: 0,county,centroid
0,Atascosa,POINT (-98.52550129237503 28.89332098109323)
1,Matagorda,POINT (-95.99530169303894 28.84771261549633)
2,Jackson,POINT (-96.56923395706752 28.8874181215156)
3,De Witt,POINT (-97.36382516011945 29.07072652031978)
4,Wilson,POINT (-98.07891405628709 29.18641646756914)


### Clean up the DataFrame

In [157]:
## Remove 'POINT (' and the trailing close parenthesis, then replace the space with a comma.
# The patterns are raw strings with an explicit regex=True: the default for
# `regex` changed to False in pandas 2.0, which would treat 'POINT \(' as a
# literal, leave the prefix in place and make the float conversion below fail.
county_detail['centroid'] = (
    county_detail['centroid']
    .str.replace(r'POINT \(', '', regex=True)
    .str.replace(r'\)', '', regex=True)
    .str.replace(' ', ',', regex=False)
)
# Split the "longitude,latitude" text once and create the numeric columns
lon_lat = county_detail['centroid'].astype(str).str.split(',')
county_detail['longitude'] = lon_lat.str[0].astype(float)
county_detail['latitude'] = lon_lat.str[1].astype(float)
# Lower-case the county names; the two source lists disagreed on the
# spelling of DeWitt county, so normalise 'de witt' to 'dewitt'
county_detail['county'] = (
    county_detail['county']
    .str.lower()
    .str.replace('de witt', 'dewitt', regex=False)
)


  county_detail['centroid'] = county_detail['centroid'].str.replace('POINT \(','')
  county_detail['centroid'] = county_detail['centroid'].str.replace('\)','')


In [162]:
# Preview the cleaned frame with the numeric longitude / latitude columns
county_detail.head()

Unnamed: 0,county,centroid,longitude,latitude
0,atascosa,"-98.52550129237503,28.89332098109323",-98.52550129237504,28.89332098109323
1,matagorda,"-95.99530169303894,28.84771261549633",-95.99530169303894,28.84771261549633
2,jackson,"-96.56923395706752,28.8874181215156",-96.56923395706752,28.8874181215156
3,dewitt,"-97.36382516011945,29.07072652031978",-97.36382516011945,29.07072652031978
4,wilson,"-98.07891405628709,29.18641646756914",-98.07891405628708,29.18641646756914


### Load clinic data

In [86]:
# Read the 'Data_ for_STATA' sheet of the clinic workbook into a DataFrame
clinic = pd.read_excel('contra&ab_ data.xlsx',sheet_name='Data_ for_STATA')

In [138]:
# Strip leading and trailing whitespace from the county names
clinic['county'] = clinic['county'].map(lambda name: name.strip())

In [103]:
# List the available columns of the clinic sheet
clinic.columns

Index(['county', 'fempop2015_ACS', 'fempop2010_ACS', 'fempop2010_TDC',
       'fempop2015_TDC', 'whitefempop2015_TDC', 'blackfempop2015_TDC',
       'otherfempop2015_TDC', 'hispfempop2015_TDC', 'job2015all',
       ...
       'ffs2015', 'mco2015', 'Medtot2015', 'ffs2010', 'pccm2010', 'mco2010',
       'Medtot2010', 'HSR (8)', 'border', 'metro'],
      dtype='object', length=106)

In [139]:
# Keep only the county name and the 2010 / 2015 clinic counts.
# .copy() makes this an independent frame, so later column assignments on
# clnc_df do not raise SettingWithCopyWarning or depend on view/copy semantics.
clnc_df = clinic[['county','totclinic2010','totclin2015']].copy()


In [140]:
# Check the column types of the reduced clinic frame
clnc_df.dtypes

county           object
totclinic2010     int64
totclin2015       int64
dtype: object

In [141]:
# Lower-case the county names so they can be matched against county_detail.
# assign() builds a new frame instead of writing into a slice of `clinic`,
# which is what triggered SettingWithCopyWarning.
clnc_df = clnc_df.assign(county=clnc_df['county'].str.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clnc_df['county'] = [x.lower() for x in clnc_df['county']]


In [None]:
# check that the county lists are literally identical

In [168]:
# Compare the county names of the clinic table and the centroid table.
# Both lists are expected to be empty; any name shown here would have no
# match in the other table when centroids are looked up by county name.
missing_in_centroids = sorted(set(clnc_df['county']) - set(county_detail['county']))
missing_in_clinics = sorted(set(county_detail['county']) - set(clnc_df['county']))
missing_in_centroids, missing_in_clinics

In [169]:
## Split the county names by 2010 clinic count: none vs. at least one
no_clc_list = clnc_df.loc[clnc_df['totclinic2010'] == 0, 'county'].tolist()
clc_list = clnc_df.loc[clnc_df['totclinic2010'] > 0, 'county'].tolist()


In [173]:
# Sanity check: the two lists together should account for every county row
len(no_clc_list)+len(clc_list)

254

In [187]:
def distance_earth(lat1,lon1,lat2,lon2,radius=3963.19):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        The default, 3963.19, is the approximate radius of the earth in
        miles (not km), so the default result is in miles. Pass 6371.0
        to get kilometres.

    Returns
    -------
    float
        Distance along the surface of the sphere.
    """
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (nearly) antipodal points, which would make sqrt() raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius * c

In [188]:
# For every county without a clinic in 2010, find the county with a clinic
# whose centroid is nearest (distance as returned by distance_earth).

# Build the centroid lookup once instead of filtering the DataFrame four
# times per inner iteration. setdefault keeps the first row for a county
# name, matching the previous `.values[0]` behaviour.
centroid_by_county = {}
for name, lat, lon in zip(county_detail['county'],
                          county_detail['latitude'],
                          county_detail['longitude']):
    centroid_by_county.setdefault(name, (lat, lon))

data_1_2010 = []
for cnty in no_clc_list:
    lat1, lon1 = centroid_by_county[cnty]
    # Reset both values for each county: with the old fixed sentinel of 1000
    # a county with no candidate inside that range would silently inherit
    # `closest_county` from the previous iteration (or raise NameError).
    closest_county = None
    closest_distance = float('inf')
    for cnty2 in clc_list:
        lat2, lon2 = centroid_by_county[cnty2]
        temp_distance = distance_earth(lat1,lon1,lat2,lon2)
        # Strict '<' keeps the first candidate on ties
        if temp_distance < closest_distance:
            closest_distance = temp_distance
            closest_county = cnty2
    data_1_2010.append({'county':cnty,'closest_county':closest_county,'closest_distance':closest_distance})


In [189]:
# One row per county without a clinic in 2010: county, closest_county, closest_distance
df1_1_2010 = pd.DataFrame(data_1_2010)

In [190]:
# Preview the 2010 nearest-clinic-county results
df1_1_2010.head()

Unnamed: 0,county,closest_county,closest_distance
0,anderson,cherokee,28.589673
1,archer,wichita,26.615393
2,armstrong,randall,30.573348
3,bandera,medina,26.014339
4,blanco,hays,27.04781


In [192]:
# Row count equals the number of counties in no_clc_list for 2010
df1_1_2010.shape

(113, 3)

In [200]:
# Put the clinic county first and order the rows by it
df1_1_2010 = (
    df1_1_2010
    .loc[:, ['closest_county', 'county', 'closest_distance']]
    .sort_values(by=['closest_county'])
)

In [201]:
# Preview the 2010 results after reordering the columns and sorting by closest_county
df1_1_2010.head()

Unnamed: 0,closest_county,county,closest_distance
101,angelina,trinity,32.802964
81,angelina,polk,34.84786
85,aransas,refugio,13.003857
11,aransas,calhoun,26.310291
105,austin,washington,25.240788


### 2015 data

In [203]:
# Rebuild both county lists from the 2015 clinic counts (this overwrites the
# 2010 lists), then show their combined length as a sanity check
no_clc_list = clnc_df.loc[clnc_df['totclin2015'] == 0, 'county'].tolist()
clc_list = clnc_df.loc[clnc_df['totclin2015'] > 0, 'county'].tolist()
len(no_clc_list) + len(clc_list)

254

In [204]:
# For every county without a clinic in 2015, find the county with a clinic
# whose centroid is nearest (distance as returned by distance_earth).

# Build the centroid lookup once instead of filtering the DataFrame four
# times per inner iteration. setdefault keeps the first row for a county
# name, matching the previous `.values[0]` behaviour.
centroid_by_county = {}
for name, lat, lon in zip(county_detail['county'],
                          county_detail['latitude'],
                          county_detail['longitude']):
    centroid_by_county.setdefault(name, (lat, lon))

data_1_2015 = []
for cnty in no_clc_list:
    lat1, lon1 = centroid_by_county[cnty]
    # Reset both values for each county: with the old fixed sentinel of 1000
    # a county with no candidate inside that range would silently inherit
    # `closest_county` from the previous iteration (or raise NameError).
    closest_county = None
    closest_distance = float('inf')
    for cnty2 in clc_list:
        lat2, lon2 = centroid_by_county[cnty2]
        temp_distance = distance_earth(lat1,lon1,lat2,lon2)
        # Strict '<' keeps the first candidate on ties
        if temp_distance < closest_distance:
            closest_distance = temp_distance
            closest_county = cnty2
    data_1_2015.append({'county':cnty,'closest_county':closest_county,'closest_distance':closest_distance})


In [205]:
# One row per county without a clinic in 2015: county, closest_county, closest_distance
df1_1_2015 = pd.DataFrame(data_1_2015)

In [206]:
# Put the clinic county first and order the rows by it
df1_1_2015 = (
    df1_1_2015
    .loc[:, ['closest_county', 'county', 'closest_distance']]
    .sort_values(by=['closest_county'])
)

In [207]:
# Preview the sorted 2015 nearest-clinic-county results
df1_1_2015.head()

Unnamed: 0,closest_county,county,closest_distance
94,aransas,refugio,13.003857
10,aransas,calhoun,26.310291
20,austin,colorado,24.971734
69,bailey,lamb,27.42005
22,bell,coryell,31.087835


In [208]:
# Row count of the 2015 result frame
df1_1_2015.shape

(126, 3)

In [209]:
# Number of counties with zero clinics in 2015; should equal the row count of df1_1_2015
len(no_clc_list)

126

### Output files

In [210]:
# Write the nearest-clinic-county tables for both years to CSV.
# The DataFrame index is written as the first, unnamed column.
df1_1_2015.to_csv('data_2015.csv')
df1_1_2010.to_csv('data_2010.csv')


In [211]:
# Write the cleaned county centroid table (county, centroid, longitude, latitude) to CSV
county_detail.to_csv('data_county_centroid.csv')