In [8]:
# 500 Cities Data Analysis - basic code
# Written by Michelle Schmitz, originally in Jupyter notebooks and then in Python code.
# Initially written on 02 March 2019

In [9]:
# Importing in pandas and numpy libraries to handle data management aspects of the dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd 

# Show up to 50 columns when displaying wide frames.
# The fully-qualified key is required: the bare pattern 'max_columns' matches more
# than one registered option on recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 50)
%matplotlib inline

In [13]:
### Reading in the 500 Cities data set, created by the CDC's Division of Population Health (2018 release)

## This is the original dataset's link:
# read.csv("https://catalog.data.gov/dataset/500-cities-local-data-for-better-health-b32fd/resource/8a49a1f7-4fcc-49a6-acb5-fcd3c0796782")

# CSV export endpoint for the dataset on data.cdc.gov.
url = 'https://data.cdc.gov/api/views/6vp6-wxuq/rows.csv'

# Download and parse the CSV directly from the URL, then preview the first three rows.
data_500_cities = pd.read_csv(filepath_or_buffer=url)
data_500_cities.head(n=3)

KeyboardInterrupt: 

In [None]:
# checking the first state as well
# Preview the rows whose StateAbbr is 'AL'.
data_500_cities.loc[data_500_cities['StateAbbr'] == 'AL'].head()

In [None]:
# doing more specific descriptive checks for all of the data - and the numeric data in particular
# Column names, non-null counts and dtypes (printed by .info()).
data_500_cities.info()

# Summary statistics for every numeric column; this includes identifier-like
# numeric columns such as the FIPS codes.
data_500_cities.describe()

In [None]:
# Examining categorical values of Measures variable, creating series object
# Columns carried through the rest of the analysis.
Col_Measure = [
    'CityName',
    'StateAbbr',
    'DataValueTypeID',
    'Measure',
    'Data_Value',
    'Low_Confidence_Limit',
    'High_Confidence_Limit',
    'CityFIPS',
    'TractFIPS',
    'PopulationCount',
    'GeoLocation',
    'UniqueID',
]
print(Col_Measure)

In [None]:
# Pivot for Age-Adjusted Prevalence (updated)
# Age-adjusted prevalence, rows with StateAbbr == 'US' only: mean of the estimate
# and of both confidence limits for each Measure.
pivot1 = pd.pivot_table(
    # Single .loc selection (row mask + column list) instead of chained df[cols][mask].
    data_500_cities.loc[
        (data_500_cities.DataValueTypeID == 'AgeAdjPrv') & (data_500_cities.StateAbbr == 'US'),
        Col_Measure,
    ],
    values=['Low_Confidence_Limit', 'Data_Value', 'High_Confidence_Limit'],
    index='Measure',
    margins=False,
    dropna=False,
    # The 'mean' alias selects the same aggregation as np.mean without the
    # deprecation warning pandas 2.x emits for NumPy callables.
    aggfunc={
        'Low_Confidence_Limit': 'mean',
        'Data_Value': 'mean',
        'High_Confidence_Limit': 'mean',
    },
)
pivot1

In [None]:
# Pivot for Crude Prevalence (updated)
# Crude prevalence, rows with StateAbbr == 'US' only: mean of the estimate
# and of both confidence limits for each Measure.
pivot2 = pd.pivot_table(
    # Single .loc selection (row mask + column list) instead of chained df[cols][mask].
    data_500_cities.loc[
        (data_500_cities.DataValueTypeID == 'CrdPrv') & (data_500_cities.StateAbbr == 'US'),
        Col_Measure,
    ],
    values=['Low_Confidence_Limit', 'Data_Value', 'High_Confidence_Limit'],
    index='Measure',
    margins=False,
    dropna=False,
    # The 'mean' alias selects the same aggregation as np.mean without the
    # deprecation warning pandas 2.x emits for NumPy callables.
    aggfunc={
        'Low_Confidence_Limit': 'mean',
        'Data_Value': 'mean',
        'High_Confidence_Limit': 'mean',
    },
)
pivot2

In [None]:
# Want to display the data for US overall, and averaged for all counties together

## pivot 1

#converting back into another data frame to display data together
# Age-adjusted pivot: flatten to a plain frame, re-key on Measure, and relabel the
# three value columns so they stay distinguishable after the two tables are joined.
flattened1 = pd.DataFrame(pivot1.to_records())
with_new_index1 = flattened1.set_index('Measure').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.AgeAdjPrv',
    'High_Confidence_Limit': 'High_CL.AgeAdjPrv',
    'Data_Value': 'Data_Value.AgeAdjPrv',
})

# Crude pivot: same treatment with the CrdPrv suffix.
flattened2 = pd.DataFrame(pivot2.to_records())
with_new_index2 = flattened2.set_index('Measure').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.CrdPrv',
    'High_Confidence_Limit': 'High_CL.CrdPrv',
    'Data_Value': 'Data_Value.CrdPrv',
})

with_new_index2

In [None]:
# Joining 2 pivot tables together in case if I want to output larger table
# Dropping a good reference on different types of joins here -- http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/

# Outer-join the age-adjusted and crude tables on their shared Measure key.
US_data_table = with_new_index1.merge(with_new_index2, on='Measure', how='outer')

# Flatten the joined table back into a plain frame (Measure becomes a column again).
US_data_table_ft = pd.DataFrame(US_data_table.to_records())
US_data_table_ft

In [None]:
# Reordering columns so that it makes sense in a comparison framework
# Column order for side-by-side comparison: crude (low, value, high) followed by
# age-adjusted (low, value, high).
_comparison_order = [
    'Low_CL.CrdPrv', 'Data_Value.CrdPrv', 'High_CL.CrdPrv',
    'Low_CL.AgeAdjPrv', 'Data_Value.AgeAdjPrv', 'High_CL.AgeAdjPrv',
]
US_data_table_index = US_data_table_ft.set_index('Measure')
US_data_table_ro = US_data_table_index[_comparison_order]
US_data_table_ro

In [None]:
# Now that I've presented the data in a nice-to-see way for the entire US - let's see what the distribution is of
# the PREVALENCES (both CRUDE and AGE-ADJUSTED) for all cities (n=500)

In [None]:
# Getting ALL PREVALENCES for all observations that ARE NOT the US summary (i.e., not summed across US)
# Every row that is NOT the 'US' summary, restricted to the analysis columns.
# .loc does the row and column selection in one step, and .copy() makes DF_Prev an
# independent frame so that columns can be added to it later without pandas
# warning about writing into a slice of data_500_cities.
DF_Prev = data_500_cities.loc[data_500_cities.StateAbbr != 'US', Col_Measure].copy()

## Number of observations per data-value type (crude vs. age-adjusted).
# .size() returns the row count of each group; .groups would instead list every
# row label belonging to each group.
DF_Prev.groupby('DataValueTypeID').size()

In [None]:
# I will split the prevalences dataset into crude and age-adjusted prevalences, before splitting them further 
# into each measure and recombining them. This reshapes the dataset so each Measure will have its own column,
# making it easier to compare across measures for future analyses.

In [None]:
## Recoding Measure into MeasureShort

# Full Measure text -> short label used for grouping and for column suffixes.
measure_short_names = {
    'All teeth lost among adults aged >=65 Years': 'TeethLost',
    'Arthritis among adults aged >=18 Years': 'Arthritis',
    'Binge drinking among adults aged >=18 Years': 'BngDrnk',
    'Cancer (excluding skin cancer) among adults aged >=18 Years': 'Cancer',
    'Cholesterol screening among adults aged >=18 Years': 'Cholesterol',
    'Chronic kidney disease among adults aged >=18 Years': 'KidneyDis',
    'Chronic obstructive pulmonary disease among adults aged >=18 Years': 'COPD',
    'Coronary heart disease among adults aged >=18 Years': 'CHD',
    'Current asthma among adults aged >=18 Years': 'Asthma',
    'Current lack of health insurance among adults aged 18–64 Years': 'NoHlthIns',
    'Current smoking among adults aged >=18 Years': 'CurrSmoke',
    'Diagnosed diabetes among adults aged >=18 Years': 'Diabetes',
    'Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50–75 Years': 'FecBldTst',
    'High blood pressure among adults aged >=18 Years': 'HighBP',
    'High cholesterol among adults aged >=18 Years who have been screened in the past 5 Years': 'HighChol',
    'Mammography use among women aged 50–74 Years': 'Mammo',
    'Mental health not good for >=14 days among adults aged >=18 Years': 'MentHlth',
    'No leisure-time physical activity among adults aged >=18 Years': 'NoPhysAct',
    'Obesity among adults aged >=18 Years': 'Obesity',
    'Older adult men aged >=65 Years who are up to date on a core set of clinical preventive services: Flu shot past Year, PPV shot ever, Colorectal cancer screening': 'OlderMen',
    'Older adult women aged >=65 Years who are up to date on a core set of clinical preventive services: Flu shot past Year, PPV shot ever, Colorectal cancer screening, and Mammogram past 2 Years': 'OlderWomen',
    'Papanicolaou smear use among adult women aged 21–65 Years': 'PapSmear',
    'Physical health not good for >=14 days among adults aged >=18 Years': 'PhysHlthBad',
    'Sleeping less than 7 hours among adults aged >=18 Years': 'SleepLittle',
    'Stroke among adults aged >=18 Years': 'Stroke',
    'Taking medicine for high blood pressure control among adults aged >=18 Years with high blood pressure': 'HtnMeds',
    'Visits to dentist or dental clinic among adults aged >=18 Years': 'DentalVisits',
    'Visits to doctor for routine checkup within the past Year among adults aged >=18 Years': 'DocVisits',
}

# One dictionary lookup per row replaces 28 boolean masks fed to np.select.
# np.select's implicit default is the integer 0, which newer NumPy releases refuse
# to combine with string choices; the lookup keeps the same visible sentinel ('0')
# for any Measure text that is not in the table, so unmatched rows still show up
# in the cross-tab check.
# .assign builds a new frame rather than writing a column into an existing slice.
DF_Prev = DF_Prev.assign(
    Measure_Short=DF_Prev['Measure'].map(measure_short_names).fillna('0')
)

#print(DF_Prev)

In [None]:
# Cross-tab to check the outputs of my shortened measure variable
# Cross-tab of full Measure text against Measure_Short: every Measure row should
# have its count in exactly one Measure_Short column.
# Only the last expression of a cell is rendered automatically, so the cross-tab
# (which was previously computed and thrown away) is shown explicitly via the
# notebook's built-in display().
display(pd.crosstab(DF_Prev['Measure'], DF_Prev['Measure_Short']))

DF_Prev.head()

In [None]:
# Getting the CRUDE PREVALENCE for all observations that ARE NOT the US summary
# Keep only the rows whose data-value type is crude prevalence ('CrdPrv').
DF_CrdPrev = DF_Prev.loc[DF_Prev['DataValueTypeID'] == 'CrdPrv']
DF_CrdPrev

In [None]:
# Boxplot of crude prevalences

# Single box summarising Data_Value across all crude-prevalence rows, drawn on an
# explicitly created Axes.
fig1, ax1 = plt.subplots()
ax1.set_title('Boxplot, Crude Prevalence')
DF_CrdPrev[['Data_Value']].boxplot(grid=False, ax=ax1)

In [None]:
# Getting the AGE-ADJUSTED PREVALENCE for all observations that ARE NOT the US summary
# Keep only the rows whose data-value type is age-adjusted prevalence ('AgeAdjPrv').
DF_AgeAdjPrev = DF_Prev.loc[DF_Prev['DataValueTypeID'] == 'AgeAdjPrv']
DF_AgeAdjPrev

In [None]:
# BOXPLOT OF AGE-ADJUSTED PREVALENCE (which is adjusted based on all of the crude prevalences) - 
# This is for all causes smashed together
# Single box summarising Data_Value across all age-adjusted rows (all measures pooled).
fig2, ax2 = plt.subplots()
ax2.set_title('Boxplot, Age-Adjusted Prevalence')
DF_AgeAdjPrev[['Data_Value']].boxplot(grid=False, ax=ax2)

In [None]:
## Breaking up the CRUDE PREVALENCES dataset by each measure grouping
# Mapping of each Measure_Short label to the row labels of its crude-prevalence rows.
measure_groups = DF_CrdPrev.groupby(by='Measure_Short').groups
measure_groups

In [None]:
# Getting keys of the dataset (so we can break the datasets up appropriately)
# Group the crude-prevalence rows by short measure label; gb is reused below to
# pull out one measure at a time.
gb = DF_CrdPrev.groupby(by='Measure_Short')
gb.groups.keys()

In [None]:
# If I was better at Python, I'd basically do a FOR loop for the above set of Dict Keys, 
# to basically run the set of commands that would break up each dataset, rename some columns and read them
# to the entire dataset. 

# However, I've only given myself a day to write up this analysis, and I'm out of practice.

# I will revisit the FOR loop idea at a later date.

In [None]:
# I wanted to see the distributions of the 28 variables.
# Could make paneled histograms, but boxplots and other measures could be interesting too.

In [None]:
# Pandas boxplot
# One box per Measure_Short label, showing the spread of crude-prevalence Data_Value.
DF_CrdPrev.boxplot(column=['Data_Value'], by='Measure_Short', grid=False)

In [None]:
# Seaborn boxplot
# Same comparison drawn with seaborn: one box per Measure_Short label.
sns.boxplot(
    data=DF_CrdPrev,
    x='Measure_Short',
    y='Data_Value',
    width=0.5,
    palette="colorblind",
)

In [None]:
# # Boxplots weren't helpful, and the axes did not leave much to be seen.
# # I decided to try to do a paneled histogram, to see if we could see approximate distributions.
# # I was still interested in comparisons, though...

# #link 1: https://realpython.com/python-histograms/
# #link 2: https://jakevdp.github.io/PythonDataScienceHandbook/04.14-visualization-with-seaborn.html

# #fig, axes = plt.subplots(nrows=2, ncols=2)
# #ax0, ax1, ax2, ax3 = axes.flatten()

# # Set up the plot
# #ax = plt.subplot(14, 2, Measure_i) #set it up to be a very long, very narrow plot
# # fig = plt.figure()
# # fig.subplots_adjust(hspace=0.4, wspace=0.4)
                    
# for Measure_i in range(1, 28):

# #     fig = plt.figure()
# #     fig.subplots_adjust(hspace=0.4, wspace=0.4)

#     # Subset to the Measure of interest
#     subset = DF_CrdPrev[DF_CrdPrev['Measure_Short'] == Measure_i]

#     fig, ax = plt.subplots(14, 2, sharex='col', sharey='row')

#     # Draw the plot
#     ax.hist(subset['Data_Value'], bins = 10,
#              color = 'blue', edgecolor = 'black', label = Measure_i)
    
#     # Title and labels
# #    ax.set_title('Histogram', size = 10)
#     ax.set_xlabel('Prevalence (per 100,000)', size = 10)
#     ax.set_ylabel('Flights', size= 10)

# #plt.tight_layout()
# plt.show()

In [None]:
#Looking at the list of keys - as we have 28 variables, we can't use all variables in one graphic.
# Iterating the groups mapping yields its keys, i.e. the Measure_Short labels.
list(gb.groups)

In [None]:
## Boxplots aren't helpful (surprise - there's 28 variables, with a lot of overlap!)
# So, I'm creating a density plot of all of our measures to see what the relative densities are.

# Remember: PREVALENCE is a measure of the population (new and current cases, per 100,000)
# Density here measures the number of CENSUS TRACTS that have a certain prevalence.

# idea taken from https://towardsdatascience.com/histograms-and-density-plots-in-python-f6bda88f5ac0

In [None]:
# First seven measures: overlay one crude-prevalence density curve per measure.
Measures_Plot1 = ['Arthritis','Asthma','BngDrnk','CHD','COPD','Cancer','Cholesterol']

fig, ax = plt.subplots()
for Measure_i in Measures_Plot1:
    subset = DF_CrdPrev[DF_CrdPrev['Measure_Short'] == Measure_i]  # rows for this measure only

    # sns.distplot is deprecated; kdeplot draws the same KDE-only curve.
    # Missing values are dropped explicitly before estimating the density.
    sns.kdeplot(subset['Data_Value'].dropna(), linewidth=3, label=Measure_i, ax=ax)

# Legend sits outside the axes on the right so it does not cover the curves.
ax.legend(prop={'size': 12}, title='Measure', bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_title('Density Plot with Chronic Disease Measures (1)')
# NOTE(review): the label states "per 100,000" -- confirm the unit of Data_Value
# against the dataset documentation.
ax.set_xlabel('Prevalence (per 100,000)')
ax.set_ylabel('Density');

In [None]:
# Second seven measures: overlay one crude-prevalence density curve per measure.
Measures_Plot2 = ['CurrSmoke','DentalVisits','Diabetes','DocVisits','FecBldTst','HighBP','HighChol']

fig, ax = plt.subplots()
for Measure_i in Measures_Plot2:
    subset = DF_CrdPrev[DF_CrdPrev['Measure_Short'] == Measure_i]  # rows for this measure only

    # sns.distplot is deprecated; kdeplot draws the same KDE-only curve.
    # Missing values are dropped explicitly before estimating the density.
    sns.kdeplot(subset['Data_Value'].dropna(), linewidth=3, label=Measure_i, ax=ax)

# Legend sits outside the axes on the right so it does not cover the curves.
ax.legend(prop={'size': 12}, title='Measure', bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_title('Density Plot with Chronic Disease Measures (2)')
# NOTE(review): the label states "per 100,000" -- confirm the unit of Data_Value
# against the dataset documentation.
ax.set_xlabel('Prevalence (per 100,000)')
ax.set_ylabel('Density');

In [None]:
# Third seven measures: overlay one crude-prevalence density curve per measure.
Measures_Plot3 = ['HtnMeds','KidneyDis','Mammo','MentHlth','NoHlthIns','NoPhysAct','Obesity']

fig, ax = plt.subplots()
for Measure_i in Measures_Plot3:
    subset = DF_CrdPrev[DF_CrdPrev['Measure_Short'] == Measure_i]  # rows for this measure only

    # sns.distplot is deprecated; kdeplot draws the same KDE-only curve.
    # Missing values are dropped explicitly before estimating the density.
    sns.kdeplot(subset['Data_Value'].dropna(), linewidth=3, label=Measure_i, ax=ax)

# Legend sits outside the axes on the right so it does not cover the curves.
ax.legend(prop={'size': 12}, title='Measure', bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_title('Density Plot with Chronic Disease Measures (3)')
# NOTE(review): the label states "per 100,000" -- confirm the unit of Data_Value
# against the dataset documentation.
ax.set_xlabel('Prevalence (per 100,000)')
ax.set_ylabel('Density');

In [None]:
# Last seven measures: overlay one crude-prevalence density curve per measure.
Measures_Plot4 = ['OlderMen','OlderWomen','PapSmear','PhysHlthBad','SleepLittle','Stroke','TeethLost']

fig, ax = plt.subplots()
for Measure_i in Measures_Plot4:
    subset = DF_CrdPrev[DF_CrdPrev['Measure_Short'] == Measure_i]  # rows for this measure only

    # sns.distplot is deprecated; kdeplot draws the same KDE-only curve.
    # Missing values are dropped explicitly before estimating the density.
    sns.kdeplot(subset['Data_Value'].dropna(), linewidth=3, label=Measure_i, ax=ax)

# Legend sits outside the axes on the right so it does not cover the curves.
ax.legend(prop={'size': 12}, title='Measure', bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_title('Density Plot with Chronic Disease Measures (4)')
# NOTE(review): the label states "per 100,000" -- confirm the unit of Data_Value
# against the dataset documentation.
ax.set_xlabel('Prevalence (per 100,000)')
ax.set_ylabel('Density');


WHAT DOES THE ABOVE ANALYSIS TELL US?

A lot of census tracts are reporting a low prevalence of many chronic conditions, such as CHD, stroke, and diabetes (i.e., high density at a very low prevalence).

Obviously, the indicators with much higher prevalences are those that affect larger swathes of the population such as mammograms and Pap smears (for women), dental visits and doctor's visits (ideally for the entire population).

However, there were some outcomes I was intrigued by - sleeping little, Pap smears, hypertension meds, and cholesterol - that I want to map out, because the density plot can't tell us much. We know there are spatial relationships to some of these indicators - especially here in the south, where there are worse health outcomes overall.

My interests, for a long time, have been in spatial epidemiology - the effects of place upon health outcomes.

This dataset, as part of the 500 Cities project, would play well with this interest.

So, I decided to reshape my dataset so that each census tract is a distinct row and each measure has its own column. I figured this would play better with the mapping capabilities found in GeoPandas, pandas, and Matplotlib.

Link:
https://towardsdatascience.com/lets-make-a-map-using-geopandas-pandas-and-matplotlib-to-make-a-chloropleth-map-dddc31c1983d

In [None]:
# Below is my code to reshape a dataset so I had more control over the columns - 
# say, if I was creating a dataset for analysis.

In [None]:
# Split the grouped data into separate per-measure frames, starting with Arthritis.
# The Measure text is identical on every row of a single-measure frame, so it is
# dropped. drop() returns a new frame, so its result has to be assigned -- calling
# it without assignment left the column in place.
Arthritis = gb.get_group('Arthritis').drop(columns=['Measure'])
Arthritis.head()

In [None]:
# Changing the index variable and starting to rename variables.
# Key the Arthritis frame on UniqueID, tag its estimate/limit columns with the
# measure name, and shorten PopulationCount to PopCount.
Arthritis_index = Arthritis.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.Arthritis',
    'High_Confidence_Limit': 'High_CL.Arthritis',
    'Data_Value': 'Data_Value.Arthritis',
    'PopulationCount': 'PopCount',
})

# From here on both names refer to the same re-indexed frame.
Arthritis = Arthritis_index
Arthritis_index

In [None]:
## All other indicators will only have Unique Identifiers and the CLs and Data Values Attached to Them

# Columns kept for every remaining measure: the two confidence limits, the
# estimate, and the identifier used as the join key.
Col_Grps = [
    'Low_Confidence_Limit',
    'High_Confidence_Limit',
    'Data_Value',
    'UniqueID',
]
print(Col_Grps)

#List: 'Asthma', 'BngDrnk', 'CHD', 'COPD', 'Cancer', 'Cholesterol', 'CurrSmoke', 'DentalVisits', 'Diabetes', 'DocVisits', 'FecBldTst', 'HighBP', 'HighChol', 'HtnMeds', 'KidneyDis', 'Mammo', 'MentHlth', 'NoHlthIns', 'NoPhysAct', 'Obesity', 'OlderMen', 'OlderWomen', 'PapSmear', 'PhysHlthBad', 'SleepLittle', 'Stroke', 'TeethLost']

In [None]:
# Asthma: take this measure's rows from gb, keeping only the Col_Grps columns.
Asthma = gb.get_group('Asthma')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
Asthma_index = Asthma.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.Asthma',
    'High_Confidence_Limit': 'High_CL.Asthma',
    'Data_Value': 'Data_Value.Asthma',
})
Asthma_index.head()

In [None]:
# Binge drinking: take this measure's rows from gb, keeping only the Col_Grps columns.
BngDrnk = gb.get_group('BngDrnk')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
BngDrnk_index = BngDrnk.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.BngDrnk',
    'High_Confidence_Limit': 'High_CL.BngDrnk',
    'Data_Value': 'Data_Value.BngDrnk',
})
BngDrnk_index.head()

In [None]:
# Coronary heart disease: take this measure's rows from gb, keeping only the Col_Grps columns.
CHD = gb.get_group('CHD')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
CHD_index = CHD.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.CHD',
    'High_Confidence_Limit': 'High_CL.CHD',
    'Data_Value': 'Data_Value.CHD',
})
CHD_index.head()

In [None]:
# COPD: take this measure's rows from gb, keeping only the Col_Grps columns.
COPD = gb.get_group('COPD')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
COPD_index = COPD.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.COPD',
    'High_Confidence_Limit': 'High_CL.COPD',
    'Data_Value': 'Data_Value.COPD',
})
COPD_index.head()

In [None]:
# Cancer: take this measure's rows from gb, keeping only the Col_Grps columns.
Cancer = gb.get_group('Cancer')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
Cancer_index = Cancer.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.Cancer',
    'High_Confidence_Limit': 'High_CL.Cancer',
    'Data_Value': 'Data_Value.Cancer',
})
Cancer_index.head()

In [None]:
# Cholesterol screening: take this measure's rows from gb, keeping only the Col_Grps columns.
Cholesterol = gb.get_group('Cholesterol')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
Cholesterol_index = Cholesterol.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.Cholesterol',
    'High_Confidence_Limit': 'High_CL.Cholesterol',
    'Data_Value': 'Data_Value.Cholesterol',
})
Cholesterol_index.head()

In [None]:
# Current smoking: take this measure's rows from gb, keeping only the Col_Grps columns.
CurrSmoke = gb.get_group('CurrSmoke')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
CurrSmoke_index = CurrSmoke.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.CurrSmoke',
    'High_Confidence_Limit': 'High_CL.CurrSmoke',
    'Data_Value': 'Data_Value.CurrSmoke',
})
CurrSmoke_index.head()

In [None]:
# Dental visits: take this measure's rows from gb, keeping only the Col_Grps columns.
DentalVisits = gb.get_group('DentalVisits')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
DentalVisits_index = DentalVisits.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.DentalVisits',
    'High_Confidence_Limit': 'High_CL.DentalVisits',
    'Data_Value': 'Data_Value.DentalVisits',
})
DentalVisits_index.head()

In [None]:
# Diabetes: take this measure's rows from gb, keeping only the Col_Grps columns.
Diabetes = gb.get_group('Diabetes')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
Diabetes_index = Diabetes.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.Diabetes',
    'High_Confidence_Limit': 'High_CL.Diabetes',
    'Data_Value': 'Data_Value.Diabetes',
})
Diabetes_index.head()

In [None]:
# Doctor visits: take this measure's rows from gb, keeping only the Col_Grps columns.
DocVisits = gb.get_group('DocVisits')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
DocVisits_index = DocVisits.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.DocVisits',
    'High_Confidence_Limit': 'High_CL.DocVisits',
    'Data_Value': 'Data_Value.DocVisits',
})
DocVisits_index.head()

In [None]:
# Fecal occult blood test / colorectal screening: take this measure's rows from gb,
# keeping only the Col_Grps columns.
FecBldTst = gb.get_group('FecBldTst')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
FecBldTst_index = FecBldTst.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.FecBldTst',
    'High_Confidence_Limit': 'High_CL.FecBldTst',
    'Data_Value': 'Data_Value.FecBldTst',
})
FecBldTst_index.head()

In [None]:
# High blood pressure: take this measure's rows from gb, keeping only the Col_Grps columns.
HighBP = gb.get_group('HighBP')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
HighBP_index = HighBP.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.HighBP',
    'High_Confidence_Limit': 'High_CL.HighBP',
    'Data_Value': 'Data_Value.HighBP',
})
HighBP_index.head()

In [None]:
# High cholesterol: take this measure's rows from gb, keeping only the Col_Grps columns.
HighChol = gb.get_group('HighChol')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
HighChol_index = HighChol.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.HighChol',
    'High_Confidence_Limit': 'High_CL.HighChol',
    'Data_Value': 'Data_Value.HighChol',
})
HighChol_index.head()

In [None]:
# Hypertension medication: take this measure's rows from gb, keeping only the Col_Grps columns.
HtnMeds = gb.get_group('HtnMeds')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
HtnMeds_index = HtnMeds.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.HtnMeds',
    'High_Confidence_Limit': 'High_CL.HtnMeds',
    'Data_Value': 'Data_Value.HtnMeds',
})
HtnMeds_index.head()

In [None]:
# Chronic kidney disease: take this measure's rows from gb, keeping only the Col_Grps columns.
KidneyDis = gb.get_group('KidneyDis')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
KidneyDis_index = KidneyDis.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.KidneyDis',
    'High_Confidence_Limit': 'High_CL.KidneyDis',
    'Data_Value': 'Data_Value.KidneyDis',
})
KidneyDis_index.head()

In [None]:
# Mammography use: take this measure's rows from gb, keeping only the Col_Grps columns.
Mammo = gb.get_group('Mammo')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
Mammo_index = Mammo.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.Mammo',
    'High_Confidence_Limit': 'High_CL.Mammo',
    'Data_Value': 'Data_Value.Mammo',
})
Mammo_index.head()

In [None]:
# Mental health not good: take this measure's rows from gb, keeping only the Col_Grps columns.
MentHlth = gb.get_group('MentHlth')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
MentHlth_index = MentHlth.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.MentHlth',
    'High_Confidence_Limit': 'High_CL.MentHlth',
    'Data_Value': 'Data_Value.MentHlth',
})
MentHlth_index.head()

In [None]:
# Lack of health insurance: take this measure's rows from gb, keeping only the Col_Grps columns.
NoHlthIns = gb.get_group('NoHlthIns')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
NoHlthIns_index = NoHlthIns.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.NoHlthIns',
    'High_Confidence_Limit': 'High_CL.NoHlthIns',
    'Data_Value': 'Data_Value.NoHlthIns',
})
NoHlthIns_index.head()

In [None]:
# No leisure-time physical activity: take this measure's rows from gb, keeping only
# the Col_Grps columns.
NoPhysAct = gb.get_group('NoPhysAct')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
NoPhysAct_index = NoPhysAct.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.NoPhysAct',
    'High_Confidence_Limit': 'High_CL.NoPhysAct',
    'Data_Value': 'Data_Value.NoPhysAct',
})
NoPhysAct_index.head()

In [None]:
# Obesity: take this measure's rows from gb, keeping only the Col_Grps columns.
Obesity = gb.get_group('Obesity')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
Obesity_index = Obesity.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.Obesity',
    'High_Confidence_Limit': 'High_CL.Obesity',
    'Data_Value': 'Data_Value.Obesity',
})
Obesity_index.head()

In [None]:
# Preventive services, older men: take this measure's rows from gb, keeping only
# the Col_Grps columns.
OlderMen = gb.get_group('OlderMen')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
OlderMen_index = OlderMen.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.OlderMen',
    'High_Confidence_Limit': 'High_CL.OlderMen',
    'Data_Value': 'Data_Value.OlderMen',
})
OlderMen_index.head()

In [None]:
# Preventive services, older women: take this measure's rows from gb, keeping only
# the Col_Grps columns.
OlderWomen = gb.get_group('OlderWomen')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
OlderWomen_index = OlderWomen.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.OlderWomen',
    'High_Confidence_Limit': 'High_CL.OlderWomen',
    'Data_Value': 'Data_Value.OlderWomen',
})
OlderWomen_index.head()

In [None]:
# Pap smear use: take this measure's rows from gb, keeping only the Col_Grps columns.
PapSmear = gb.get_group('PapSmear')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
PapSmear_index = PapSmear.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.PapSmear',
    'High_Confidence_Limit': 'High_CL.PapSmear',
    'Data_Value': 'Data_Value.PapSmear',
})
PapSmear_index.head()

In [None]:
# Physical health not good: take this measure's rows from gb, keeping only the Col_Grps columns.
PhysHlthBad = gb.get_group('PhysHlthBad')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
PhysHlthBad_index = PhysHlthBad.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.PhysHlthBad',
    'High_Confidence_Limit': 'High_CL.PhysHlthBad',
    'Data_Value': 'Data_Value.PhysHlthBad',
})
PhysHlthBad_index.head()

In [None]:
# Sleeping less than 7 hours: take this measure's rows from gb, keeping only the
# Col_Grps columns.
SleepLittle = gb.get_group('SleepLittle')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
SleepLittle_index = SleepLittle.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.SleepLittle',
    'High_Confidence_Limit': 'High_CL.SleepLittle',
    'Data_Value': 'Data_Value.SleepLittle',
})
SleepLittle_index.head()

In [None]:
# Stroke: take this measure's rows from gb, keeping only the Col_Grps columns.
Stroke = gb.get_group('Stroke')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
Stroke_index = Stroke.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.Stroke',
    'High_Confidence_Limit': 'High_CL.Stroke',
    'Data_Value': 'Data_Value.Stroke',
})
Stroke_index.head()

In [None]:
# All teeth lost: take this measure's rows from gb, keeping only the Col_Grps columns.
TeethLost = gb.get_group('TeethLost')[Col_Grps]

# Key on UniqueID and tag the estimate/limit columns with the measure name.
TeethLost_index = TeethLost.set_index('UniqueID').rename(columns={
    'Low_Confidence_Limit': 'Low_CL.TeethLost',
    'High_Confidence_Limit': 'High_CL.TeethLost',
    'Data_Value': 'Data_Value.TeethLost',
})
TeethLost_index.head()

In [None]:
## https://towardsdatascience.com/lets-make-a-map-using-geopandas-pandas-and-matplotlib-to-make-a-chloropleth-map-dddc31c1983d

In [None]:
## Time to merge all of the datasets together into one big dataset (but I know it's going to be a lot of intermediates!)
# There might be an easier way of doing this in Python, but I'm out of practice enough not to think about it.
# Also, really jetlagged. :D

#'Arthritis_index', 'Asthma_index', 'BngDrnk_index', 'CHD_index', 'COPD_index', 'Cancer_index', 'Cholesterol_index', 'CurrSmoke_index', 'DentalVisits_index', 'Diabetes_index', 'DocVisits_index', 'FecBldTst_index', 'HighBP_index', 'HighChol_index', 'HtnMeds_index', 'KidneyDis_index', 'Mammo_index', 'MentHlth_index', 'NoHlthIns_index', 'NoPhysAct_index', 'Obesity_index', 'OlderMen_index', 'OlderWomen_index', 'PapSmear_index', 'PhysHlthBad_index', 'SleepLittle_index', 'Stroke_index', 'TeethLost_index' 

# Combine the per-measure index tables into one wide table keyed on 'UniqueID'.
# The tables are outer-merged one after another, in the order listed, so a
# UniqueID that is absent from some measures is still kept in the result.
measure_tables = [
    Arthritis_index, Asthma_index, BngDrnk_index, CHD_index, COPD_index,
    Cancer_index, Cholesterol_index, CurrSmoke_index, DentalVisits_index,
    Diabetes_index, DocVisits_index, FecBldTst_index, HighBP_index,
    HighChol_index, HtnMeds_index, KidneyDis_index, Mammo_index,
    MentHlth_index, NoHlthIns_index, NoPhysAct_index, Obesity_index,
    OlderMen_index, OlderWomen_index, PapSmear_index, PhysHlthBad_index,
    SleepLittle_index, Stroke_index, TeethLost_index,
]

# Rebinding the running result on every pass releases the previous
# intermediate, which saves memory the same way the explicit `del`s did.
# The name int_table27 is kept because the following cells read it.
int_table27 = measure_tables[0]
for measure_table in measure_tables[1:]:
    int_table27 = pd.merge(int_table27, measure_table, on='UniqueID', how='outer')

## FINAL TABLE
int_table27

In [None]:
# RENAMING DATASET - re-key the merged table on UniqueID and remove the measure label columns

# Rebuild the frame from its record-array form, then make UniqueID the row index
df_Ind_int = pd.DataFrame(int_table27.to_records())
df_Ind_int_index = df_Ind_int.set_index(keys='UniqueID')

# Remove the 'Measure' and 'Measure_Short' columns from the re-keyed table
df_Indicators = df_Ind_int_index.drop(columns=['Measure', 'Measure_Short'])
df_Indicators

In [None]:
# Print a structural summary of the merged indicator table (columns, non-null counts, dtypes)
df_Indicators.info()

In [None]:
# Reorder tables for nicer appearance: identifying columns first, then one
# triplet of columns per measure (Data_Value, Low_CL, High_CL).

id_columns = ['DataValueTypeID', 'CityName', 'StateAbbr', 'CityFIPS', 'TractFIPS', 'PopCount', 'GeoLocation']

# Measure suffixes, in the order their column triplets should appear
measure_names = [
    'Arthritis', 'Asthma', 'BngDrnk', 'CHD', 'COPD', 'Cancer', 'Cholesterol',
    'CurrSmoke', 'DentalVisits', 'Diabetes', 'DocVisits', 'FecBldTst',
    'HighBP', 'HighChol', 'HtnMeds', 'KidneyDis', 'Mammo', 'MentHlth',
    'NoHlthIns', 'NoPhysAct', 'Obesity', 'OlderMen', 'OlderWomen',
    'PapSmear', 'PhysHlthBad', 'SleepLittle', 'Stroke', 'TeethLost',
]

# Every measure contributes the same three columns, e.g. 'Data_Value.Arthritis',
# 'Low_CL.Arthritis', 'High_CL.Arthritis'. Generating the names from one list
# keeps the three per-measure columns from drifting out of sync.
stat_prefixes = ['Data_Value', 'Low_CL', 'High_CL']
measure_columns = [
    prefix + '.' + measure
    for measure in measure_names
    for prefix in stat_prefixes
]

indicator_table = df_Indicators[id_columns + measure_columns]

indicator_table

In [None]:
# Descriptive statistics for the separate indicators, using only rows that
# have no missing values
indicator_description = (
    indicator_table
    .dropna()
    .describe()
)
indicator_description

In [None]:
# Write the descriptive-statistics table out as a CSV file.
# NOTE(review): the destination is an absolute path on one specific Windows machine;
# consider a configurable output directory so the cell can run elsewhere.
indicator_description.to_csv('C:/Users/mmsch/OneDrive/Desktop/Data Science/indicator_description.csv')

In [None]:
## EXPORTING OUT MY LAST DATAFRAME INTO A CSV FILE FOR USE IN GEOPANDAS (which is a local install on my computer)
# NOTE(review): the destination is an absolute path on one specific Windows machine;
# consider a configurable output directory so the cell can run elsewhere.

indicator_table.to_csv('C:/Users/mmsch/OneDrive/Desktop/Data Science/indicator_table.csv')

In [None]:
## Note - I am linking Geopandas code onto Github and simply exporting out my dataset as a CSV. (03/03/2019)

In [None]:
# # %load https://raw.githubusercontent.com/censusreporter/census-shapefile-utils/master/fetch_shapefiles.py
# '''
# This script will download TIGER data shapefiles from the Census FTP site.
# It can be used to download a set of geographies defined in GEO_TYPES_LIST,
# or can be used to fetch files for a single state and/or single geography type.
# Pass an -s argument to limit by state, pass a -g argument to limit
# to a single geography type, and/or pass a -y argument to change the year
# from 2012 to something else (e.g. 2015).

#     >> python fetch_shapefiles.py
#     >> python fetch_shapefiles.py -s WA
#     >> python fetch_shapefiles.py -g place
#     >> python fetch_shapefiles.py -y 2015
#     >> python fetch_shapefiles.py -s WA -g place -y 2015

# If you use the -s argument to fetch files for a single state, the script
# will also download the national county, state and congressional district
# files that include data for your chosen state.

# The script will create DOWNLOAD_DIR and EXTRACT_DIR directories
# if necessary, fetch a zipfile or set of zipfiles from the Census website,
# then extract the shapefiles from each zipfile retrieved.

# DISABLE_AUTO_DOWNLOADS will prevent certain geography types from being
# automatically downloaded if no -g argument is passed to fetch_shapefiles.py.
# This may be useful because certain files, such as those for Zip Code
# Tabulation Areas, are extremely large. You can still target any geography
# in GEO_TYPES_LIST specifically, however. So to fetch the ZCTA data:

#     >> python fetch_shapefiles.py -g zcta5
# '''

# import optparse
# import os
# import sys
# import zipfile
# from os.path import isdir, join, normpath

# try:
#     from six.moves.urllib import request as urllib2
# except ImportError:
#     import urllib2

# from __init__ import (DOWNLOAD_DIR, EXTRACT_DIR, STATE_ABBREV_LIST,
#                       GEO_TYPES_LIST, DISABLE_AUTO_DOWNLOADS,
#                       get_fips_code_for_state)

# FTP_HOME = 'ftp://ftp2.census.gov/geo/tiger/TIGER2012/'


# def get_filename_list_from_ftp(target, state):
#     target_files = urllib2.urlopen(target).read().splitlines()
#     filename_list = []

#     for line in target_files:
#         filename = '%s%s' % (target, line.decode().split()[-1])
#         filename_list.append(filename)

#     if state:
#         state_check = '_%s_' % get_fips_code_for_state(state)
#         filename_list = filter(
#             lambda filename:
#                 state_check in filename or
#                 ('_us_' in filename and
#                  '_us_zcta5' not in filename),
#             filename_list
#         )

#     return filename_list


# def get_content_length(u):
#     # u is returned by urllib2.urlopen
#     if sys.version_info[0] == 2:
#         return int(u.info().getheader("Content-Length"))
#     else:
#         return int(u.headers["Content-Length"])


# def download_files_in_list(filename_list, force=False):
#     downloaded_filename_list = []
#     for file_location in filename_list:
#         filename = '%s/%s' % (DOWNLOAD_DIR, file_location.split('/')[-1])
#         if force or not os.path.exists(filename):
#             # Only download if required.
#             u = urllib2.urlopen(file_location)
#             f = open(filename, 'wb')
#             file_size = get_content_length(u)

#             print("Downloading: %s Bytes: %s" % (filename, file_size))
#             file_size_dl = 0
#             block_sz = 8192
#             while True:
#                 buffer = u.read(block_sz)
#                 if not buffer:
#                     break

#                 file_size_dl += len(buffer)
#                 f.write(buffer)
#                 status = r"%10d  [%3.2f%%]" % (
#                     file_size_dl, file_size_dl * 100. / file_size)
#                 status = status + chr(8) * (len(status) + 1)
#                 sys.stdout.write(status)
#                 sys.stdout.flush()

#             f.close()
#         downloaded_filename_list.append(filename)

#     return downloaded_filename_list


# def extract_downloaded_file(filename, remove_on_error=True):
#     zip_dir = filename.replace('.zip', '').split('/')[-1]
#     target_dir = normpath(join(EXTRACT_DIR, zip_dir))

#     print("Extracting: " + filename + " ...")
#     try:
#         zipped = zipfile.ZipFile(filename, 'r')
#     except zipfile.BadZipFile as ze:
#         if remove_on_error:
#             os.remove(filename)
#             raise Exception(
#                 "Removed corrupt zip file (%s). Retry download." % filename)
#         raise ze

#     zipped.extractall(target_dir)
#     zipped.close()


# def get_one_geo_type(geo_type, state=None, year='2012'):
#     target = '%s%s/' % (FTP_HOME.replace('2012', year), geo_type.upper())

#     print("Finding files in: " + target + " ...")
#     filename_list = get_filename_list_from_ftp(target, state)
#     downloaded_filename_list = download_files_in_list(filename_list)

#     for filename in downloaded_filename_list:
#         extract_downloaded_file(filename)


# def get_all_geo_types(state=None, year='2012'):
#     AUTO_DOWNLOADS = filter(
#         lambda geo_type: geo_type not in DISABLE_AUTO_DOWNLOADS,
#         GEO_TYPES_LIST
#     )
#     for geo_type in AUTO_DOWNLOADS:
#         get_one_geo_type(geo_type, state, year)


# def process_options(arglist=None):
#     global options, args
#     parser = optparse.OptionParser()
#     parser.add_option(
#         '-s', '--state',
#         dest='state',
#         help='specific state to download',
#         choices=STATE_ABBREV_LIST,
#         default=None
#     )
#     parser.add_option(
#         '-g', '--geo', '--geo_type',
#         dest='geo_type',
#         help='specific geographic type to download',
#         choices=GEO_TYPES_LIST,
#         default=None
#     )
#     parser.add_option(
#         '-y', '--year',
#         dest='year',
#         help='specific year to download',
#         default='2012'
#     )

#     options, args = parser.parse_args(arglist)
#     return options, args


# def main(args=None):
#     """
#     >> python fetch_shapefiles.py
#     >> python fetch_shapefiles.py -s WA
#     >> python fetch_shapefiles.py -g place
#     >> python fetch_shapefiles.py -s WA -g place
#     """
#     if args is None:
#         args = sys.argv[1:]
#     options, args = process_options(args)

#     # make sure we have the expected directories
#     for path in [DOWNLOAD_DIR, EXTRACT_DIR]:
#         if not isdir(path):
#             os.makedirs(path)

#     # get one geo_type or all geo_types
#     if options.geo_type:
#         get_one_geo_type(
#             geo_type = options.geo_type,
#             state = options.state,
#             year=options.year
#         )
#     else:
#         get_all_geo_types(
#             state = options.state,
#             year=options.year
#         )


# if __name__ == '__main__':
#     main()


In [None]:
# Note - as of 03/03/2019, Geopandas hasn't been working on this notebook.
# Until I properly debug this, I am commenting out all geopandas references, exporting my dataset
# as a CSV, then linking the Python file onto Github - Michelle

# # To date, my entire Jupyter analysis has been using shapefiles online. 
# # Until I figure out how to get the Census Reporter Python file working, I will change my CD to the Data Science
# # subfolder set up on my Desktop.

# cd = "C:/Users/mmsch/OneDrive/Desktop/Data Science"

# # set the filepath and load in a shapefile
# fp = "C:/Users/mmsch/OneDrive/Desktop/Data Science/tl_2018_us_ttract.shp"

# map_df = gpd.read_file(fp)
# # check data type so we can see that this is not a normal dataframe, but a GEOdataframe
# map_df.head()

In [None]:
# Other interesting links!

# https://jakevdp.github.io/PythonDataScienceHandbook/04.13-geographic-data-with-basemap.html