# In [4]:
# Packages for data munging
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import warnings
# Silence FutureWarning messages attributed to the `pyproj` module so they
# do not clutter the output while the geospatial data is loaded.
warnings.filterwarnings(
    action='ignore',
    category=FutureWarning, 
    module='pyproj'
)

# Functions for data munging
def clean_state_fips_code(cellcontents):
    """Return a state FIPS code as a two-character, zero-padded string.

    Intended for use with ``Series.apply`` on a column of integer (or
    string) state FIPS codes, e.g. ``1 -> "01"`` and ``12 -> "12"``.

    Parameters
    ----------
    cellcontents : int or str
        A single cell value holding a state FIPS code.

    Returns
    -------
    str
        The code left-padded with zeros to a width of two characters.
        Missing values (``None``/``NaN``/``pd.NA``) are returned unchanged.
    """
    # Leave missing values alone rather than turning them into "nan"/"None".
    if pd.isna(cellcontents):
        return cellcontents

    # Always return a string so the resulting column has one consistent type;
    # zfill also handles inputs that are already strings.
    return str(cellcontents).zfill(2)
    
def clean_county_fips_code(cellcontents):
    """Return a county FIPS code as a three-character, zero-padded string.

    Intended for use with ``Series.apply`` on a column of integer (or
    string) county FIPS codes, e.g. ``1 -> "001"``, ``13 -> "013"`` and
    ``510 -> "510"``.

    Parameters
    ----------
    cellcontents : int or str
        A single cell value holding a county FIPS code.

    Returns
    -------
    str
        The code left-padded with zeros to a width of three characters.
        Missing values (``None``/``NaN``/``pd.NA``) are returned unchanged.
    """
    # Leave missing values alone rather than turning them into "nan"/"None".
    if pd.isna(cellcontents):
        return cellcontents

    # Always return a string so the resulting column has one consistent type;
    # zfill covers the one- and two-digit cases in a single step.
    return str(cellcontents).zfill(3)
    
def munge_data(
    population_path="/home/alex/Downloads/co-est2018-alldata.csv",
    income_path="/home/alex/data/Unemployment.csv",
    output_path="county-level-hospital-population-and-income-data.csv",
):
    """Build a county-level table of hospitals, population and income.

    Downloads US county boundaries and a hospital locations dataset,
    combines them with local CSV copies of 2018 population estimates and
    2018 median household income, writes the result to ``output_path`` as
    CSV and returns it.

    Parameters
    ----------
    population_path : str, optional
        Path to the population estimates CSV. The default is a local copy of
        https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/totals/co-est2018-alldata.csv
        The file must provide ``STATE``, ``COUNTY`` and ``POPESTIMATE2018``
        columns.
    income_path : str, optional
        Path to the income CSV (the first four rows are skipped). It must
        provide ``FIPS`` and ``Median_Household_Income_2018`` columns.
    output_path : str, optional
        Destination of the CSV file that is written.

    Returns
    -------
    pandas.DataFrame
        One row per county with centroid coordinates, hospital and bed
        counts, the 2018 population estimate, the plotly-style FIPS string
        and the 2018 median household income.
    """
    # Remote data sources
    hospitals_url = "https://opendata.arcgis.com/datasets/6ac5e325468c4cb9b905f1728d6fbf0f_0.geojson"
    counties_url = "https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json"

    # --- County boundaries -------------------------------------------------
    # Ingest US county boundary data from the plotly datasets repository
    us_county_boundaries_gdf = gpd.read_file(counties_url)

    # Centroid coordinates: x is the longitude and y is the latitude
    # (GeoJSON coordinates are ordered longitude, latitude).
    centroids = us_county_boundaries_gdf["geometry"].centroid
    us_county_boundaries_gdf["Longitude"] = centroids.x
    us_county_boundaries_gdf["Latitude"] = centroids.y

    # The geometry is no longer needed; continue with a plain DataFrame
    us_county_df = pd.DataFrame(us_county_boundaries_gdf.drop(columns=["geometry"]))

    # --- Hospitals ---------------------------------------------------------
    # Ingest hospital locations and keep only hospitals with open status
    hospitals_gdf = gpd.read_file(hospitals_url)
    hospitals_gdf = hospitals_gdf[hospitals_gdf["STATUS"] == "OPEN"]

    # Drop geometry and status columns, then convert to a pandas DataFrame
    hospitals_df = pd.DataFrame(hospitals_gdf.drop(columns=["geometry", "STATUS"]))

    # -999 is used as a placeholder in BEDS; treat it as missing
    hospitals_df["BEDS"] = hospitals_df["BEDS"].replace(-999, np.nan)

    # Attach hospitals to counties through their FIPS identifiers
    counties_with_hospitals = us_county_df.merge(
        hospitals_df,
        left_on="id",
        right_on="COUNTYFIPS",
    )

    # Per-county hospital count and total bed count; counties without any
    # matching hospital get NaN in both columns.
    hospitals_by_county = counties_with_hospitals.groupby("COUNTYFIPS")
    us_county_df["Hospital Count"] = us_county_df["id"].map(
        hospitals_by_county["ID"].count()
    )
    us_county_df["Bed Count"] = us_county_df["id"].map(
        hospitals_by_county["BEDS"].sum()
    )

    # Convert identifier column to integer for the joins below
    us_county_df["id"] = us_county_df["id"].astype(int)

    # --- Population --------------------------------------------------------
    population_df = pd.read_csv(population_path, engine="python")

    # Zero-padded state (2 characters) and county (3 characters) FIPS parts
    population_df["STATE_FC"] = population_df["STATE"].apply(clean_state_fips_code)
    population_df["COUNTY_FC"] = population_df["COUNTY"].apply(clean_county_fips_code)

    # Five-character FIPS string for use with plotly choropleths ...
    population_df["FIPS_plotly"] = (
        population_df["STATE_FC"].astype(str) + population_df["COUNTY_FC"].astype(str)
    )

    # ... and the same identifier as an integer for joining on `id`
    population_df["FIPS_CODE"] = population_df["FIPS_plotly"].astype(int)

    # Keep only the FIPS identifiers and the 2018 population estimate
    population_df = population_df[["FIPS_CODE", "FIPS_plotly", "POPESTIMATE2018"]]

    # Join county data with population data, one row per county
    county_data = us_county_df.merge(
        population_df,
        left_on="id",
        right_on="FIPS_CODE",
        how="left",
    )
    county_data = county_data.drop_duplicates("id")

    # --- Income ------------------------------------------------------------
    income_df = pd.read_csv(income_path, skiprows=4)

    # Strip dollar signs and thousands separators, then convert to float.
    # regex=False makes "$" a literal character regardless of pandas version.
    income_df["Median_Household_Income_2018"] = (
        income_df["Median_Household_Income_2018"]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

    # Join county data with income data, one row per county
    county_data_with_income = county_data.merge(
        income_df[["FIPS", "Median_Household_Income_2018"]],
        left_on="id",
        right_on="FIPS",
        how="left",
    )
    county_data_with_income = county_data_with_income.drop_duplicates("id")

    # Drop identifier/helper columns that are not wanted in the output
    county_data_with_income = county_data_with_income.drop(
        columns=["COUNTY", "id", "GEO_ID", "LSAD", "FIPS_CODE", "FIPS"]
    )

    # Two-letter state abbreviations keyed to their FIPS state codes
    state_codes = {
        'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
        'FL': '12', 'WY': '56', 'PR': '72', 'NJ': '34', 'NM': '35', 'TX': '48',
        'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
        'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
        'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
        'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
        'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
        'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
        'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46',
    }

    # Invert the mapping to fit the data, i.e. {"WA": "53"} becomes {"53": "WA"}
    inverted_state_codes = {code: abbreviation for abbreviation, code in state_codes.items()}

    # Replace STATE FIPS codes with abbreviations; codes that are not in the
    # dictionary become NaN.
    county_data_with_income["STATE"] = county_data_with_income["STATE"].map(inverted_state_codes)

    # Save the result and hand it back to the caller
    county_data_with_income.to_csv(output_path, index=False)
    return county_data_with_income
    
# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported for its helper functions.
if __name__ == "__main__":
    munge_data()