# 6.3 Geographic Visualization 

### This script contains the following:
#### 1. Import data and libraries
#### 2. Data wrangling
#### 3. Data cleaning
#### 4. Plotting a choropleth

### 1. Import data and libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json

In [3]:
# Show plots inline 

%matplotlib inline

In [4]:
# Import ".json" file for the U.S. 

country_geo = r'/Users/peterreadman/Desktop/Python Projects/PortfolioProjects/HealthcareSpending/02 Data/02a Original Data/us-states.json'

In [5]:
# That's just in case you want to look at the JSON file contents here too:

f = open(r'/Users/peterreadman/Desktop/Python Projects/CareerFoundry/Achievement 6 readthroughs/02 Data/02a Original Data/us-states.json',)
  
# returns JSON object asa dictionary
data = json.load(f)
  
# Iterating through the json list
for i in data['features']:
    print(i)

{'type': 'Feature', 'id': 'AL', 'properties': {'name': 'Alabama'}, 'geometry': {'type': 'Polygon', 'coordinates': [[[-87.359296, 35.00118], [-85.606675, 34.984749], [-85.431413, 34.124869], [-85.184951, 32.859696], [-85.069935, 32.580372], [-84.960397, 32.421541], [-85.004212, 32.322956], [-84.889196, 32.262709], [-85.058981, 32.13674], [-85.053504, 32.01077], [-85.141136, 31.840985], [-85.042551, 31.539753], [-85.113751, 31.27686], [-85.004212, 31.003013], [-85.497137, 30.997536], [-87.600282, 30.997536], [-87.633143, 30.86609], [-87.408589, 30.674397], [-87.446927, 30.510088], [-87.37025, 30.427934], [-87.518128, 30.280057], [-87.655051, 30.247195], [-87.90699, 30.411504], [-87.934375, 30.657966], [-88.011052, 30.685351], [-88.10416, 30.499135], [-88.137022, 30.318396], [-88.394438, 30.367688], [-88.471115, 31.895754], [-88.241084, 33.796253], [-88.098683, 34.891641], [-88.202745, 34.995703], [-87.359296, 35.00118]]]}}
{'type': 'Feature', 'id': 'AK', 'properties': {'name': 'Alaska'},

In [7]:
# Turn project folder path into a string
path = r'/Users/peterreadman/Desktop/Python Projects/PortfolioProjects/HealthcareSpending/'

In [117]:
# import data

household_df = pd.read_csv(os.path.join(path,'02 Data','02a Original Data','HealthInsurance_Income.csv'))
person_df = pd.read_csv(os.path.join(path,'02 Data','02a Original Data','Per_Med_Expenses.csv'))

### 2. Data Wrangling

In [118]:
household_df.columns

Index(['Household_ID', 'Region', 'State_Code', 'Census_Division',
       'Household_Type', 'Income_Bracket_CPS', 'Num_Persons_Household',
       'Health_Ins_Last_year', 'Retirement_Distr', 'Retirement_Distr_58',
       'Total_Household_Earnings', 'Income_Farm', 'Income_Bracket',
       'SelfEmployed_Farm', 'SelfEmployed', 'Wage_Salary',
       'Medicaid_Last_Year', 'Inc_Percentiles', 'Health_Ins_Private_Last_year',
       'Health_Ins_Public_Last_year', 'Income_SE', 'Income_Top5_Pct',
       'Total_Household_Income', 'Income_Wages_Salary', 'Health_Ins_Now',
       'Medicaid_Now', 'Health_Ins_Private_Now', 'Health_Ins_Public_Now',
       'Metropolitan_Code', 'County_Code', 'Prin_City', 'Met_Area_Size',
       'Stats_Area_FIPS', 'Metropolitan_Status', 'City_Code_Ind_Princ'],
      dtype='object')

#### 2a) Transform 'State_Code' column to 'State_Name'

Change 'State_Code' FIPS codes to State names to match JSON.
- 1 = Alabama
- 2 = Alaska
    
etc.
        

In [119]:
household_df['State_Code'].value_counts()

6     7123
48    5590
12    4366
36    3550
17    2352
42    2287
22    2129
37    2105
39    2084
11    2069
13    1966
26    1914
1     1863
25    1822
34    1803
35    1761
54    1755
47    1744
28    1739
51    1661
5     1626
30    1579
4     1537
53    1536
45    1482
40    1408
56    1374
15    1361
18    1357
38    1347
50    1318
16    1302
41    1257
49    1245
29    1209
32    1186
55    1138
8     1122
10    1117
24    1116
33    1110
20    1099
27    1086
31    1063
21    1063
2     1018
46     981
9      936
19     857
44     842
23     842
Name: State_Code, dtype: int64

In [120]:
# Mapping dictionary for FIPS codes to state names

fips_to_state = {
    '1': 'Alabama',
    '2': 'Alaska',
    '4': 'Arizona',
    '5': 'Arkansas',
    '6': 'California',
    '8': 'Colorado',
    '9': 'Connecticut',
    '10': 'Delaware',
    '11': 'District of Columbia',
    '12': 'Florida',
    '13': 'Georgia',
    '15': 'Hawaii',
    '16': 'Idaho',
    '17': 'Illinois',
    '18': 'Indiana',
    '19': 'Iowa',
    '20': 'Kansas',
    '21': 'Kentucky',
    '22': 'Louisiana',
    '23': 'Maine',
    '24': 'Maryland',
    '25': 'Massachusetts',
    '26': 'Michigan',
    '27': 'Minnesota',
    '28': 'Mississippi',
    '29': 'Missouri',
    '30': 'Montana',
    '31': 'Nebraska',
    '32': 'Nevada',
    '33': 'New Hampshire',
    '34': 'New Jersey',
    '35': 'New Mexico',
    '36': 'New York',
    '37': 'North Carolina',
    '38': 'North Dakota',
    '39': 'Ohio',
    '40': 'Oklahoma',
    '41': 'Oregon',
    '42': 'Pennsylvania',
    '44': 'Rhode Island',
    '45': 'South Carolina',
    '46': 'South Dakota',
    '47': 'Tennessee',
    '48': 'Texas',
    '49': 'Utah',
    '50': 'Vermont',
    '51': 'Virginia',
    '53': 'Washington',
    '54': 'West Virginia',
    '55': 'Wisconsin',
    '56': 'Wyoming'
}


In [121]:
# Convert 'State_Code' column to string data type
household_df['State_Code'] = household_df['State_Code'].astype(str)

In [122]:
# Apply the mapping to create a new 'State_Name' column
household_df['State_Code'] = household_df['State_Code'].map(fips_to_state)

In [123]:
# Rename 'State_Code' column to 'State_Name'
household_df = household_df.rename(columns={'State_Code': 'State_Name'})

In [124]:
household_df['State_Name']

0         Maine
1         Maine
2         Maine
3         Maine
4         Maine
          ...  
89192    Hawaii
89193    Hawaii
89194    Hawaii
89195    Hawaii
89196    Hawaii
Name: State_Name, Length: 89197, dtype: object

In [125]:
household_df.head()

Unnamed: 0,Household_ID,Region,State_Name,Census_Division,Household_Type,Income_Bracket_CPS,Num_Persons_Household,Health_Ins_Last_year,Retirement_Distr,Retirement_Distr_58,...,Medicaid_Now,Health_Ins_Private_Now,Health_Ins_Public_Now,Metropolitan_Code,County_Code,Prin_City,Met_Area_Size,Stats_Area_FIPS,Metropolitan_Status,City_Code_Ind_Princ
0,2.02209e+19,1,Maine,1,6,11,1,1,0,2,...,3,1,3,0,0,3,0,0,2,0
1,7.70732e+18,1,Maine,1,0,-1,0,0,0,0,...,0,0,0,0,0,3,0,0,2,0
2,1.57712e+19,1,Maine,1,0,-1,0,0,0,0,...,0,0,0,0,0,3,0,0,2,0
3,1.66073e+19,1,Maine,1,0,-1,0,0,0,0,...,0,0,0,0,0,3,0,0,2,0
4,2.10006e+19,1,Maine,1,1,12,2,1,0,2,...,3,1,3,0,0,3,0,0,2,0


In [126]:
household_df['State_Name'].value_counts()

California              7123
Texas                   5590
Florida                 4366
New York                3550
Illinois                2352
Pennsylvania            2287
Louisiana               2129
North Carolina          2105
Ohio                    2084
District of Columbia    2069
Georgia                 1966
Michigan                1914
Alabama                 1863
Massachusetts           1822
New Jersey              1803
New Mexico              1761
West Virginia           1755
Tennessee               1744
Mississippi             1739
Virginia                1661
Arkansas                1626
Montana                 1579
Arizona                 1537
Washington              1536
South Carolina          1482
Oklahoma                1408
Wyoming                 1374
Hawaii                  1361
Indiana                 1357
North Dakota            1347
Vermont                 1318
Idaho                   1302
Oregon                  1257
Utah                    1245
Missouri      

#### 2b) Coverage: drop columns other than State Name and types of insurance coverage

To start with, it will be useful to gain a general sense of how insurance coverage differs, if at all, across the US.<br>
<br>
There are several variables recording different types of coverage:
- Health_Ins_Now: any type of insurance among household members. This is a useful 'big picture' metric to see which households have any kind of insurance or not
- Medicaid_Now: medicaid coverage
- Health_Ins_Private_Now: private insurance coverage
- Health_Ins_Public_Now: public insurnace coverage, e.g. via the Healthcare Marketplace (ACA)

Note that records of coverage through Medicare are not in the 'Household' data set and can be found in the 'Person' data set.<br>
<br>
For the purpose of this exploration, I will map 'Health_Ins_Now' to see the geographic distribution of households with any type of coverage across the US.<br>
<br>
There are three answers to the question:

1. **All persons** *in household have coverage*
2. **Some persons** *in household have coverage*
3. **No persons** *in household have coverage*

In [127]:
# Create a list of columns for inclusion in mapping coverage

columns_cov = [ 'Household_ID'
                 #,'Region'
                 ,'State_Name'
                 #,'Census_Division'
                 #,'Household_Type'
                 #,'Income_Bracket_CPS'
                 #,'Num_Persons_Household'
                 #,'Health_Ins_Last_year'
                 #,'Retirement_Distr'
                 #,'Retirement_Distr_58'
                 #,'Total_Household_Earnings'
                 #,'Income_Farm'
                 #,'Income_Bracket'
                 #,'SelfEmployed_Farm'
                 #,'SelfEmployed'
                 #,'Wage_Salary'
                 #,'Medicaid_Last_Year'
                 #,'Inc_Percentiles'
                 #,'Health_Ins_Private_Last_year'
                 #,'Health_Ins_Public_Last_year'
                 #,'Income_SE'
                 #,'Income_Top5_Pct'
                 #,'Total_Household_Income'
                 #,'Income_Wages_Salary'
                 ,'Health_Ins_Now'
                 ,'Medicaid_Now'
                 ,'Health_Ins_Private_Now'
                 ,'Health_Ins_Public_Now'
                 #,'Metropolitan_Code'
                 #,'County_Code'
                 #,'Prin_City','Met_Area_Size'
                 #,'Stats_Area_FIPS'
                 #,'Metropolitan_Status'
                 #,'City_Code_Ind_Princ'
                   ]

In [128]:
# New dataframe with relevant columns:
household_cov = household_df[columns_cov].copy()

In [129]:
household_cov.head()

Unnamed: 0,Household_ID,State_Name,Health_Ins_Now,Medicaid_Now,Health_Ins_Private_Now,Health_Ins_Public_Now
0,2.02209e+19,Maine,1,3,1,3
1,7.70732e+18,Maine,0,0,0,0
2,1.57712e+19,Maine,0,0,0,0
3,1.66073e+19,Maine,0,0,0,0
4,2.10006e+19,Maine,1,3,1,3


### 3. Conduct consistency checks


#### Each insurance column has values 1–3 and 0. In each column:

1 = All persons in household have coverage <br>
2 = Some persons in household have coverage <br>
3 = No persons in household have coverage <br>
 <br>
0 = Null, not zero members of household

These numerical values will be transformed into the following:

1: **All** <br>
2: **Some** <br>
3: **None** <br>
0: 

In [130]:
household_cov.dtypes

Household_ID              float64
State_Name                 object
Health_Ins_Now              int64
Medicaid_Now                int64
Health_Ins_Private_Now      int64
Health_Ins_Public_Now       int64
dtype: object

In [131]:
# Replace 0 with none (null)
household_cov['Health_Ins_Now'] = household_cov['Health_Ins_Now'].replace({
    0: np.nan
})
household_cov['Medicaid_Now'] = household_cov['Medicaid_Now'].replace({
    0: np.nan
})
household_cov['Health_Ins_Private_Now'] = household_cov['Health_Ins_Private_Now'].replace({
    0: np.nan
})
household_cov['Health_Ins_Public_Now'] = household_cov['Health_Ins_Public_Now'].replace({
    0: np.nan
})


In [132]:
household_cov.head()

Unnamed: 0,Household_ID,State_Name,Health_Ins_Now,Medicaid_Now,Health_Ins_Private_Now,Health_Ins_Public_Now
0,2.02209e+19,Maine,1.0,3.0,1.0,3.0
1,7.70732e+18,Maine,,,,
2,1.57712e+19,Maine,,,,
3,1.66073e+19,Maine,,,,
4,2.10006e+19,Maine,1.0,3.0,1.0,3.0


In [133]:
# Check for missing values

household_cov.isnull().sum()

Household_ID                  0
State_Name                    0
Health_Ins_Now            30049
Medicaid_Now              30049
Health_Ins_Private_Now    30049
Health_Ins_Public_Now     30049
dtype: int64

In [134]:
household_cov.shape

(89197, 6)

#### 3a) Remove rows where State_Name is unknown as these will not be possible to map**

In [135]:
hh_cov = household_cov.dropna(subset=['State_Name'])

In [136]:
# Check for missing values

hh_cov.isnull().sum()

Household_ID                  0
State_Name                    0
Health_Ins_Now            30049
Medicaid_Now              30049
Health_Ins_Private_Now    30049
Health_Ins_Public_Now     30049
dtype: int64

In [137]:
hh_cov.shape

(89197, 6)

***Leave other null values for now***

#### 3b) Change values in coverage columns from 1-3 to 'All' 'Some' and 'None'

In [138]:
hh_cov['Health_Ins_Now'] = hh_cov['Health_Ins_Now'].astype(object)
hh_cov['Medicaid_Now'] = hh_cov['Medicaid_Now'].astype(object)
hh_cov['Health_Ins_Private_Now'] = hh_cov['Health_Ins_Private_Now'].astype(object)
hh_cov['Health_Ins_Public_Now'] = hh_cov['Health_Ins_Public_Now'].astype(object)


In [139]:
hh_cov.dtypes

Household_ID              float64
State_Name                 object
Health_Ins_Now             object
Medicaid_Now               object
Health_Ins_Private_Now     object
Health_Ins_Public_Now      object
dtype: object

In [140]:
# Replace 1–3 with 'All' 'Some' 'None' 

hh_cov['Health_Ins_Now'] = hh_cov['Health_Ins_Now'].replace({
    1: 'All',
    2: 'Some',
    3: 'None'
})
hh_cov['Medicaid_Now'] = hh_cov['Medicaid_Now'].replace({
    1: 'All',
    2: 'Some',
    3: 'None'
})
hh_cov['Health_Ins_Private_Now'] = hh_cov['Health_Ins_Private_Now'].replace({
    1: 'All',
    2: 'Some',
    3: 'None'
})
hh_cov['Health_Ins_Public_Now'] = hh_cov['Health_Ins_Public_Now'].replace({
    1: 'All',
    2: 'Some',
    3: 'None'
})


In [141]:
hh_cov.head()

Unnamed: 0,Household_ID,State_Name,Health_Ins_Now,Medicaid_Now,Health_Ins_Private_Now,Health_Ins_Public_Now
0,2.02209e+19,Maine,All,,All,
1,7.70732e+18,Maine,,,,
2,1.57712e+19,Maine,,,,
3,1.66073e+19,Maine,,,,
4,2.10006e+19,Maine,All,,All,


#### 3c) Check for duplicates

In [142]:
dups = hh_cov.duplicated()

In [143]:
dups.shape

(89197,)

### 4. Choropleth maps of Household Insurance Coverage across the US

In [144]:
# Create a crosstab of value counts for insurance coverage per household in each state

ct_hh_cov = pd.crosstab(hh_cov['State_Name'], hh_cov['Health_Ins_Now'])
print(ct_hh_cov)

Health_Ins_Now         All  None  Some
State_Name                            
Alabama               1160    48   128
Alaska                 525    46    69
Arizona                822    62    89
Arkansas               971    50   110
California            4633   164   514
Colorado               745    25    72
Connecticut            539    14    37
Delaware               648    30    45
District of Columbia  1089    22    24
Florida               2353   182   359
Georgia               1050    77   152
Hawaii                 894    20    46
Idaho                  833    56    92
Illinois              1428    54   119
Indiana                813    24    67
Iowa                   585    18    33
Kansas                 592    31    58
Kentucky               633    20    54
Louisiana             1250    55   114
Maine                  367     6    34
Maryland               643    15    43
Massachusetts         1211    18    48
Michigan              1227    20    63
Minnesota              71

In [145]:
# Show the totals above as percentages per state

ct_pct = pd.crosstab(hh_cov['State_Name'], hh_cov['Health_Ins_Now'], normalize='index') * 100
ct_pct = ct_pct.round(1)
print(ct_pct)

Health_Ins_Now         All  None  Some
State_Name                            
Alabama               86.8   3.6   9.6
Alaska                82.0   7.2  10.8
Arizona               84.5   6.4   9.1
Arkansas              85.9   4.4   9.7
California            87.2   3.1   9.7
Colorado              88.5   3.0   8.6
Connecticut           91.4   2.4   6.3
Delaware              89.6   4.1   6.2
District of Columbia  95.9   1.9   2.1
Florida               81.3   6.3  12.4
Georgia               82.1   6.0  11.9
Hawaii                93.1   2.1   4.8
Idaho                 84.9   5.7   9.4
Illinois              89.2   3.4   7.4
Indiana               89.9   2.7   7.4
Iowa                  92.0   2.8   5.2
Kansas                86.9   4.6   8.5
Kentucky              89.5   2.8   7.6
Louisiana             88.1   3.9   8.0
Maine                 90.2   1.5   8.4
Maryland              91.7   2.1   6.1
Massachusetts         94.8   1.4   3.8
Michigan              93.7   1.5   4.8
Minnesota             94.

In [146]:
# Create new data frames with snippets of the above crosstabs for exploring in choropleth maps

cov_ALL = ct_hh_cov['All'].reset_index(name='All')
cov_SOME = ct_hh_cov['Some'].reset_index(name='Some')
cov_NONE = ct_hh_cov['None'].reset_index(name='None')
cov_pct_ALL = ct_pct['All'].reset_index(name='All')
cov_pct_SOME = ct_pct['Some'].reset_index(name='Some')
cov_pct_NONE = ct_pct['None'].reset_index(name='None')

### Choropleth Maps: Health Insurance Coverage per Household
#### A: All household members covered

In [152]:
# Count of All household members covered per state
map = folium.Map(location = [37.0902, -95.7129], zoom_start=3)

folium.Choropleth(
    geo_data = country_geo, 
    data = cov_ALL,
    columns = ['State_Name', 'All'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
        legend_name = "Households").add_to(map)
folium.LayerControl().add_to(map)

map

#### B: Some household members covered

In [159]:
# Count of SOME household members covered per state
map = folium.Map(location = [37.0902, -95.7129], zoom_start=3)

folium.Choropleth(
    geo_data = country_geo, 
    data = cov_SOME,
    columns = ['State_Name', 'Some'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
        legend_name = "Households").add_to(map)
folium.LayerControl().add_to(map)

map

#### C: No household members covered

In [163]:
# Count of NONE household members covered per state
map = folium.Map(location = [37.0902, -95.7129], zoom_start=3)

folium.Choropleth(
    geo_data = country_geo, 
    data = cov_NONE,
    columns = ['State_Name', 'None'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
        legend_name = "Households").add_to(map)
folium.LayerControl().add_to(map)

map

**Observations**
The three maps above shows a hint of the disparity of health insurance coverage across the US but it is not clear how much these differences are due to population size or not. There are many states. Neverthless, here are a few observations:

- The number of households with insurance coverage varies across states but it is not clear from the total counts whether this coverage is inline with state population size or not
- It is clear to see that **Texas** has more households with **Some** or **No** members covered than other states
- **California** has most households with **All members covered** (California also has more households than any other state)

<br>
It may be better to map the percentage households with All, Some, or No members covered for each state to see if that reveals anything more insightful

#### D: Percentage of Households with All Members Covered 

In [162]:
# 
map = folium.Map(location = [37.0902, -95.7129], zoom_start=3)

folium.Choropleth(
    geo_data = country_geo, 
    data = cov_pct_ALL,
    columns = ['State_Name', 'All'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
        legend_name = "% Households").add_to(map)
folium.LayerControl().add_to(map)

map

#### E: Percentage of Households with Some Members Covered 

In [157]:
# 
map = folium.Map(location = [37.0902, -95.7129], zoom_start=3)

folium.Choropleth(
    geo_data = country_geo, 
    data = cov_pct_SOME,
    columns = ['State_Name', 'Some'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
        legend_name = "% Households").add_to(map)
folium.LayerControl().add_to(map)

map

#### F: Percentage of Households with No Members Covered 

In [158]:
# 
map = folium.Map(location = [37.0902, -95.7129], zoom_start=3)

folium.Choropleth(
    geo_data = country_geo, 
    data = cov_pct_NONE,
    columns = ['State_Name', 'None'],
    key_on = 'feature.properties.name', # this part is very important - check your json file to see where the KEY is located
    fill_color = 'YlOrBr', fill_opacity=0.6, line_opacity=0.1,
        legend_name = "% Households").add_to(map)
folium.LayerControl().add_to(map)

map

#### Observations
Here we begin to see some differences between the states.
- **Texas** has the lowest percentage of its households with All Members Covered
- **Texas** also has the highest percentage of its households with Some and No Members Covered
- Most states have more than 80% of households with All Members Covered, which means approx. 20% of households accross the US with only Some or Non Members Covered

#### Ideas for further analysis
I'm interested to explore how these maps compare to where people with different income types are situated, e.g. do most people with insurance in Texas have it through their employer or other means? Where are the most Self-employed people located and how are they covered? 