<a href="https://colab.research.google.com/github/oimartin/Older-and-Wiser/blob/geriatric_visits/geriatric_visits_nb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Data
Geriatric visits data: https://data.cms.gov/provider-data/dataset/bce0-b5db
<br>
Zip code/ State data: https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2019-zip-code-data-soi

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import seaborn as sns

In [3]:
visits = pd.read_csv("https://raw.githubusercontent.com/oimartin/Older-and-Wiser/geriatric_visits/data/Geriatric_Medicine.csv")
zip_codes = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/geriatric_visits/data/all_states_only_zip_codes.csv')

# EDA

## Geriatric Visits

In [4]:
# First look at data
display(visits.info())
print(" ")
print(" ")

display(visits.describe())
print(" ")
print(" ")

# Only two codes used, new and existing patients
print("most_utilized_procedure_code_for_new_patient")
display(visits.iloc[:, 7].value_counts())
print("most_utilized_procedure_code_for_established_patient")
display(visits.iloc[:, 14].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42959 entries, 0 to 42958
Data columns (total 15 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   zip_code                                              42959 non-null  int64  
 1   min_medicare_pricing_for_new_patient                  42959 non-null  float64
 2   max_medicare_pricing_for_new_patient                  42959 non-null  float64
 3   mode_medicare_pricing_for_new_patient                 42959 non-null  float64
 4   min_copay_for_new_patient                             42959 non-null  float64
 5   max_copay_for_new_patient                             42959 non-null  float64
 6   mode_copay_for_new_patient                            42959 non-null  float64
 7   most_utilized_procedure_code_for_new_patient          42959 non-null  int64  
 8   min_medicare_pricing_for_established_patient          42

None

 
 


Unnamed: 0,zip_code,min_medicare_pricing_for_new_patient,max_medicare_pricing_for_new_patient,mode_medicare_pricing_for_new_patient,min_copay_for_new_patient,max_copay_for_new_patient,mode_copay_for_new_patient,most_utilized_procedure_code_for_new_patient,min_medicare_pricing_for_established_patient,max_medicare_pricing_for_established_patient,mode_medicare_pricing_for_established_patient,min_copay_for_established_patient,max_copay_for_established_patient,mode_copay_for_established_patient,most_utilized_procedure_code_for_established_patient
count,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0,42959.0
mean,49591.379245,58.238339,177.239252,177.239252,14.559585,44.309813,44.309813,99205.0,18.073701,144.793623,103.643492,4.518425,36.198406,25.910873,99214.0
std,27974.499582,4.04555,11.124559,11.124559,1.011387,2.78114,2.78114,0.0,1.459078,9.072558,6.628021,0.36477,2.268139,1.657005,0.0
min,210.0,53.136,163.672,163.672,13.284,40.918,40.918,99205.0,16.256,133.848,95.608,4.064,33.462,23.902,99214.0
25%,26037.5,55.312,169.736,169.736,13.828,42.434,42.434,99205.0,17.04,138.8,98.992,4.26,34.7,24.748,99214.0
50%,48815.0,56.936,174.056,174.056,14.234,43.514,43.514,99205.0,17.616,142.08,101.496,4.404,35.52,25.374,99214.0
75%,73042.5,59.432,180.408,180.408,14.858,45.102,45.102,99205.0,18.544,147.312,105.648,4.636,36.828,26.412,99214.0
max,99950.0,74.816,233.632,233.632,18.704,58.408,58.408,99205.0,23.768,190.744,135.848,5.942,47.686,33.962,99214.0


 
 
most_utilized_procedure_code_for_new_patient


99205    42959
Name: most_utilized_procedure_code_for_new_patient, dtype: int64

most_utilized_procedure_code_for_established_patient


99214    42959
Name: most_utilized_procedure_code_for_established_patient, dtype: int64

In [5]:
visits.nunique()

zip_code                                                42959
min_medicare_pricing_for_new_patient                       95
max_medicare_pricing_for_new_patient                       96
mode_medicare_pricing_for_new_patient                      96
min_copay_for_new_patient                                  95
max_copay_for_new_patient                                  96
mode_copay_for_new_patient                                 96
most_utilized_procedure_code_for_new_patient                1
min_medicare_pricing_for_established_patient               90
max_medicare_pricing_for_established_patient               96
mode_medicare_pricing_for_established_patient              96
min_copay_for_established_patient                          90
max_copay_for_established_patient                          96
mode_copay_for_established_patient                         96
most_utilized_procedure_code_for_established_patient        1
dtype: int64

<h1> Code Definitions </h1>
<b>99214</b>: Established patient office visit, 30-39 minutes
<br>
<b>99205</b>: Typically 60 minutes, Comprehensive history, Comprehensive Examination, High complexity medical decision making

## Zip Codes

In [6]:
# Observe zip code data from 2019 IRS
display(zip_codes.info())
print(" ")
print(" ")
display(zip_codes.nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27695 entries, 0 to 27694
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   STATEFIPS  27695 non-null  int64 
 1   STATE      27695 non-null  object
 2   ZIPCODE    27695 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 649.2+ KB


None

 
 


STATEFIPS       51
STATE           51
ZIPCODE      27595
dtype: int64

In [7]:
# Select for only zipcodes, not overall state data
clean_zip = zip_codes[(zip_codes['ZIPCODE'] !=0) &
                      (zip_codes['ZIPCODE'] !=99999)]
display(clean_zip.info())
print(" ")
print(" ")
display(clean_zip.nunique())
display(clean_zip.describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27593 entries, 1 to 27693
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   STATEFIPS  27593 non-null  int64 
 1   STATE      27593 non-null  object
 2   ZIPCODE    27593 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 862.3+ KB


None

 
 


STATEFIPS       51
STATE           51
ZIPCODE      27593
dtype: int64

Unnamed: 0,STATEFIPS,ZIPCODE
count,27593.0,27593.0
mean,29.669663,48856.241474
std,15.119769,27047.660765
min,1.0,1001.0
25%,18.0,27105.0
50%,29.0,48845.0
75%,42.0,70578.0
max,56.0,99901.0


## Add States and Regions to Visits

In [8]:
# functions to add states to visit df

# Determine start and end zip codes by state
def find_start_end_zip(current_state):
  zipcodes_state = clean_zip[clean_zip['STATE'] == state]['ZIPCODE']
  min_zip = zipcodes_state.min()
  max_zip = zipcodes_state.max()

  return min_zip, max_zip, current_state

# Add appropriate state to visit df by index position of zip code
def add_state(min_zip, max_zip, current_state):
  state_zip_index = visits[(visits['zip_code'] >= min_zip) & (visits['zip_code'] <= max_zip)].index
  visits.iloc[state_zip_index, -1] = current_state
  print(f'{current_state} min zip code = {min_zip}, \n max zip code = {max_zip}')

  return 

In [9]:
# list of all state abbreviations
all_states_dc = list(clean_zip['STATE'].unique())

# create new column for states
visits['State'] = ''

# add states to visit data and print zip code boundaries by state
for state in all_states_dc:
  state_min, state_max, current_state = find_start_end_zip(state)
  add_state(state_min, state_max, current_state)


AL min zip code = 35004, 
 max zip code = 36925
AK min zip code = 99501, 
 max zip code = 99901
AZ min zip code = 85003, 
 max zip code = 86547
AR min zip code = 71601, 
 max zip code = 72959
CA min zip code = 90001, 
 max zip code = 96161
CO min zip code = 80002, 
 max zip code = 81657
CT min zip code = 6001, 
 max zip code = 6907
DE min zip code = 19701, 
 max zip code = 19979
DC min zip code = 20001, 
 max zip code = 20037
FL min zip code = 32003, 
 max zip code = 34997
GA min zip code = 30002, 
 max zip code = 39897
HI min zip code = 96701, 
 max zip code = 96826
ID min zip code = 83201, 
 max zip code = 83876
IL min zip code = 60002, 
 max zip code = 62999
IN min zip code = 46001, 
 max zip code = 47995
IA min zip code = 50001, 
 max zip code = 52807
KS min zip code = 66002, 
 max zip code = 67954
KY min zip code = 40003, 
 max zip code = 42788
LA min zip code = 70001, 
 max zip code = 71486
ME min zip code = 3901, 
 max zip code = 4989
MD min zip code = 20601, 
 max zip code = 21

In [10]:
# Observe number of zipcodes labeled by state
visits.State.value_counts()

GA    3402
CA    2725
TX    2604
NY    2242
PA    2242
VA    2107
IL    1629
OH    1483
MO    1192
MI    1183
NC    1102
IA    1078
MN    1044
KY    1030
IN     999
WV     935
WI     918
TN     813
OK     793
KS     775
MA     748
WA     744
NJ     742
LA     737
AR     728
CO     675
NE     630
AZ     565
       559
SC     544
MS     541
ME     504
OR     498
NM     435
CT     434
SD     417
ND     416
MT     411
UT     359
VT     310
WY     301
NH     284
AK     264
NV     255
ID     230
HI     115
DE      98
RI      88
DC      31
Name: State, dtype: int64

<h1>Zip Codes by State</h1>
The zip code data used is from the 2019 Individual income tax statistics by zip code. There are 559 zip codes that were not assigned a state.
2019 Individual income tax statistics by zip code. There are 559 zip codes that were not assigned a state.

## Adjust Zip_code in Visits

In [11]:
visits

Unnamed: 0,zip_code,min_medicare_pricing_for_new_patient,max_medicare_pricing_for_new_patient,mode_medicare_pricing_for_new_patient,min_copay_for_new_patient,max_copay_for_new_patient,mode_copay_for_new_patient,most_utilized_procedure_code_for_new_patient,min_medicare_pricing_for_established_patient,max_medicare_pricing_for_established_patient,mode_medicare_pricing_for_established_patient,min_copay_for_established_patient,max_copay_for_established_patient,mode_copay_for_established_patient,most_utilized_procedure_code_for_established_patient,State
0,210,60.136,181.552,181.552,15.034,45.388,45.388,99205,18.896,148.440,106.440,4.724,37.110,26.610,99214,
1,211,60.136,181.552,181.552,15.034,45.388,45.388,99205,18.896,148.440,106.440,4.724,37.110,26.610,99214,
2,212,60.136,181.552,181.552,15.034,45.388,45.388,99205,18.896,148.440,106.440,4.724,37.110,26.610,99214,
3,213,60.136,181.552,181.552,15.034,45.388,45.388,99205,18.896,148.440,106.440,4.724,37.110,26.610,99214,
4,214,60.136,181.552,181.552,15.034,45.388,45.388,99205,18.896,148.440,106.440,4.724,37.110,26.610,99214,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42954,99926,74.816,233.632,233.632,18.704,58.408,58.408,99205,22.376,190.744,135.848,5.594,47.686,33.962,99214,
42955,99927,74.816,233.632,233.632,18.704,58.408,58.408,99205,22.376,190.744,135.848,5.594,47.686,33.962,99214,
42956,99928,74.816,233.632,233.632,18.704,58.408,58.408,99205,22.376,190.744,135.848,5.594,47.686,33.962,99214,
42957,99929,74.816,233.632,233.632,18.704,58.408,58.408,99205,22.376,190.744,135.848,5.594,47.686,33.962,99214,


In [12]:
visits.iloc[:, 0] = visits['zip_code'].astype('string')
visits.iloc[:, 7] = visits.iloc[:, 7].astype('string')
visits.iloc[:, 14] = visits.iloc[:, 14].astype('string')

In [13]:
visits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42959 entries, 0 to 42958
Data columns (total 16 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   zip_code                                              42959 non-null  string 
 1   min_medicare_pricing_for_new_patient                  42959 non-null  float64
 2   max_medicare_pricing_for_new_patient                  42959 non-null  float64
 3   mode_medicare_pricing_for_new_patient                 42959 non-null  float64
 4   min_copay_for_new_patient                             42959 non-null  float64
 5   max_copay_for_new_patient                             42959 non-null  float64
 6   mode_copay_for_new_patient                            42959 non-null  float64
 7   most_utilized_procedure_code_for_new_patient          42959 non-null  string 
 8   min_medicare_pricing_for_established_patient          42

## Add Regions

In [14]:
# Create regions
visits['Region'] = visits['State']
visits['Region'].replace(['CT','ME','MA','NH','RI', 'VT'],
                         'New England',inplace=True)

visits['Region'].replace(['NJ','NY','PA'],
                         'Middle Atlantic',inplace=True)

visits['Region'].replace(['IN','IL', 'MI','OH', 'WI'],
                         'East North Central',inplace=True)

visits['Region'].replace(['IA', 'NE', 'KS', 'ND','MN', 'SD', 'MO'],
                          'West North Central',inplace=True)

visits['Region'].replace(['DE', 'DC', 'FL', 'GA', 'MD', 'NC','SC', 'VA', 'WV'],
                          'South Atlantic',inplace=True)

visits['Region'].replace(['AL', 'LA', 'OK','TX'],
                          'West South Central',inplace=True)

visits['Region'].replace(['AR', 'KY', 'MS','TN'],
                          'East South Central',inplace=True)

visits['Region'].replace(['AZ', 'CO', 'ID','NM','MT', 'UT', 'NV', 'WY'],
                          'Mountain',inplace=True)

visits['Region'].replace(['AK', 'CA', 'HI', 'OR','WA'],
                          'Pacific',inplace=True)

# Graphs

## Treemap - Region

In [15]:
MA = visits[(visits['Region'] == 'Middle Atlantic')]

fig = px.treemap(MA,
                 path=[px.Constant("Middle Atlantic"), 'State', 'zip_code'],
                  color='mode_medicare_pricing_for_established_patient',
                  color_continuous_scale='RdBu',
                 color_continuous_midpoint=np.average(MA['mode_medicare_pricing_for_established_patient']),
                 width=2000, height=1000)

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

### Other Graphs

In [57]:
SA = visits[(visits['Region'] == 'South Atlantic')]

fig = px.treemap(SA,
                 path=[px.Constant("South Atlantic"), 'State', 'zip_code'],
                  color='mode_copay_for_established_patient',
                  color_continuous_scale='RdBu',
                 color_continuous_midpoint=np.average(SA['mode_copay_for_established_patient']),
                 width=1800, height=900)

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [59]:
SA = visits[(visits['Region'] == 'South Atlantic')]

fig = px.treemap(SA,
                 path=[px.Constant("South Atlantic"), 'State', 'zip_code'],
                  color='mode_medicare_pricing_for_established_patient',
                  color_continuous_scale='RdBu',
                 color_continuous_midpoint=np.average(SA['mode_medicare_pricing_for_established_patient']),
                 width=2000, height=1000)

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [53]:
SA = visits[(visits['Region'] == 'South Atlantic')]

fig = px.treemap(SA,
                 path=[px.Constant("South Atlantic"), 'State', 'zip_code'],
                  color='min_medicare_pricing_for_new_patient',
                  color_continuous_scale='RdBu',
                 color_continuous_midpoint=np.average(SA['min_medicare_pricing_for_new_patient']))

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [34]:
fig = px.treemap(visits[(visits['Region'] == 'Middle Atlantic')],
                 path=[px.Constant("Region"), 'State', 'zip_code'],
                 values='min_medicare_pricing_for_new_patient',
                  color='min_medicare_pricing_for_new_patient',
                  color_continuous_scale='RdBu')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [43]:
fig = px.treemap(visits[(visits['Region'] == 'Mountain')],
                 path=[px.Constant("Region - Mountain"), 'State', 'zip_code'],
                  color='min_medicare_pricing_for_new_patient',
                  color_continuous_scale='RdBu',
                 color_continuous_midpoint=np.average(visits['min_medicare_pricing_for_new_patient']))
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [45]:
fig = px.treemap(visits[(visits['Region'] == 'New England')],
                 path=[px.Constant("Region"), 'State', 'zip_code'],
                 values='min_medicare_pricing_for_new_patient',
                  color='min_medicare_pricing_for_new_patient',
                  color_continuous_scale='RdBu',
                 color_continuous_midpoint=np.average(visits['min_medicare_pricing_for_new_patient']))
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [50]:
NE_Mt = visits[(visits['Region'] == 'New England') |
                        (visits['Region'] == 'Mountain')]

fig = px.treemap(NE_Mt,
                 path=['Region', 'State', 'zip_code'],
                 values='min_medicare_pricing_for_new_patient',
                  color='min_medicare_pricing_for_new_patient',
                  color_continuous_scale='RdBu',
                 color_continuous_midpoint=np.average(NE_Mt['min_medicare_pricing_for_new_patient']))
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

#### Treemap - Region, no value for color

In [35]:
fig = px.treemap(visits[(visits['Region'] == 'South Atlantic')],
                 path=[px.Constant("South Atlantic"), 'State', 'zip_code'],
                 values='min_medicare_pricing_for_new_patient')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [36]:
fig = px.treemap(visits[(visits['Region'] == 'Middle Atlantic')],
                 path=[px.Constant("Region"), 'State', 'zip_code'],
                 values='min_medicare_pricing_for_new_patient')
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()