In [104]:
import os

import numpy as np
import pandas as pd
from sodapy import Socrata

# Notebook display settings: show up to 500 columns when rendering wide DataFrames.
pd.options.display.max_columns = 500


Dataset API documentation (Socrata, dataset id `swc5-untb`): https://dev.socrata.com/foundry/data.cdc.gov/swc5-untb  

Methodology notes (from https://www.cdc.gov/places/methodology/index.html):
The 36 measures include 13 health outcomes, 9 prevention practices, 4 health risk behaviors, 7 disability measures (new for the 2023 release), and 3 health status measures.
The measures include major risk behaviors that lead to illness, suffering, and early death related to chronic diseases and conditions, as well as the conditions and diseases that are the most common, costly, and preventable of all health problems.
Each measure has a comprehensive definition that includes the background, significance, limitations of the indicator, data source, and limitations of the data resources.
Measures complement existing sets of surveillance indicators that report state, metropolitan area, and county data, including County Health Rankings and Chronic Disease Indicators.
The 95% confidence intervals (CIs) of modelled estimates are generated using a Monte Carlo simulation.

This dataset contains model-based county estimates. PLACES covers the entire United States—50 states and the District of Columbia—at county, place, census tract, and ZIP Code Tabulation Area levels. It provides information uniformly on this large scale for local areas at four geographic levels. Estimates were provided by the Centers for Disease Control and Prevention (CDC), Division of Population Health, Epidemiology and Surveillance Branch. PLACES was funded by the Robert Wood Johnson Foundation in conjunction with the CDC Foundation. This dataset includes estimates for 36 measures: 13 for health outcomes, 9 for preventive services use, 4 for chronic disease-related health risk behaviors, 7 for disabilities, and 3 for health status. These estimates can be used to identify emerging health problems and to help develop and carry out effective, targeted public health prevention activities. Because the small area model cannot detect effects due to local interventions, users are cautioned against using these estimates for program or policy evaluations. Data sources used to generate these model-based estimates are Behavioral Risk Factor Surveillance System (BRFSS) 2021 or 2020 data, Census Bureau 2021 or 2020 county population estimate data, and American Community Survey 2017–2021, or 2016–2020 estimates. The 2023 release uses 2021 BRFSS data for 29 measures and 2020 BRFSS data for 7 measures (all teeth lost, dental visits, mammograms, cervical cancer screening, colorectal cancer screening, core preventive services among older adults, and sleeping less than 7 hours) that the survey collects data on every other year. More information about the methodology can be found at www.cdc.gov/places.

In [119]:
# Download the PLACES county-level dataset (Socrata id "swc5-untb") into the
# `results` DataFrame and drop the columns that are not used in this notebook.
#
# SECURITY: the Socrata app token, username and password used to be hardcoded
# in this cell. They are now read from environment variables so they are never
# stored in the notebook; the previously exposed values should be revoked/rotated.
#   SOCRATA_APP_TOKEN, SOCRATA_USERNAME, SOCRATA_PASSWORD
# Unset variables are passed to sodapy as None.
# NOTE(review): sodapy documents the app token and basic-auth credentials as
# optional for public datasets (requests are throttled without a token) --
# confirm for the installed sodapy version.
# NOTE(review): the API docs linked in this notebook reference data.cdc.gov while
# this cell connects to chronicdata.cdc.gov -- confirm both serve the same dataset.
client = Socrata(
    "chronicdata.cdc.gov",
    os.environ.get("SOCRATA_APP_TOKEN"),
    username=os.environ.get("SOCRATA_USERNAME"),
    password=os.environ.get("SOCRATA_PASSWORD"),
)

# PLACES 2023 Model-based county estimates (using BRFSS 2021,2020 data)
# get_all pages through the whole dataset; this is the slow step of the notebook.
results = (
    pd.DataFrame.from_records(client.get_all("swc5-untb"))
    # Non-mutating drop (instead of inplace=True) of the unused columns.
    .drop(columns=[
        'datasource', 'data_value_unit', 'data_value_type', 'locationid',
        'categoryid', 'geolocation', ':@computed_region_skr5_azej',
        'short_question_text', 'category',
    ])
)

In [199]:
# Work on a copy of `results` so the download cell above (about 1.5 minutes) does not
# have to be re-run during development, then keep only rows whose `year` is the
# string '2021' (2021 BRFSS data only).
df = results.copy().loc[lambda frame: frame['year'] == '2021']

# Show which measure ids are present in the 2021 subset.
df['measureid'].unique()

array(['STROKE', 'OBESITY', 'COPD', 'CASTHMA', 'CHD', 'COGNITION', 'LPA',
       'DIABETES', 'BINGE', 'CHECKUP', 'CANCER', 'MHLTH', 'INDEPLIVE',
       'MOBILITY', 'SELFCARE', 'DEPRESSION', 'CSMOKING', 'PHLTH',
       'ACCESS2', 'DISABILITY', 'HEARING', 'CHOLSCREEN', 'BPHIGH',
       'HIGHCHOL', 'KIDNEY', 'GHLTH', 'VISION', 'ARTHRITIS', 'BPMED'],
      dtype=object)

In [200]:
# Show the distinct `measure` description strings present in the 2021 subset.
df['measure'].unique()

array(['Stroke among adults aged >=18 years',
       'Obesity among adults aged >=18 years',
       'Chronic obstructive pulmonary disease among adults aged >=18 years',
       'Current asthma among adults aged >=18 years',
       'Coronary heart disease among adults aged >=18 years',
       'Cognitive disability among adults ages >=18 years',
       'No leisure-time physical activity among adults aged >=18 years',
       'Diagnosed diabetes among adults aged >=18 years',
       'Binge drinking among adults aged >=18 years',
       'Visits to doctor for routine checkup within the past year among adults aged >=18 years',
       'Cancer (excluding skin cancer) among adults aged >=18 years',
       'Mental health not good for >=14 days among adults aged >=18 years',
       'Independent living disability among adults aged >=18 years',
       'Mobility disability among adults aged >=18 years',
       'Self-care disability among adults aged >=18 years',
       'Depression among adults aged >

In [201]:
# Keep only the measures chosen as dependent variables (DVs) for the analysis and
# replace `measureid` / `datavaluetypeid` with one combined key column
# `m` = "<measureid>_<datavaluetypeid>".
#
# Written as a single non-mutating chain: the original assigned a new column on a
# boolean-filtered slice and then dropped columns in place, which is the pattern
# pandas guards with SettingWithCopyWarning. The resulting frame has the same
# rows, columns and column order as before.
df = (
    df
    .loc[df['measureid'].isin([
        'DIABETES', 'GHLTH', 'CSMOKING', 'OBESITY',
        'CHECKUP', 'DEPRESSION', 'ACCESS2', 'MHLTH',
    ])]  # Choice of DV for analysis
    .assign(
        m=lambda kept: kept['measureid'].astype(str) + '_' + kept['datavaluetypeid'].astype(str)
    )
    .drop(columns=['measureid', 'datavaluetypeid'])
)

In [202]:
# One-hot encode the combined key column `m`: each distinct value becomes its own
# indicator column named 'm_<value>'. The prefix and separator are spelled out
# explicitly; they match what get_dummies derives from the column name by default.
df_transformed = pd.get_dummies(df, columns=['m'], prefix='m', prefix_sep='_')

In [203]:
# Inspect the column names after one-hot encoding to see which m_* indicator columns were created.
df_transformed.columns

Index(['year', 'stateabbr', 'statedesc', 'locationname', 'measure',
       'data_value', 'low_confidence_limit', 'high_confidence_limit',
       'totalpopulation', 'm_ACCESS2_AgeAdjPrv', 'm_ACCESS2_CrdPrv',
       'm_CHECKUP_AgeAdjPrv', 'm_CHECKUP_CrdPrv', 'm_CSMOKING_AgeAdjPrv',
       'm_CSMOKING_CrdPrv', 'm_DEPRESSION_AgeAdjPrv', 'm_DEPRESSION_CrdPrv',
       'm_DIABETES_AgeAdjPrv', 'm_DIABETES_CrdPrv', 'm_GHLTH_AgeAdjPrv',
       'm_GHLTH_CrdPrv', 'm_MHLTH_AgeAdjPrv', 'm_MHLTH_CrdPrv',
       'm_OBESITY_AgeAdjPrv', 'm_OBESITY_CrdPrv'],
      dtype='object')

In [204]:
# Names of the one-hot indicator columns to be filled with values, in the form
# 'm_<measure id>_<value type>': every selected measure paired with both value
# types, ordered by measure id and then value type.
measures = [
    f'm_{measure_id}_{value_type}'
    for measure_id in (
        'ACCESS2', 'CHECKUP', 'CSMOKING', 'DEPRESSION',
        'DIABETES', 'GHLTH', 'MHLTH', 'OBESITY',
    )
    for value_type in ('AgeAdjPrv', 'CrdPrv')
]

In [205]:
# Turn each indicator column into a value column: where the indicator equals 1 the
# cell takes that row's `data_value`, everywhere else it becomes NaN. Once the
# values have been spread across the measure columns, `data_value` is dropped.
# NOTE(review): if `data_value` is still stored as strings at this point, the
# measure columns will hold strings too -- confirm the dtype before numeric analysis.
df_transformed = (
    df_transformed
    .assign(**{
        column: df_transformed['data_value'].where(df_transformed[column] == 1)
        for column in measures
    })
    .drop(columns=['data_value'])
)

In [206]:
# Preview the first 10 rows of the transformed frame.
df_transformed.head(10)

Unnamed: 0,year,stateabbr,statedesc,locationname,measure,low_confidence_limit,high_confidence_limit,totalpopulation,m_ACCESS2_AgeAdjPrv,m_ACCESS2_CrdPrv,m_CHECKUP_AgeAdjPrv,m_CHECKUP_CrdPrv,m_CSMOKING_AgeAdjPrv,m_CSMOKING_CrdPrv,m_DEPRESSION_AgeAdjPrv,m_DEPRESSION_CrdPrv,m_DIABETES_AgeAdjPrv,m_DIABETES_CrdPrv,m_GHLTH_AgeAdjPrv,m_GHLTH_CrdPrv,m_MHLTH_AgeAdjPrv,m_MHLTH_CrdPrv,m_OBESITY_AgeAdjPrv,m_OBESITY_CrdPrv
5,2021,TX,Texas,Deaf Smith,Obesity among adults aged >=18 years,33.6,51.0,18329,,,,,,,,,,,,,,,,42.0
7,2021,AL,Alabama,Dallas,Obesity among adults aged >=18 years,36.8,52.2,37619,,,,,,,,,,,,,,,,44.3
8,2021,AK,Alaska,Petersburg,Obesity among adults aged >=18 years,24.4,38.2,3356,,,,,,,,,,,,,,,,31.0
14,2021,CA,California,Alameda,Diagnosed diabetes among adults aged >=18 years,8.6,11.8,1648556,,,,,,,,,,10.1,,,,,,
15,2021,CA,California,San Benito,Diagnosed diabetes among adults aged >=18 years,9.4,12.9,66677,,,,,,,,,,11.0,,,,,,
19,2021,AR,Arkansas,Jackson,Visits to doctor for routine checkup within th...,72.9,81.8,16811,,,,77.6,,,,,,,,,,,,
21,2021,CA,California,Monterey,Diagnosed diabetes among adults aged >=18 years,10.3,13.9,437325,,,,,,,,,,12.0,,,,,,
22,2021,AK,Alaska,Bristol Bay,Mental health not good for >=14 days among adu...,13.4,17.6,838,,,,,,,,,,,,,15.4,,,
23,2021,AK,Alaska,Haines,Diagnosed diabetes among adults aged >=18 years,10.1,14.2,2071,,,,,,,,,,12.1,,,,,,
33,2021,AL,Alabama,Lawrence,Obesity among adults aged >=18 years,33.3,49.6,33090,,,,,,,,,,,,,,,41.3,
