In [58]:
import pandas as pd
import numpy as np
from sodapy import Socrata

#settings
pd.set_option('display.max_columns', 500)


https://dev.socrata.com/foundry/data.cdc.gov/swc5-untb  

This dataset contains model-based county estimates. PLACES covers the entire United States—50 states and the District of Columbia—at county, place, census tract, and ZIP Code Tabulation Area levels. It provides information uniformly on this large scale for local areas at four geographic levels. Estimates were provided by the Centers for Disease Control and Prevention (CDC), Division of Population Health, Epidemiology and Surveillance Branch. PLACES was funded by the Robert Wood Johnson Foundation in conjunction with the CDC Foundation. This dataset includes estimates for 36 measures: 13 for health outcomes, 9 for preventive services use, 4 for chronic disease-related health risk behaviors, 7 for disabilities, and 3 for health status. These estimates can be used to identify emerging health problems and to help develop and carry out effective, targeted public health prevention activities. Because the small area model cannot detect effects due to local interventions, users are cautioned against using these estimates for program or policy evaluations. Data sources used to generate these model-based estimates are Behavioral Risk Factor Surveillance System (BRFSS) 2021 or 2020 data, Census Bureau 2021 or 2020 county population estimate data, and American Community Survey 2017–2021, or 2016–2020 estimates. The 2023 release uses 2021 BRFSS data for 29 measures and 2020 BRFSS data for 7 measures (all teeth lost, dental visits, mammograms, cervical cancer screening, colorectal cancer screening, core preventive services among older adults, and sleeping less than 7 hours) that the survey collects data on every other year. More information about the methodology can be found at www.cdc.gov/places.

In [85]:
client = Socrata("chronicdata.cdc.gov", 'phceef5Bs3FyOOOd82FyjwziG', username='mfarme@outlook.com', password='u7hwEt8ChspWWh')
results = client.get_all("swc5-untb") # PLACES 2023 Model-based county estimates (using BRFSS 2021,2020 data)
df = pd.DataFrame.from_records(results)
df.drop([
    'datasource', 'data_value_unit', 'data_value_type', 'locationid',
    'categoryid', 'geolocation', ':@computed_region_skr5_azej',
    'short_question_text','category'
], axis=1, inplace=True) 

In [86]:
df = df[df['year'] == '2021'] #2021 BRFSS Data Only
df = df.sort_values(by=['locationname'])
df.head(1)

Unnamed: 0,year,stateabbr,statedesc,locationname,measure,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,measureid,datavaluetypeid
163600,2021,SC,South Carolina,Abbeville,Current asthma among adults aged >=18 years,10.3,9.0,11.7,24299,CASTHMA,AgeAdjPrv


In [87]:
df['m'] = df['measureid'].astype(str) + '_' + df['datavaluetypeid'].astype(str)
df.drop(['measureid', 'datavaluetypeid'], axis=1, inplace=True)

In [88]:
df_transformed = pd.get_dummies(df, columns=['m'])

In [89]:
df_transformed.columns

Index(['year', 'stateabbr', 'statedesc', 'locationname', 'measure',
       'data_value', 'low_confidence_limit', 'high_confidence_limit',
       'totalpopulation', 'm_ACCESS2_AgeAdjPrv', 'm_ACCESS2_CrdPrv',
       'm_ARTHRITIS_AgeAdjPrv', 'm_ARTHRITIS_CrdPrv', 'm_BINGE_AgeAdjPrv',
       'm_BINGE_CrdPrv', 'm_BPHIGH_AgeAdjPrv', 'm_BPHIGH_CrdPrv',
       'm_BPMED_AgeAdjPrv', 'm_BPMED_CrdPrv', 'm_CANCER_AgeAdjPrv',
       'm_CANCER_CrdPrv', 'm_CASTHMA_AgeAdjPrv', 'm_CASTHMA_CrdPrv',
       'm_CHD_AgeAdjPrv', 'm_CHD_CrdPrv', 'm_CHECKUP_AgeAdjPrv',
       'm_CHECKUP_CrdPrv', 'm_CHOLSCREEN_AgeAdjPrv', 'm_CHOLSCREEN_CrdPrv',
       'm_COGNITION_AgeAdjPrv', 'm_COGNITION_CrdPrv', 'm_COPD_AgeAdjPrv',
       'm_COPD_CrdPrv', 'm_CSMOKING_AgeAdjPrv', 'm_CSMOKING_CrdPrv',
       'm_DEPRESSION_AgeAdjPrv', 'm_DEPRESSION_CrdPrv', 'm_DIABETES_AgeAdjPrv',
       'm_DIABETES_CrdPrv', 'm_DISABILITY_AgeAdjPrv', 'm_DISABILITY_CrdPrv',
       'm_GHLTH_AgeAdjPrv', 'm_GHLTH_CrdPrv', 'm_HEARING_AgeAdjPrv',
     

In [90]:
measures = [
    'm_ACCESS2_AgeAdjPrv', 'm_ACCESS2_CrdPrv',
    'm_ARTHRITIS_AgeAdjPrv', 'm_ARTHRITIS_CrdPrv', 'm_BINGE_AgeAdjPrv',
    'm_BINGE_CrdPrv', 'm_BPHIGH_AgeAdjPrv', 'm_BPHIGH_CrdPrv',
    'm_BPMED_AgeAdjPrv', 'm_BPMED_CrdPrv', 'm_CANCER_AgeAdjPrv',
    'm_CANCER_CrdPrv', 'm_CASTHMA_AgeAdjPrv', 'm_CASTHMA_CrdPrv',
    'm_CHD_AgeAdjPrv', 'm_CHD_CrdPrv', 'm_CHECKUP_AgeAdjPrv',
    'm_CHECKUP_CrdPrv', 'm_CHOLSCREEN_AgeAdjPrv', 'm_CHOLSCREEN_CrdPrv',
    'm_COGNITION_AgeAdjPrv', 'm_COGNITION_CrdPrv', 'm_COPD_AgeAdjPrv',
    'm_COPD_CrdPrv', 'm_CSMOKING_AgeAdjPrv', 'm_CSMOKING_CrdPrv',
    'm_DEPRESSION_AgeAdjPrv', 'm_DEPRESSION_CrdPrv', 'm_DIABETES_AgeAdjPrv',
    'm_DIABETES_CrdPrv', 'm_DISABILITY_AgeAdjPrv', 'm_DISABILITY_CrdPrv',
    'm_GHLTH_AgeAdjPrv', 'm_GHLTH_CrdPrv', 'm_HEARING_AgeAdjPrv',
    'm_HEARING_CrdPrv', 'm_HIGHCHOL_AgeAdjPrv', 'm_HIGHCHOL_CrdPrv',
    'm_INDEPLIVE_AgeAdjPrv', 'm_INDEPLIVE_CrdPrv', 'm_KIDNEY_AgeAdjPrv',
    'm_KIDNEY_CrdPrv', 'm_LPA_AgeAdjPrv', 'm_LPA_CrdPrv',
    'm_MHLTH_AgeAdjPrv', 'm_MHLTH_CrdPrv', 'm_MOBILITY_AgeAdjPrv',
    'm_MOBILITY_CrdPrv', 'm_OBESITY_AgeAdjPrv', 'm_OBESITY_CrdPrv',
    'm_PHLTH_AgeAdjPrv', 'm_PHLTH_CrdPrv', 'm_SELFCARE_AgeAdjPrv',
    'm_SELFCARE_CrdPrv', 'm_STROKE_AgeAdjPrv', 'm_STROKE_CrdPrv',
    'm_VISION_AgeAdjPrv', 'm_VISION_CrdPrv'
]

In [95]:
for i in measures:
    df_transformed[i] = np.where(df_transformed[i] == 1, df_transformed['data_value'], np.nan)

# Drop the data_value column
df_transformed = df_transformed.drop(columns=['data_value'])

KeyError: 'data_value'

In [103]:
df_transformed.head(1)
#combine low_confidence and high_confidence into one column listed like this [low_confidence, high_confidence]
df_transformed['confidence_interval'] = df_transformed['low_confidence_limit'].astype(str) + ',' + df_transformed['high_confidence_limit'].astype(str)
df_transformed.head(1)

Unnamed: 0,year,stateabbr,statedesc,locationname,measure,low_confidence_limit,high_confidence_limit,totalpopulation,m_ACCESS2_AgeAdjPrv,m_ACCESS2_CrdPrv,m_ARTHRITIS_AgeAdjPrv,m_ARTHRITIS_CrdPrv,m_BINGE_AgeAdjPrv,m_BINGE_CrdPrv,m_BPHIGH_AgeAdjPrv,m_BPHIGH_CrdPrv,m_BPMED_AgeAdjPrv,m_BPMED_CrdPrv,m_CANCER_AgeAdjPrv,m_CANCER_CrdPrv,m_CASTHMA_AgeAdjPrv,m_CASTHMA_CrdPrv,m_CHD_AgeAdjPrv,m_CHD_CrdPrv,m_CHECKUP_AgeAdjPrv,m_CHECKUP_CrdPrv,m_CHOLSCREEN_AgeAdjPrv,m_CHOLSCREEN_CrdPrv,m_COGNITION_AgeAdjPrv,m_COGNITION_CrdPrv,m_COPD_AgeAdjPrv,m_COPD_CrdPrv,m_CSMOKING_AgeAdjPrv,m_CSMOKING_CrdPrv,m_DEPRESSION_AgeAdjPrv,m_DEPRESSION_CrdPrv,m_DIABETES_AgeAdjPrv,m_DIABETES_CrdPrv,m_DISABILITY_AgeAdjPrv,m_DISABILITY_CrdPrv,m_GHLTH_AgeAdjPrv,m_GHLTH_CrdPrv,m_HEARING_AgeAdjPrv,m_HEARING_CrdPrv,m_HIGHCHOL_AgeAdjPrv,m_HIGHCHOL_CrdPrv,m_INDEPLIVE_AgeAdjPrv,m_INDEPLIVE_CrdPrv,m_KIDNEY_AgeAdjPrv,m_KIDNEY_CrdPrv,m_LPA_AgeAdjPrv,m_LPA_CrdPrv,m_MHLTH_AgeAdjPrv,m_MHLTH_CrdPrv,m_MOBILITY_AgeAdjPrv,m_MOBILITY_CrdPrv,m_OBESITY_AgeAdjPrv,m_OBESITY_CrdPrv,m_PHLTH_AgeAdjPrv,m_PHLTH_CrdPrv,m_SELFCARE_AgeAdjPrv,m_SELFCARE_CrdPrv,m_STROKE_AgeAdjPrv,m_STROKE_CrdPrv,m_VISION_AgeAdjPrv,m_VISION_CrdPrv,confidence_interval,m_ACCESS2_AgeAdjPrv_CI
163600,2021,SC,South Carolina,Abbeville,Current asthma among adults aged >=18 years,9.0,11.7,24299,,,,,,,,,,,,,10.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"9.0,11.7",


In [102]:
#using measures list for columns, create new column with same name as measure but with _CI appended on df_transformed
for measure in measures:
    df_transformed[measure + '_CI'] = df_transformed[measure + '_CI'].astype(str)


KeyError: 'm_ACCESS2_CrdPrv_CI'