In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

pd.set_option('display.max_columns', None)

In [2]:
brfss = pd.read_csv("data/BRFSS2023.csv", index_col=0)

In [3]:
brfss

Unnamed: 0,_STATE,SEXVAR,_AGEG5YR,EDUCA,INCOME3,_RACE,BPHIGH6,DIABETE4,CVDINFR4,CVDCRHD4,CVDSTRK3,SMOKE100,SMOKDAY2,EXERANY2,_BMI5,_BMI5CAT,_RFBMI5,ALCDAY4,GENHLTH,PHYSHLTH,MENTHLTH,CHOLCHK3,CHECKUP1
0,1,2,13,5.0,99.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,,2.0,3047.0,4.0,2,888.0,2.0,88.0,88.0,3.0,2.0
1,1,2,13,5.0,99.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,,1.0,2856.0,3.0,2,888.0,2.0,88.0,88.0,2.0,2.0
2,1,2,13,4.0,2.0,2.0,1.0,3.0,2.0,2.0,2.0,1.0,3.0,1.0,2231.0,2.0,1,888.0,4.0,6.0,2.0,2.0,1.0
3,1,2,12,5.0,99.0,1.0,3.0,3.0,2.0,2.0,2.0,2.0,,1.0,2744.0,3.0,2,888.0,2.0,2.0,88.0,3.0,3.0
4,1,2,12,5.0,7.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,,1.0,2585.0,3.0,2,202.0,4.0,88.0,88.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433318,78,1,10,5.0,5.0,2.0,1.0,3.0,2.0,2.0,7.0,2.0,,1.0,2921.0,3.0,2,105.0,3.0,12.0,30.0,2.0,1.0
433319,78,2,3,6.0,6.0,2.0,3.0,3.0,2.0,2.0,2.0,2.0,,2.0,2496.0,2.0,1,888.0,2.0,88.0,88.0,2.0,1.0
433320,78,2,7,6.0,10.0,8.0,3.0,3.0,2.0,2.0,2.0,2.0,,1.0,3438.0,4.0,2,201.0,2.0,10.0,88.0,2.0,1.0
433321,78,2,10,6.0,3.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,,1.0,2386.0,2.0,1,888.0,3.0,88.0,88.0,2.0,1.0


Labels dictionaries for basic value-to-label conversions

In [4]:
state_labels = {
         1: "Alabama", 2: "Alaska", 4: "Arizona", 5: "Arkansas", 6: "California", 8: "Colorado",
         9: "Connecticut", 10: "Delaware", 11: "District of Columbia", 12: "Florida", 13: "Georgia",
         15: "Hawaii", 16: "Idaho", 17: "Illinois", 18: "Indiana", 19: "Iowa", 20: "Kansas",
         22: "Louisiana", 23: "Maine", 24: "Maryland", 25: "Massachusetts", 26: "Michigan",
         27: "Minnesota", 28: "Mississippi", 29: "Missouri", 30: "Montana", 31: "Nebraska",
         32: "Nevada", 33: "New Hampshire", 34: "New Jersey", 35: "New Mexico", 36: "New York",
         37: "North Carolina", 38: "North Dakota", 39: "Ohio", 40: "Oklahoma", 41: "Oregon",
         44: "Rhode Island", 45: "South Carolina", 46: "South Dakota", 47: "Tennessee", 48: "Texas",
         49: "Utah", 50: "Vermont", 51: "Virginia", 53: "Washington", 54: "West Virginia",
         55: "Wisconsin", 56: "Wyoming", 66: "Guam", 72: "Puerto Rico", 78: "Virgin Islands"
    }

sex_labels = {1: 'Male', 2: 'Female'}

age_labels = {
    1: "18-24", 2: "25-29", 3: "30-34", 4: "35-39",
    5: "40-44", 6: "45-49", 7: "50-54", 8: "55-59",
    9: "60-64", 10: "65-69", 11: "70-74", 12: "75-79", 13: "80+",
    np.nan: None, 14: None
}

edulevel_labels = {
    1: "Never attended school",
    2: "Elementary",
    3: "Some high school",
    4: "High school graduate",
    5: "Some college or technical school",
    6: "College graduate or higher",
    9: "Refused",
    pd.NA: None
}

income_labels = {
    1: "Less than $10,000",
    2: "$10,000 to <$15,000",
    3: "$15,000 to <$20,000",
    4: "$20,000 to <$25,000",
    5: "$25,000 to <$35,000",
    6: "$35,000 to <$50,000",
    7: "$50,000 to <$75,000",
    8: "$75,000 to < $100,000",
    9: "$100,000 to < $150,000",
    10: "$150,000 to < $200,000", 
    11: "$200,000 or more",
    77: None,
    99: None
}

race_labels = {
    1: "White only (non-Hispanic)",
    2: "Black only (non-Hispanic)",
    3: "American Indian/Alaska Native (non-Hispanic)",
    4: "Asian (non-Hispanic)",
    5: "Native Hawaiian or other Pacific Islander only (non-Hispanic)",
    6: "Other race (non-Hispanic)",
    7: "Multiracial (non-Hispanic)",
    8: "Hispanic",
    9: None
}

blood_pressure_labels = {
    1: "Yes",
    2: "Yes, but only during pregnancy",
    3: "No",
    4: "Borderline high",
    7: None,
    9: None
}

diabetes_labels = {
    1: "Yes",
    2: "Yes, but only during pregnancy",
    3: "No",
    4: "Pre-diabetes or borderline diabetes",
    7: None,
    9: None
}

smokday2_labels = {
    1: "Every day",
    2: "Some days",
    3: "Not at all",
    7: None,
    9: None 
}

bmi_labels = {
    1: "Underweight",
    2: "Normal weight",
    3: "Overweight",
    4: "Obese"
}

bmi_binary = {
    1: "No",
    2: "Yes",
    9: None
}

genhealth_labels = {
    1: "Excellent",
    2: "Very good",
    3: "Good",
    4: "Fair",
    5: "Poor",
    7: None,
    9: None
}

cholchk3_labels = {
    1: "Never",
    2: "Within the past year",
    3: "Within the past 2 years",
    4: "Within the past 3 years",
    5: "Within the past 4 years",
    6: "Within the past 5 years",
    7: None,
    8: "5 or more years ago",
    9: None
}

checkup1_labels = {
    1: "Within past year",
    2: "Within past 2 years",
    3: "Within past 5 years",
    4: "5 or more years ago",
    7: None,
    8: "Never",
    9: None
}

#Simple binary that is used in at least two questions
simple_binary = {
    1: "Yes",
    2: "No",
    7: None, 
    9: None
}

Functions for numerical conversions

In [5]:
def process_alcday4(value):
    '''
    The survey data has values starting with 1 for weekly data
    (e.g. 102 means 2 days per week), and values starting with 2 for
    monthly data (e.g. 215 means 15 days per month). To standardize, this
    function will ignore leading 1's, 2's and multiply the weekly data by 4
    to approximate the monthly data. 

    Survey data also contains a value of 888 for "no drinks in past 30 days" 
    which will be converted to 0s.
    '''

    try:
        value_int = int(float(value))
        value_str = str(value_int)

        #If value starts with 1 (weekly data)
        if value_str.startswith(str(1)):
            days_per_week = int(value_str[2]) #Select last digit of survey value
            return days_per_week * 4 
        elif value_str.startswith(str(2)):
            days_per_month = int(value_str[1:]) #Ignore first digit (2) and keep days
            return days_per_month 
        elif value == 888:
            return 0 #No drinks at all
        else: 
            return None #treat 777 (Don't know) and 999 (Refused) as NaN values

    except:
        return None #Values of NaN keep as NaN
    

def removes_missing(value):
    '''
    Converts missing values to None and keeps other
    values the same. Useful for PHYSHLTH and MENTHLTH variables
    that are simply raw counts of days. 
    '''
    if value == 88:
        return None
    elif value == 77:
        return None
    elif value == 99:
        return None
    else:
        return value
    

Conversions 

In [6]:
brfss["_STATE"] = brfss["_STATE"].replace(state_labels)
brfss['SEXVAR'] = brfss['SEXVAR'].replace(sex_labels)
brfss["_AGEG5YR"] = brfss["_AGEG5YR"].replace(age_labels)
brfss["EDUCA"] = brfss["EDUCA"].replace(edulevel_labels)
brfss["INCOME3"] = brfss["INCOME3"].replace(income_labels)
brfss["_RACE"] = brfss["_RACE"].replace(race_labels)
brfss["BPHIGH6"] = brfss["BPHIGH6"].replace(blood_pressure_labels)
brfss["DIABETE4"] = brfss["DIABETE4"].replace(diabetes_labels)
brfss["CVDINFR4"] = brfss["CVDINFR4"].replace(simple_binary)
brfss["CVDCRHD4"] = brfss["CVDCRHD4"].replace(simple_binary)
brfss["CVDSTRK3"] = brfss["CVDSTRK3"].replace(simple_binary)
brfss["SMOKE100"] = brfss["SMOKE100"].replace(simple_binary)
brfss["SMOKDAY2"] = brfss["SMOKDAY2"].replace(smokday2_labels)
brfss["EXERANY2"] = brfss["EXERANY2"].replace(simple_binary)
brfss['_BMI5'] = brfss['_BMI5'] / 100   #Source data has 2 implied decimal places
brfss["_BMI5CAT"] = brfss["_BMI5CAT"].replace(bmi_labels)
brfss["_RFBMI5"] = brfss["_RFBMI5"].replace(bmi_binary)
brfss["ALCDAY4"] = brfss["ALCDAY4"].apply(process_alcday4)
brfss["GENHLTH"] = brfss["GENHLTH"].replace(genhealth_labels)
brfss["PHYSHLTH"] = brfss["PHYSHLTH"].apply(removes_missing)
brfss["MENTHLTH"] = brfss["MENTHLTH"].apply(removes_missing)
brfss["CHOLCHK3"] = brfss["CHOLCHK3"].replace(cholchk3_labels)
brfss["CHECKUP1"] = brfss["CHECKUP1"].replace(checkup1_labels)



In [7]:
brfss

Unnamed: 0,_STATE,SEXVAR,_AGEG5YR,EDUCA,INCOME3,_RACE,BPHIGH6,DIABETE4,CVDINFR4,CVDCRHD4,CVDSTRK3,SMOKE100,SMOKDAY2,EXERANY2,_BMI5,_BMI5CAT,_RFBMI5,ALCDAY4,GENHLTH,PHYSHLTH,MENTHLTH,CHOLCHK3,CHECKUP1
0,Alabama,Female,80+,Some college or technical school,,White only (non-Hispanic),Yes,Yes,No,No,No,No,,No,30.47,Obese,Yes,0.0,Very good,,,Within the past 2 years,Within past 2 years
1,Alabama,Female,80+,Some college or technical school,,White only (non-Hispanic),Yes,No,No,No,No,No,,Yes,28.56,Overweight,Yes,0.0,Very good,,,Within the past year,Within past 2 years
2,Alabama,Female,80+,High school graduate,"$10,000 to <$15,000",Black only (non-Hispanic),Yes,No,No,No,No,Yes,Not at all,Yes,22.31,Normal weight,No,0.0,Fair,6.0,2.0,Within the past year,Within past year
3,Alabama,Female,75-79,Some college or technical school,,White only (non-Hispanic),No,No,No,No,No,No,,Yes,27.44,Overweight,Yes,0.0,Very good,2.0,,Within the past 2 years,Within past 5 years
4,Alabama,Female,75-79,Some college or technical school,"$50,000 to <$75,000",White only (non-Hispanic),Yes,Yes,No,No,No,No,,Yes,25.85,Overweight,Yes,2.0,Fair,,,Within the past year,Within past year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433318,Virgin Islands,Male,65-69,Some college or technical school,"$25,000 to <$35,000",Black only (non-Hispanic),Yes,No,No,No,,No,,Yes,29.21,Overweight,Yes,20.0,Good,12.0,30.0,Within the past year,Within past year
433319,Virgin Islands,Female,30-34,College graduate or higher,"$35,000 to <$50,000",Black only (non-Hispanic),No,No,No,No,No,No,,No,24.96,Normal weight,No,0.0,Very good,,,Within the past year,Within past year
433320,Virgin Islands,Female,50-54,College graduate or higher,"$150,000 to < $200,000",Hispanic,No,No,No,No,No,No,,Yes,34.38,Obese,Yes,1.0,Very good,10.0,,Within the past year,Within past year
433321,Virgin Islands,Female,65-69,College graduate or higher,"$15,000 to <$20,000",Black only (non-Hispanic),Yes,Yes,No,No,No,No,,Yes,23.86,Normal weight,No,0.0,Good,,,Within the past year,Within past year


Change columns names

In [8]:
brfss.rename(columns={
    "_STATE": "state",
    "SEXVAR": "sex",
    "_AGEG5YR": "age",
    "EDUCA": "education",
    "INCOME3": "income",
    "_RACE": "race",
    "BPHIGH6": "high_blood_pressure",
    "DIABETE4": "diabetes",
    "CVDINFR4": "heart_attack",
    "CVDCRHD4": "heart_disease",
    "CVDSTRK3": "stroke",
    "SMOKE100": "smoked_100_cig",
    "SMOKDAY2": "smoking_frequency",
    "EXERANY2": "physical_activity",
    "_BMI5": "bmi",
    "_BMI5CAT": "bmi_cat",
    "_RFBMI5": "overweight",
    "ALCDAY4": "alcohol",
    "GENHLTH": "gen_health",
    "PHYSHLTH": "phys_health",
    "MENTHLTH": "mental_health",
    "CHOLCHK3": "cholesterol_check",
    "CHECKUP1": "routine_checkup"
}, inplace=True)


Save final csv

In [9]:
brfss

Unnamed: 0,state,sex,age,education,income,race,high_blood_pressure,diabetes,heart_attack,heart_disease,stroke,smoked_100_cig,smoking_frequency,physical_activity,bmi,bmi_cat,overweight,alcohol,gen_health,phys_health,mental_health,cholesterol_check,routine_checkup
0,Alabama,Female,80+,Some college or technical school,,White only (non-Hispanic),Yes,Yes,No,No,No,No,,No,30.47,Obese,Yes,0.0,Very good,,,Within the past 2 years,Within past 2 years
1,Alabama,Female,80+,Some college or technical school,,White only (non-Hispanic),Yes,No,No,No,No,No,,Yes,28.56,Overweight,Yes,0.0,Very good,,,Within the past year,Within past 2 years
2,Alabama,Female,80+,High school graduate,"$10,000 to <$15,000",Black only (non-Hispanic),Yes,No,No,No,No,Yes,Not at all,Yes,22.31,Normal weight,No,0.0,Fair,6.0,2.0,Within the past year,Within past year
3,Alabama,Female,75-79,Some college or technical school,,White only (non-Hispanic),No,No,No,No,No,No,,Yes,27.44,Overweight,Yes,0.0,Very good,2.0,,Within the past 2 years,Within past 5 years
4,Alabama,Female,75-79,Some college or technical school,"$50,000 to <$75,000",White only (non-Hispanic),Yes,Yes,No,No,No,No,,Yes,25.85,Overweight,Yes,2.0,Fair,,,Within the past year,Within past year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433318,Virgin Islands,Male,65-69,Some college or technical school,"$25,000 to <$35,000",Black only (non-Hispanic),Yes,No,No,No,,No,,Yes,29.21,Overweight,Yes,20.0,Good,12.0,30.0,Within the past year,Within past year
433319,Virgin Islands,Female,30-34,College graduate or higher,"$35,000 to <$50,000",Black only (non-Hispanic),No,No,No,No,No,No,,No,24.96,Normal weight,No,0.0,Very good,,,Within the past year,Within past year
433320,Virgin Islands,Female,50-54,College graduate or higher,"$150,000 to < $200,000",Hispanic,No,No,No,No,No,No,,Yes,34.38,Obese,Yes,1.0,Very good,10.0,,Within the past year,Within past year
433321,Virgin Islands,Female,65-69,College graduate or higher,"$15,000 to <$20,000",Black only (non-Hispanic),Yes,Yes,No,No,No,No,,Yes,23.86,Normal weight,No,0.0,Good,,,Within the past year,Within past year


In [10]:
brfss.to_csv("brfss_processed.csv")

Ensure correct parsing

In [11]:
saved = pd.read_csv("brfss_processed.csv", index_col=0)
print(brfss.shape)
print(saved.shape)

(433323, 23)
(433323, 23)


In [12]:
saved

Unnamed: 0,state,sex,age,education,income,race,high_blood_pressure,diabetes,heart_attack,heart_disease,stroke,smoked_100_cig,smoking_frequency,physical_activity,bmi,bmi_cat,overweight,alcohol,gen_health,phys_health,mental_health,cholesterol_check,routine_checkup
0,Alabama,Female,80+,Some college or technical school,,White only (non-Hispanic),Yes,Yes,No,No,No,No,,No,30.47,Obese,Yes,0.0,Very good,,,Within the past 2 years,Within past 2 years
1,Alabama,Female,80+,Some college or technical school,,White only (non-Hispanic),Yes,No,No,No,No,No,,Yes,28.56,Overweight,Yes,0.0,Very good,,,Within the past year,Within past 2 years
2,Alabama,Female,80+,High school graduate,"$10,000 to <$15,000",Black only (non-Hispanic),Yes,No,No,No,No,Yes,Not at all,Yes,22.31,Normal weight,No,0.0,Fair,6.0,2.0,Within the past year,Within past year
3,Alabama,Female,75-79,Some college or technical school,,White only (non-Hispanic),No,No,No,No,No,No,,Yes,27.44,Overweight,Yes,0.0,Very good,2.0,,Within the past 2 years,Within past 5 years
4,Alabama,Female,75-79,Some college or technical school,"$50,000 to <$75,000",White only (non-Hispanic),Yes,Yes,No,No,No,No,,Yes,25.85,Overweight,Yes,2.0,Fair,,,Within the past year,Within past year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433318,Virgin Islands,Male,65-69,Some college or technical school,"$25,000 to <$35,000",Black only (non-Hispanic),Yes,No,No,No,,No,,Yes,29.21,Overweight,Yes,20.0,Good,12.0,30.0,Within the past year,Within past year
433319,Virgin Islands,Female,30-34,College graduate or higher,"$35,000 to <$50,000",Black only (non-Hispanic),No,No,No,No,No,No,,No,24.96,Normal weight,No,0.0,Very good,,,Within the past year,Within past year
433320,Virgin Islands,Female,50-54,College graduate or higher,"$150,000 to < $200,000",Hispanic,No,No,No,No,No,No,,Yes,34.38,Obese,Yes,1.0,Very good,10.0,,Within the past year,Within past year
433321,Virgin Islands,Female,65-69,College graduate or higher,"$15,000 to <$20,000",Black only (non-Hispanic),Yes,Yes,No,No,No,No,,Yes,23.86,Normal weight,No,0.0,Good,,,Within the past year,Within past year


In [13]:
brfss[brfss.heart_attack != brfss.heart_disease]

Unnamed: 0,state,sex,age,education,income,race,high_blood_pressure,diabetes,heart_attack,heart_disease,stroke,smoked_100_cig,smoking_frequency,physical_activity,bmi,bmi_cat,overweight,alcohol,gen_health,phys_health,mental_health,cholesterol_check,routine_checkup
21,Alabama,Male,80+,High school graduate,"$20,000 to <$25,000",White only (non-Hispanic),Yes,No,No,Yes,No,No,,Yes,25.00,Overweight,Yes,0.0,Very good,,,Within the past year,Within past year
23,Alabama,Male,70-74,Some high school,"Less than $10,000",White only (non-Hispanic),Yes,No,Yes,No,No,,,Yes,,,,,Good,,,Within the past year,Within past year
24,Alabama,Female,70-74,Some college or technical school,"$35,000 to <$50,000",Black only (non-Hispanic),Yes,Yes,Yes,No,No,No,,Yes,26.63,Overweight,Yes,0.0,Good,,,Within the past year,Within past year
38,Alabama,Female,70-74,Some college or technical school,"$35,000 to <$50,000",White only (non-Hispanic),Yes,No,No,Yes,No,No,,Yes,30.95,Obese,Yes,0.0,Good,10.0,3.0,Within the past year,Within past year
49,Alabama,Female,60-64,High school graduate,"$150,000 to < $200,000",White only (non-Hispanic),No,Yes,Yes,No,No,Yes,Not at all,No,24.27,Normal weight,No,1.0,Fair,,30.0,Within the past year,Within past year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433237,Virgin Islands,Male,75-79,Elementary,,American Indian/Alaska Native (non-Hispanic),Yes,No,,,No,No,,No,31.47,Obese,Yes,0.0,Good,12.0,,Within the past 2 years,
433250,Virgin Islands,Male,80+,High school graduate,"$15,000 to <$20,000",Multiracial (non-Hispanic),Yes,No,Yes,No,No,No,,No,19.49,Normal weight,No,3.0,Good,3.0,,Within the past year,Within past year
433283,Virgin Islands,Male,40-44,High school graduate,,Other race (non-Hispanic),Yes,Yes,No,Yes,No,No,,No,34.08,Obese,Yes,0.0,Fair,,25.0,Within the past year,Within past year
433295,Virgin Islands,Female,65-69,High school graduate,"$10,000 to <$15,000",Hispanic,Yes,Pre-diabetes or borderline diabetes,Yes,No,,No,,No,26.05,Overweight,Yes,0.0,Poor,,,Within the past year,Within past year
