# Preliminaries

In [1]:
# import packages
import pandas as pd
import numpy as np

In [2]:
# read raw data: load the survey CSV into a DataFrame
raw_csv_path = "data/raw/brfss2020.csv"
data = pd.read_csv(raw_csv_path)

In [3]:
# display the column names of the raw data
data.columns

Index(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENM1',
       ...
       '_RFPSA23', '_CLNSCPY', '_SGMSCPY', '_SGMS10Y', '_RFBLDS4', '_STOLDNA',
       '_VIRCOLN', '_SBONTIM', '_CRCREC1', '_AIDTST4'],
      dtype='object', length=279)

# Select Relevant Columns

In [4]:
# PHYSHLTH - How many days during the past 30 days was your physical health not good? 
# MENTHLTH - How many days during the past 30 days was your mental health not good?
# SEXVAR - Sex at birth
# QSTLANG - Language identifier
# _IMPRACE - Imputed race/ethnicity
# MSCODE - Metropolitan status code
# _METSTAT - Metropolitan status
# VETERAN3 - Veteran status
# EDUCA - Highest schooling
# EMPLOY1 - Current employment
# INCOME2 - Annual household income
# MARITAL - Marital status
# CHILDREN - Number of children
# EXERANY2 - Physical exercise
# SLEPTIM1 - Sleep
# SMOKE100 - Smoked 100 cigarettes
# SMOKDAY2 - Smoking frequency
# ALCDAY5 - Alcohol frequency
# DRNK3GE5 - Heavy drinking frequency
# ECIGARET - Ever e-cigarette smoker
# ECIGNOW - Current e-cigarette smoker
# MARIJAN1 - Marijuana frequency
# CVDINFR4 - Ever Heart attack
# CVDCRHD4 - Ever CVD
# CVDSTRK3 - Ever stroke
# ASTHMA3 - Ever asthma
# ASTHNOW - Current asthma
# CHCCOPD2 - Ever COPD
# HAVARTH4 - Ever arthritis
# ADDEPEV3 - Ever depressive disorder
# CHCKDNY2 - Ever kidney disease
# DIABETE4 - Ever diabetes
# TOLDCFS - Ever chronic fatigue syndrome
# HAVECFS - Current chronic fatigue syndrome
# CIMEMLOS - Confusion or memory loss
# CHCSCNCR - Ever skin cancer
# CHCOCNCR - Ever other cancer
# LASTDEN4 - Last dentist visit
# ACEDEPRS - Live with anyone who was depressed, mentally ill, or suicidal
# ACEDRINK - Live with anyone who was a problem drinker or alcoholic
# ACEDRUGS - Live with anyone who used illegal street drugs or who abused prescription medications
# ACEPRISN - Live with anyone who served time or was sentenced to serve time in a prison
# PREGNANT - Pregnant
# WEIGHT2 - Weight
# HEIGHT3 - Height
# _BMI5 - BMI
# DEAF - Deaf
# BLIND - Blind
# DECIDE - Difficulty concentrating, remembering, or making decisions
# DIFFWALK - Difficulty walking or climbing stairs
# DIFFDRES - Difficulty dressing or bathing
# DIFFALON - Difficulty doing errands alone
# FLUSHOT7 - Recent flu shot
# CNCRDIFF - Number of cancers
# CNCRTYP1 - Type of cancer
# CSRVTRT3 - Currently receiving cancer treatment
# CSRVDOC1 - Doctor providing majority of care
# CSRVRTRN - Received instructions for routine cancer check-ups
# CSRVINST - Received written instructions for routine cancer check-ups
# _HCVU651 - Have any form of health coverage
# PERSDOC2 - Have one person you think of as your personal doctor or health care provider
# DIABEDU - Have you ever taken a course or class in how to manage your diabetes yourself?

In [5]:
# define columns to keep, grouped by theme; the groups are flattened in the
# order written here, which is also the column order of the resulting subset
column_groups = {
    'response variables': ['_PHYS14D', '_MENT14D', 'PHYSHLTH', 'MENTHLTH'],
    'demographics': ['_AGE80', 'SEXVAR', 'QSTLANG', '_IMPRACE', 'VETERAN3', '_BMI5CAT', 'PREGNANT'],
    'socio-economics': ['_METSTAT', 'MSCODE', 'EDUCA', 'EMPLOY1', 'INCOME2'],
    'relationships': ['MARITAL', 'CHILDREN', 'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN'],
    'lifestyle': ['EXERANY2', 'SLEPTIM1', 'SMOKE100', 'SMOKDAY2', 'AVEDRNK3', 'ECIGARET', 'ECIGNOW'],
    'health conditions': [
        'CVDINFR4', 'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW', 'CHCCOPD2',
        'HAVARTH4', 'ADDEPEV3', 'CHCKDNY2', 'DIABETE4', 'CIMEMLOS',
        'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON',
    ],
    'disabilities': ['DEAF', 'BLIND'],
    'health usage': ['_HCVU651'],
    'cancer': ['CHCSCNCR', 'CHCOCNCR', 'CNCRDIFF', 'CNCRTYP1'],
    'treatment variables': ['DIABEDU', 'PERSDOC2'],
}

required_columns = [name for group in column_groups.values() for name in group]
required_columns_data = data[required_columns]

# Subset Diabetes Patients

In [6]:
# subset patients: keep rows whose DIABEDU code is at most 2
# (codes 1 and 2 are the ones recoded to yes/no further down)
answered_course = required_columns_data['DIABEDU'] <= 2.0
# patients = patients[(patients['PERSDOC2'] <= 3.0)]

# remove patients with missing response (code 9 in either status column);
# rows where the status itself is NaN pass this comparison and are kept here
has_phys_response = required_columns_data['_PHYS14D'] != 9.0
has_ment_response = required_columns_data['_MENT14D'] != 9.0

patients = required_columns_data[answered_course & has_phys_response & has_ment_response]

In [7]:
# PERSDOC2 code -> label; codes not listed here (and missing values) become ''
doctor_labels = {1.0: 'only_one', 2.0: 'more_than_one', 3.0: 'no'}
doctor_code = patients['PERSDOC2']

patients = patients.assign(
    personal_doctor = np.select(
        [doctor_code == code for code in doctor_labels],
        list(doctor_labels.values()),
        default = ''
    )
).drop(columns = 'PERSDOC2')

In [8]:
# DIABEDU code -> label; anything other than 1 or 2 becomes ''
course_code = patients['DIABEDU']

patients = patients.assign(
    diabetes_course = np.select(
        [course_code == 1.0, course_code == 2.0],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = 'DIABEDU')

# Process Variables

### Response

In [9]:
# give both response status columns readable names in a single rename
patients = patients.rename(columns = {
    '_PHYS14D': 'phys_health_status',
    '_MENT14D': 'ment_health_status'
})

In [10]:
# rename the raw day counts to readable names
patients = patients.rename(columns = {
    'PHYSHLTH': 'phys_health_days',
    'MENTHLTH': 'ment_health_days'
})

# recode 88 to 0 days in both columns.
# The recoded columns are assigned back to the frame instead of calling
# patients[col].replace(..., inplace = True): that form is chained assignment
# through an in-place method, which warns on recent pandas and does not update
# the frame at all once Copy-on-Write is enabled.
patients = patients.assign(
    phys_health_days = patients['phys_health_days'].replace(88, 0),
    ment_health_days = patients['ment_health_days'].replace(88, 0)
)

### Age

In [11]:
# expose _AGE80 under the shorter name 'age' (used later for the 65+ coverage label)
patients = patients.rename(columns = {'_AGE80': 'age'})

### Gender

In [12]:
# SEXVAR code -> label; anything other than 1 or 2 becomes ''
sex_code = patients['SEXVAR']

patients = patients.assign(
    sex = np.select(
        [sex_code == 1.0, sex_code == 2.0],
        ['male', 'female'],
        default = ''
    )
).drop(columns = 'SEXVAR')

### Language

In [13]:
# QSTLANG code -> label; anything other than 1 or 2 becomes ''
language_code = patients['QSTLANG']

patients = patients.assign(
    language = np.select(
        [language_code == 1.0, language_code == 2.0],
        ['english', 'spanish'],
        default = ''
    )
).drop(columns = 'QSTLANG')

### Race

In [14]:
# _IMPRACE code -> label; codes not listed here (and missing values) become ''
race_labels = {
    1.0: 'white',
    2.0: 'black',
    3.0: 'asian',
    4.0: 'native',
    5.0: 'hispanic',
    6.0: 'other'
}
race_code = patients['_IMPRACE']

patients = patients.assign(
    race = np.select(
        [race_code == code for code in race_labels],
        list(race_labels.values()),
        default = ''
    )
).drop(columns = '_IMPRACE')

### Veteran

In [15]:
# VETERAN3: 1 -> yes, 2 -> no, 7 or 9 -> unknown; everything else becomes ''
veteran_code = patients['VETERAN3']

patients = patients.assign(
    veteran = np.select(
        [
            veteran_code == 1.0,
            veteran_code == 2.0,
            (veteran_code == 7.0) | (veteran_code == 9.0)
        ],
        ['yes', 'no', 'unknown'],
        default = ''
    )
).drop(columns = 'VETERAN3')

### BMI

In [16]:
# _BMI5CAT code -> label; codes not listed here (and missing values) become ''
bmi_labels = {1.0: 'underweight', 2.0: 'normal', 3.0: 'overweight', 4.0: 'obese'}
bmi_code = patients['_BMI5CAT']

patients = patients.assign(
    bmi = np.select(
        [bmi_code == code for code in bmi_labels],
        list(bmi_labels.values()),
        default = ''
    )
).drop(columns = '_BMI5CAT')

### Metropolitan Status

In [17]:
# _METSTAT code -> label; anything other than 1 or 2 becomes ''
metro_code = patients['_METSTAT']

patients = patients.assign(
    metro = np.select(
        [metro_code == 1.0, metro_code == 2.0],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = '_METSTAT')

In [18]:
# MSCODE code -> label (there is no entry for code 4); unlisted codes and missing values become ''
mscode_labels = {1.0: 'city_center', 2.0: 'city', 3.0: 'county', 5.0: 'outside'}
mscode = patients['MSCODE']

patients = patients.assign(
    metro_granular = np.select(
        [mscode == code for code in mscode_labels],
        list(mscode_labels.values()),
        default = ''
    )
).drop(columns = 'MSCODE')

### Education

In [19]:
# EDUCA code -> label (codes 1 and 2 share a label); unlisted codes and missing values become ''
education_labels = {
    1.0: 'no_high_school',
    2.0: 'no_high_school',
    3.0: 'high_school_some',
    4.0: 'high_school_graduate',
    5.0: 'college_some',
    6.0: 'college_graduate'
}
education_code = patients['EDUCA']

patients = patients.assign(
    education = np.select(
        [education_code == code for code in education_labels],
        list(education_labels.values()),
        default = ''
    )
).drop(columns = 'EDUCA')

### Employment

In [20]:
# EMPLOY1 code -> coarse employment group; unlisted codes and missing values become ''
employment_labels = {
    1.0: 'employed',
    2.0: 'employed',
    3.0: 'unemployed',
    4.0: 'unemployed',
    5.0: 'non_labor_force',
    6.0: 'non_labor_force',
    7.0: 'non_labor_force',
    8.0: 'non_labor_force'
}
employment_code = patients['EMPLOY1']

patients = patients.assign(
    employment = np.select(
        [employment_code == code for code in employment_labels],
        list(employment_labels.values()),
        default = ''
    )
).drop(columns = 'EMPLOY1')

### Income

In [21]:
# INCOME2 code -> coarse income band; unlisted codes and missing values become ''
income_labels = {
    1.0: 'poverty',
    2.0: 'poverty',
    3.0: 'low',
    4.0: 'low',
    5.0: 'low',
    6.0: 'middle',
    7.0: 'middle',
    8.0: 'high'
}
income_code = patients['INCOME2']

patients = patients.assign(
    income = np.select(
        [income_code == code for code in income_labels],
        list(income_labels.values()),
        default = ''
    )
).drop(columns = 'INCOME2')

### Partner Status

In [22]:
# MARITAL code -> couple/single (codes 1 and 6 are 'couple'); unlisted codes and missing values become ''
partner_labels = {
    1.0: 'couple',
    2.0: 'single',
    3.0: 'single',
    4.0: 'single',
    5.0: 'single',
    6.0: 'couple'
}
marital_code = patients['MARITAL']

patients = patients.assign(
    partner = np.select(
        [marital_code == code for code in partner_labels],
        list(partner_labels.values()),
        default = ''
    )
).drop(columns = 'MARITAL')

### Children

In [23]:
# CHILDREN: 88 -> none, then counts bucketed as 1, 2, 3-4 and 5-87;
# any other value (and missing) becomes ''
child_count = patients['CHILDREN']

patients = patients.assign(
    children = np.select(
        [
            child_count == 88.0,
            child_count == 1.0,
            child_count == 2.0,
            (child_count >= 3.0) & (child_count <= 4.0),
            (child_count >= 5.0) & (child_count <= 87.0)
        ],
        ['none', 'one', 'two', 'three_to_four', 'five_plus'],
        default = ''
    )
).drop(columns = 'CHILDREN')

### Household Members

In [24]:
# ACEDEPRS: 1 -> yes; 2 or missing -> no; any other code becomes ''
depressed_code = patients['ACEDEPRS']

patients = patients.assign(
    depressed_household = np.select(
        [depressed_code == 1.0, (depressed_code == 2.0) | depressed_code.isna()],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = 'ACEDEPRS')

In [25]:
# ACEDRINK: 1 -> yes; 2 or missing -> no; any other code becomes ''
drinker_code = patients['ACEDRINK']

patients = patients.assign(
    alcohol_household = np.select(
        [drinker_code == 1.0, (drinker_code == 2.0) | drinker_code.isna()],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = 'ACEDRINK')

In [26]:
# ACEDRUGS: 1 -> yes; 2 or missing -> no; any other code becomes ''
drugs_code = patients['ACEDRUGS']

patients = patients.assign(
    drugs_household = np.select(
        [drugs_code == 1.0, (drugs_code == 2.0) | drugs_code.isna()],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = 'ACEDRUGS')

In [27]:
# ACEPRISN: 1 -> yes; 2 or missing -> no; any other code becomes ''
prison_code = patients['ACEPRISN']

patients = patients.assign(
    prison_household = np.select(
        [prison_code == 1.0, (prison_code == 2.0) | prison_code.isna()],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = 'ACEPRISN')

### Exercise

In [28]:
# EXERANY2 code -> label; anything other than 1 or 2 becomes ''
exercise_code = patients['EXERANY2']

patients = patients.assign(
    exercise_past_month = np.select(
        [exercise_code == 1.0, exercise_code == 2.0],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = 'EXERANY2')

### Sleep

In [29]:
# SLEPTIM1 bucketed by upper bound. np.select takes the first condition that
# matches, so each bound only catches values not already claimed by a smaller
# one: <5 very_low, 5 to <7 low, 7 to <10 healthy, 10 to <25 very_high.
# Values of 25 and above, and missing values, become ''.
sleep_value = patients['SLEPTIM1']

patients = patients.assign(
    sleep = np.select(
        [sleep_value < 5.0, sleep_value < 7.0, sleep_value < 10.0, sleep_value < 25.0],
        ['very_low', 'low', 'healthy', 'very_high'],
        default = ''
    )
).drop(columns = 'SLEPTIM1')

### Smoking

In [30]:
# SMOKE100 code -> label; anything other than 1 or 2 becomes ''
smoke100_code = patients['SMOKE100']

patients = patients.assign(
    ever_smoked_100 = np.select(
        [smoke100_code == 1.0, smoke100_code == 2.0],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = 'SMOKE100')

In [31]:
# SMOKDAY2: 1 -> often, 2 -> sometimes, 3 or missing -> none; any other code becomes ''
smoking_code = patients['SMOKDAY2']

patients = patients.assign(
    current_smoker = np.select(
        [
            smoking_code == 1.0,
            smoking_code == 2.0,
            (smoking_code == 3.0) | smoking_code.isna()
        ],
        ['often', 'sometimes', 'none'],
        default = ''
    )
).drop(columns = 'SMOKDAY2')

### E-Smoking

In [32]:
# ECIGARET: 1 -> yes; 2 or missing -> no; any other code becomes ''
ecig_ever_code = patients['ECIGARET']

patients = patients.assign(
    ever_e_smoked = np.select(
        [ecig_ever_code == 1.0, (ecig_ever_code == 2.0) | ecig_ever_code.isna()],
        ['yes', 'no'],
        default = ''
    )
).drop(columns = 'ECIGARET')

In [33]:
# ECIGNOW: 1 -> often, 2 -> sometimes, 3 or missing -> none; any other code becomes ''
ecig_now_code = patients['ECIGNOW']

patients = patients.assign(
    current_e_smoker = np.select(
        [
            ecig_now_code == 1.0,
            ecig_now_code == 2.0,
            (ecig_now_code == 3.0) | ecig_now_code.isna()
        ],
        ['often', 'sometimes', 'none'],
        default = ''
    )
).drop(columns = 'ECIGNOW')

### Drinking

In [34]:
# AVEDRNK3 recode:
#   blank -> 0 (treated as no drinks; presumably the question was skipped for
#               non-drinkers, verify against the codebook skip pattern)
#   88    -> 0 ("None" in the BRFSS codebook; previously left as a literal 88 drinks)
#   77/99 -> NaN (don't know / refused)
# The blank -> 0 step runs first so the NaN written for 77/99 is not turned into 0.
no_drinks = patients['AVEDRNK3'].isna() | (patients['AVEDRNK3'] == 88.0)
patients.loc[no_drinks, 'AVEDRNK3'] = 0

unknown_drinks = (patients['AVEDRNK3'] == 77.0) | (patients['AVEDRNK3'] == 99.0)
patients.loc[unknown_drinks, 'AVEDRNK3'] = np.nan

patients = patients.rename(columns = {'AVEDRNK3': 'average_alcohol_month'})

### Health Conditions

In [35]:
# raw yes/no columns and the output name each one is recoded to (paired by position)
# NOTE(review): the column notes at the top of the notebook describe CHCOCNCR as
# "other cancer", CHCSCNCR as "skin cancer", CVDINFR4 as "heart attack" and
# CVDCRHD4 as "CVD"; the output names 'ever_any_cancer', 'cvd' and 'chd' do not
# line up with those descriptions - confirm the intended labels before renaming.
old_vars = ['CHCOCNCR', 'CHCSCNCR', 'CVDINFR4', 'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'CHCCOPD2', 'HAVARTH4', 'ADDEPEV3', 'CHCKDNY2', 'DEAF', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON']
new_vars = ['ever_any_cancer', 'ever_skin_cancer', 'cvd', 'chd', 'stroke', 'ever_asthma', 'copd', 'arthritis', 'depression', 'kidney_disease', 'deaf', 'blind', 'concentration', 'mobility', 'dressing_bathing', 'errands']

# iterate over the pairs themselves instead of a hard-coded range(16), and fail
# loudly if the two lists ever get out of step
assert len(old_vars) == len(new_vars), "old_vars and new_vars must have the same length"

for old_var, new_var in zip(old_vars, new_vars):

    answer = patients[old_var]

    # 1 -> yes, 2 -> no; any other code (and missing values) becomes ''
    recoded = np.select([answer == 1.0, answer == 2.0], ["yes", "no"], default = '')

    # drop the raw column and append the recoded one at the end of the frame
    patients = patients.drop(columns = old_var).assign(**{new_var: recoded})

In [36]:
# ASTHNOW: 1 -> yes; 2 or missing -> no; any other code becomes ''
asthma_now_code = patients['ASTHNOW']

patients = patients.assign(
    current_asthma = np.select(
        [asthma_now_code == 1.0, (asthma_now_code == 2.0) | asthma_now_code.isna()],
        ["yes", "no"],
        default = ''
    )
).drop(columns = 'ASTHNOW')

In [37]:
# DIABETE4 code -> label (codes 2 and 3 both map to "no"); unlisted codes and missing values become ''
diabetes_labels = {1.0: "yes", 2.0: "no", 3.0: "no", 4.0: "borderline"}
diabetes_code = patients['DIABETE4']

patients = patients.assign(
    diabetes = np.select(
        [diabetes_code == code for code in diabetes_labels],
        list(diabetes_labels.values()),
        default = ''
    )
).drop(columns = 'DIABETE4')

In [38]:
# CIMEMLOS: 1 -> yes; 2 or missing -> no; any other code becomes ''
memory_code = patients['CIMEMLOS']

patients = patients.assign(
    cognitive_decline = np.select(
        [memory_code == 1.0, (memory_code == 2.0) | memory_code.isna()],
        ["yes", "no"],
        default = ''
    )
).drop(columns = 'CIMEMLOS')

In [39]:
# PREGNANT: 1 -> yes; 2 or missing -> no; any other code becomes ''
pregnant_code = patients['PREGNANT']

patients = patients.assign(
    pregnant = np.select(
        [pregnant_code == 1.0, (pregnant_code == 2.0) | pregnant_code.isna()],
        ["yes", "no"],
        default = ''
    )
).drop(columns = 'PREGNANT')

### Cancer Variables

In [40]:
# CNCRDIFF: 1/2/3 -> one/two/three, missing -> zero; any other code becomes ''
cancer_count_labels = {1.0: "one", 2.0: "two", 3.0: "three"}
cancer_count_code = patients['CNCRDIFF']

patients = patients.assign(
    number_cancers = np.select(
        [cancer_count_code == code for code in cancer_count_labels] + [cancer_count_code.isna()],
        list(cancer_count_labels.values()) + ["zero"],
        default = ''
    )
).drop(columns = 'CNCRDIFF')

In [41]:
# CNCRTYP1 code groups -> cancer type label
cancer_code = patients['CNCRTYP1']

conditions = [
    cancer_code == 1.0,
    cancer_code.isin([2.0, 3.0, 4.0]),
    cancer_code == 10.0,
    cancer_code.isin([19.0, 20.0]),
    cancer_code.isin([21.0, 22.0]),
    # 77 / 99 get the same empty label the other variables use for unknown answers
    cancer_code.isin([77.0, 99.0]),
    # missing code: tested with isna(), as in the other cells, instead of relying
    # on isin([np.nan]) matching NaN
    cancer_code.isna()
]

values = ["breast", "cerv_endo_ovar", "colon", "prostate", "skin", "", "none"]

# every remaining code falls through to "other"
patients = patients.assign(cancer_type = np.select(conditions, values, default = "other"))
patients = patients.drop(columns = 'CNCRTYP1')

### Health Coverage

In [42]:
# The age check comes first, so anyone aged 65 or more is labelled "over_65"
# whatever their _HCVU651 code; otherwise 1 -> yes, 2 -> no, and anything
# else becomes ''
coverage_code = patients['_HCVU651']

patients = patients.assign(
    health_coverage = np.select(
        [patients['age'] >= 65, coverage_code == 1.0, coverage_code == 2.0],
        ["over_65", "yes", "no"],
        default = ''
    )
).drop(columns = '_HCVU651')

# Finalize Data

In [43]:
# personal_doctor and diabetes_course were built with np.select(..., default = ''),
# so an unknown or unanswered treatment is an empty string, never NaN, and
# dropna() on its own would not remove any row because of them. Blank the empty
# strings to NaN first so rows without a usable treatment value are really dropped.
treatment_columns = ["personal_doctor", "diabetes_course"]
response_columns = ["phys_health_status", "ment_health_status"]

patients = patients.assign(**{
    column: patients[column].mask(patients[column] == '')
    for column in treatment_columns
})
patients = patients.dropna(subset = treatment_columns + response_columns)

In [44]:
# write the processed patient table to disk, leaving out the index
output_csv_path = 'data/input/diabetes_patients_20231205.csv'
patients.to_csv(output_csv_path, index = False)