In [20]:
import pandas as pd
import numpy as np
import requests
import os
import glob
from bs4 import BeautifulSoup

# Get Variable Definitions

In [21]:
def get_variable_description(url):

    # get the html page with the variable codebook for all xpt files
    # use this to rename the columns to descriptive cols
    page = requests.get(url)

    # beautiful soup is good for parsing html
    soup = BeautifulSoup(page.content, 'html.parser')

    # this is the id for the table on the variable codebook bage
    tbl = soup.find("table",{"id":"GridView1"})

    # get the table and load into a df
    df_var_mapping = pd.read_html(str(tbl))[0]
    return df_var_mapping


questionnaire_var_desc = get_variable_description(
    url = 'https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component=Questionnaire&Cycle=2017-2020'
)

demo_var_desc = get_variable_description(
    url = 'https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component=Demographics&Cycle=2017-2020'
)

exam_var_desc = get_variable_description(
    url = 'https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component=Examination&Cycle=2017-2020'
)

df_var_mapping = pd.concat([questionnaire_var_desc,demo_var_desc,exam_var_desc])
#df_var_mapping

df_var_mapping.to_csv("../data/cdc_nhanes_var_mapping.csv")

# Read XPT Data

In [22]:
def read_xpt_files(folder_path):
    """Input folder path to read multiple xpt files in folder
       Returns dictionary with key: file name, 
                               value: df"""
    df_dict = {}
    for file_name in os.listdir(folder_path):
        if 'XPT' in os.path.splitext(file_name)[1]:
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_sas(file_path, format='xport')
            name = os.path.splitext(file_name)[0]
            if df.empty:
                raise Exception (f'Empty dataframe from file: {name}')
            df_dict[name] = df
        else:
            print(f'not loading file {file_name}')
    return df_dict



def full_outer_join(dataframes):
    joined_df = None
    for df in dataframes.values():
        if joined_df is None:
            joined_df = df
        else:
            joined_df = pd.merge(joined_df, df, on='SEQN', how='outer')
    return joined_df

In [23]:
def get_cdc_data(folder):
    
    folder_path_base = '../data/xpt_data/'
    folder_path_bulk = folder_path_base + folder
    folder_path_prescription_med = folder_path_base + folder + '/prescription_med'

    # all data except for prescription medicine
    dataframes_dict = read_xpt_files(folder_path_bulk)
    df_cdc_joined_pre = full_outer_join(dataframes_dict)
    
    # prescription medicine survey - need to dedup
    dataframes_dict = read_xpt_files(folder_path_prescription_med)
    prescript_med = full_outer_join(dataframes_dict)

    # https://stackoverflow.com/questions/1885181/how-to-un-escape-a-backslash-escaped-string
    prescript_med['RXDRSD1']=prescript_med['RXDRSD1'].apply(lambda x: x.decode('unicode-escape'))
    prescript_med['RXDRSD2']=prescript_med['RXDRSD2'].apply(lambda x: x.decode('unicode-escape'))
    prescript_med['RXDRSD3']=prescript_med['RXDRSD3'].apply(lambda x: x.decode('unicode-escape'))

    conditions = [
        (prescript_med["RXDRSD1"] == "Major depressive disorder, single episode, unspecified"),
        (prescript_med["RXDRSD1"] == "Major depressive disorder, recurrent, unspecified"),
        (prescript_med["RXDRSD2"] == "Major depressive disorder, single episode, unspecified"),
        (prescript_med["RXDRSD2"] == "Major depressive disorder, recurrent, unspecified"),
        (prescript_med["RXDRSD3"] == "Major depressive disorder, single episode, unspecified"),
        (prescript_med["RXDRSD3"] == "Major depressive disorder, recurrent, unspecified")
        ]

    values = [1,1,1,1,1,1]
    prescript_med['MDD'] = np.select(conditions, values)

    # Sort by SEQN and MDD for any instances where the responder has MDD == 1, it will precede any rows where MDD == 0
    # Only keep first row of each responder to retain rows where MDD == 1 if possible and MDD == 0 where not
    prescript_med.sort_values(by=["SEQN"], axis = 0, ascending =[True], inplace=True)
    prescript_med.sort_values(by=["MDD"], axis = 0, ascending =[False], inplace=True)
    pm_dropped = prescript_med.drop_duplicates(subset = ['SEQN'], keep = 'first', inplace = False) 
    pm_dropped = pm_dropped[['SEQN','MDD']]
    
    # merge into 1 
    df_cdc_joined = pd.merge(df_cdc_joined_pre, pm_dropped, on='SEQN', how='left')
    return df_cdc_joined

## 2017 - March 2020 Data

In [24]:
df_cdc_joined_2017_2020 = get_cdc_data(folder = '2017_march2020')
df_cdc_joined_2017_2020['folder'] = '2017_march2020'.encode("utf-8")

not loading file prescription_med
not loading file .DS_Store


## 2015-2016

In [25]:
df_cdc_joined_2015_2016 = get_cdc_data(folder = '2015_2016')
df_cdc_joined_2015_2016['folder'] = '2015_2016'.encode("utf-8")

not loading file prescription_med
not loading file .DS_Store


# 2013-2014

In [26]:
df_cdc_joined_2013_2014 = get_cdc_data(folder = '2013_2014')
df_cdc_joined_2013_2014['folder'] = '2013_2014'.encode("utf-8")

not loading file prescription_med
not loading file .DS_Store


# Merge all years & sanitize

In [27]:
df_cdc_joined = pd.concat([
     df_cdc_joined_2017_2020,
     df_cdc_joined_2015_2016,
     df_cdc_joined_2013_2014
])

# convert bytes and strip whitespace
df_cdc_joined_clean = df_cdc_joined.apply(lambda x: x.str.decode('utf-8').str.strip() if x.dtype == "object" else x)
# replace empty strings with nan
#df_cdc_joined_clean = df_cdc_joined_clean.replace('', np.nan)
df_cdc_joined_clean

Unnamed: 0,SEQN,SMQ681,SMQ690A,SMQ710,SMQ720,SMQ725,SMQ690B,SMQ740,SMQ690C,SMQ770,...,FSD670ZW,INQ244,IND247,OCD231,OCD241,OCD391,OCD392,BPQ056,BPD058,BPQ059
0,109264.0,2.0,,,,,,,,,...,,,,,,,,,,
1,109266.0,2.0,,,,,,,,,...,,,,,,,,,,
2,109271.0,1.0,1.0,5.0,20.0,1.0,,,,,...,,,,,,,,,,
3,109273.0,1.0,1.0,5.0,13.0,1.0,,,,,...,,,,,,,,,,
4,109274.0,2.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10170,83726.0,,,,,,,,,,...,,,,8.0,22.0,,,2.0,,2.0
10171,83727.0,2.0,,,,,,,,,...,,,,9.0,22.0,,,2.0,,2.0
10172,83728.0,,,,,,,,,,...,,1.0,,,,,,,,
10173,83730.0,,,,,,,,,,...,,2.0,3.0,,,,,,,


In [28]:
# total positive cases by file

df_cdc_joined_clean.groupby(['folder'])['MDD'].sum()

folder
2013_2014         404
2015_2016         392
2017_march2020    716
Name: MDD, dtype: int64

## Clean up ordinal variables

In [29]:
def clean_ordinal_col(x, column, max_number):
    
    if pd.isnull(x[column]):
        new_val = x[column]
    
    else:
        col_value = round(x[column])
        acceptable_values = list(range(max_number+1))

        if col_value in acceptable_values:
            new_val = col_value
        else:
            new_val = None

    return new_val

ordinal_column_key = {
    'DPQ010':('little_interest_in_doing_things',3),
    'DPQ020':('feeling_down_depressed_hopeless',3),
    'DPQ030':('trouble_falling_or_staying_asleep',3),
    'DPQ040':('feeling_tired_or_having_little_energy',3),
    'DPQ050':('poor_appetitie_or_overeating',3),
    'DPQ060':('feeling_bad_about_yourself',3),
    'DPQ070':('trouble_concentrating',3),
    'DPQ080':('moving_or_speaking_to_slowly_or_fast',3),
    'DPQ090':('thoughts_you_would_be_better_off_dead',3),
    'DPQ100':('difficult_doing_daytoday_tasks',3),
    'ALQ121':('alcoholic_drinks_past_12mo',10),
    'DBQ700':('how_healthy_is_your_diet',5),
    'DID250':('count_days_seen_doctor_12mo',40),
    'PAQ670':('count_days_moderate_recreational_activity',7),   
    'PAD675':('count_minutes_moderate_recreational_activity',600),
    'PAD680':('count_minutes_moderate_sedentary_activity',1320),
    'PAQ706':('count_days_physical_activity_youth',7),   
    'WHQ225':('count_lost_10plus_pounds',4),
    'WHQ520':('count_tried_to_lose_weight_youth',3),
    'FSDHH':('food_security_level_household',4),   
    'FSDAD':('food_security_level_adult',4),
    'HUQ010':('general_health_condition',5),
    'HUD062':('duration_last_healthcare_visit',4),      
    'INDFMMPC':('monthly_poverty_index_category',3),
    'OCQ180':('count_hours_worked_last_week',80),
    'RIDAGEYR':('age_in_years',80),   
    'DMDEDUC2':('education_level',5)        
}

In [30]:
# for each ordinal variable create a new column that removes all the "missing response" or "don't know" so
# it can be treated as a range.

# For example, for column "DPQ010" values 7 = "Refused" & values 9 = "Don't know".
# The max_number for that column is equal to 3 so we set all values > 3 as null.

for key in ordinal_column_key.keys():
    
    col_name = key
    new_col_name = ordinal_column_key[key][0]
    max_num = ordinal_column_key[key][1]
    
    df_cdc_joined_clean[new_col_name] = df_cdc_joined_clean.apply(
        lambda x: clean_ordinal_col(x = x, column = col_name, max_number = max_num), axis = 1
    )

## Clean up category variables

In [32]:
def clean_category_col(x, column):
    
    col_value = x[column]
    
    if col_value == 1:
        new_val = 1
    elif col_value == 2:
        new_val = 0
    else:
        new_val = None

    return new_val

category_column_key = {
    'HIQ011':'has_health_insurance',
    'HIQ210':'has_health_insurance_gap',
    'SMQ681':'has_smoked_tabacco_last_5days',
    'RIAGENDR':'is_male',
    'DMDBORN4':'is_usa_born',
    'DIQ010':'has_diabetes',
    'MCQ080':'has_overweight_diagnosis',    
    'WHQ070':'has_tried_to_lose_weight_12mo',
    'RHQ131':'has_been_pregnant'
}

In [33]:
for key in category_column_key.keys():
    
    col_name = key
    new_col_name = category_column_key[key]
    
    df_cdc_joined_clean[new_col_name] = df_cdc_joined_clean.apply(
        lambda x: clean_category_col(x = x, column = col_name), axis = 1
    )

## one-off columns

In [35]:
df_cdc_joined_clean['monthly_poverty_index'] = df_cdc_joined_clean['INDFMMPI']

# clean up additional columns with high correlation with outcome variable

In [15]:
# df_clean = df_cdc_joined_clean
# #filter for women who have been pregnant in calculating correlation but keep all responses
# preg = df_clean[df_clean['has_been_pregnant']==1]
# df=preg

# #convert to floats to enable correlation calculations
# for col in df:
#     df[col] = df[col].astype(float)


# df_corr=df.corr(method='pearson')
# corr_values=(df_corr['MDD']).sort_values(ascending=False)
# df_corr_values=corr_values.to_frame()

# #Identify features with higher correlation with outcome variable
# filtered_corr = df_corr_values[df_corr_values['MDD'] > 0.1]
# high_corr_feat = filtered_corr.index.tolist()

In [36]:
"""
Cleaned features:
DPQ020 = feeling_down_depressed_hopeless
DPQ040 = feeling_tired_or_having_little_energy
DPQ060 = feeling_bad_about_yourself
DPQ070 = trouble_concentrating
DPQ100 = difficult_doing_daytoday_tasks
DPQ050 = poor_appetitie_or_overeating
DPQ010 = little_interest_in_doing_things
DPQ030 = trouble_falling_or_staying_asleep
HUQ010 = general_health_condition
DPQ080 = moving_or_speaking_to_slowly_or_fast
WHQ225 = count_lost_10plus_pounds
DBQ700 = how_healthy_is_your_diet
DPQ090 = thoughts_you_would_be_better_off_dead
DMDBORN4 = is_usa_born
FSDAD = food_security_level_adult
MCQ080 = has_overweight_diagnosis
FSDHH = food_security_level_household
RIDAGEYR = age_in_years
DID250 = count_days_seen_doctor_12mo
MCD180D = age_with_angina_pectoris (convert 77777 and 99999 to NaN)
RHQ020 = age_range_first_menstrual_period (convert 7 and 9 to NaN)
MCQ170L = have_liver_condition (convert 7 and 9 to NaN)
BMIHT = BMI_standing_height_comment (drop column)
MCD180L = age_liver_condition (convert 77777 and 99999 to NaN)
OCD150 = type_of_work_done_last_week (convert 7 and 9 to NaN)
WHQ060 = weight_change_intentional (convert 7 and 9 to NaN)
SMQ830 = days_nicotine_substitute_used (convert 7 and 9 to NaN)
BMXWAIST = waist_circumference (no cleaning needed)
CDQ006 = pain_relief_from_cardio_recoverytime (convert 7 and 9 to NaN)
HUQ051 = annual_healthcare_visit_count (convert 77 and 99 to NaN)
RHQ131 = has_been_pregnant

##############################################################################
Features to clean Week 7:

#Alcohol Use
ALQ111 = drank_alc (convert 7 and 9 to NaN)
ALQ121= alc_drinking_freq (convert 77 and 99 to NaN)
ALQ130 = alc_per_day (convert 777 and 999 to NaN)
ALQ142 = times_with_4or5_alc (convert 77 and 99 to NaN)
ALQ280 = times_with_8plus_alc (convert 77 and 99 to NaN)
ALQ290 = times_with_12plus_alc (convert 77 and 99 to NaN)
ALQ151 = 4plus_alc_daily (convert 7 and 9 to NaN)
ALQ170 = days_4plus_drinks_occasion (convert 777 and 999 to NaN)

#Blood Pressure & Cholesterol
BPQ020 = high_bp (convert 7 and 9 to NaN)
BPD035 = age_hypertension (convert 777 and 999 to NaN)
BPQ040A = hypertension_prescription (convert 7 and 9 to NaN)
BPQ050A = high_bp_prescription (convert 7 and 9 to NaN)
BPQ080 = high_cholesterol (convert 7 and 9 to NaN)
BPQ090D = cholesterol_prescription (convert 7 and 9 to NaN)

#Cardiovascular Health
CDQ001 = chest_discomfort (convert 7 and 9 to NaN)

#Dermatology
DEQ034A = stay_in_shade (convert 77 and 99 to NaN)
DEQ034C = wear_long_sleeves (convert 7 and 9 to NaN)
DEQ034D = use_sunscreen (convert 7 and 9 to NaN)
DED120 = min_outdoors_workday (convert 7777 and 9999 to NaN)
DED125 = min_outdoors_nonworkday (convert 7777 and 9999 to NaN)

#Diabetes
DIQ010SAS = diabetes (convert 7 and 9 to NaN)
DID040 = diabetes_age (convert 777 and 999 to NaN)
DIQ160 = prediabetes (convert 7 and 9 to NaN)
DIQ180 = blood_tested (convert 7 and 9 to NaN)
DIQ050 = taking_insulin (convert 7 and 9 to NaN)
DIQ070 = diabetes_pills (convert 7 and 9 to NaN)
DIQ230 = time_since_diabetes_specialist (convert 7 and 9 to NaN)
DIQ240 = have_dr_for_diabetes (convert 7 and 9 to NaN)

#Diet Behavior & Nutrition
DBQ010 = breastfed (convert 7 and 9 to NaN)
DBQ197 = milk_consumption_freq (convert 4, 7, and 9 to NaN)
DBQ301 = govmnt_meal_delivery (convert 7 and 9 to NaN)
DBD895 = nonhomemade_meals (convert 5555, 7777, 9999 to NaN)
DBD900 = fastfood_meals (convert 5555, 7777, and 9999 to NaN)
DBD905 = readytoeat_meals (convert 6666, 7777, and 9999 to NaN)
DBD910 = frozen_pizza (convert 6666, 7777, and 9999 to NaN) 

#Early Childhood
ECD010 = mothers_age_at_birth (convert 7777 and 9999 to NaN)
ECQ020 = mother_smoke_during_preg (convert 7 and 9 to NaN)
ECD070A = weight_at_birth (convert 7777 and 9999 to NaN)

#Food Security
FSD151 = emergency_food_received (convert 7 and 9 to NaN)
FSQ165 = food_stamps_used (convert 7 and 9 to NaN)
FSQ653 = wic_benefit_used (convert 7 and 9 to NaN)

#Hospital Utilization & Access to Care
HUQ010 = general_health (convert 7 and 9 to NaN)
HUQ030 = regular_healthcare_place (convert 7 and 9 to NaN)
HUD062 = time_since_last_healthcare (convert 77 and 99 to NaN)
HUQ071 = overnight_in_hospital (convert 7 and 9 to NaN)
HUQ090 = seen_mental_health_professional (convert 7 and 9 to NaN)

#Health Insurance
HIQ011 = have_health_insurance (convert 7 and 9 to NaN)
HIQ032A = have_private_insurance (convert 77 and 99 to NaN)
HIQ270 = plan_cover_prescriptions (convert 7 and 9 to NaN)

#Income
INDFMMPI = family_poverty_level
INDFMMPC = family_poverty_level_category (convert 7 and 9 to NaN)

#Medical Conditions
MCQ010 = asthma (convert 7 and 9 to NaN)
AGQ030SAS = hay_fever (convert 7 and 9 to NaN)
MCQ053 = anemia_treatment (convert 7 and 9 to NaN)
MCQ092 = blood_transfusion (convert 7 and 9 to NaN)
MCQ160a = arthritis
MCQ160b = heart_failure
MCQ160c = coronary_heart_disease (convert 7 and 9 to NaN)
MCQ160d = angina_pectoris (convert 7 and 9 to NaN)
MCQ160e = heart_attack (convert 7 and 9 to NaN)
MCQ160f = stroke (convert 7 and 9 to NaN)
MCQ160m = thyroid_issues (convert 7 and 9 to NaN)
MCQ160p = respiratory_issues (convert 7 and 9 to NaN)
MCQ520 = abdominal_pain (convert 7 and 9 to NaN)
MCQ550 = gallstones (convert 7 and 9 to NaN)
MCQ560 = gallbladder_surgery (convert 7 and 9 to NaN)
MCQ220 = cancer (convert 7 and 9 to NaN)
MCQ366a = dr_recommend_lose_weight (convert 7 and 9 to NaN)
MCQ366b = dr_recommend_exercise (convert 7 and 9 to NaN)
MCQ366c = dr_recommend_reduce_salt (convert 7 and 9 to NaN)
MCQ366d = dr_recommend_reduce_fat (convert 7 and 9 to NaN)
MCQ371a = currently_losing_weight (convert 7 and 9 to NaN)
MCQ371b = currently_increase_exercise (convert 7 and 9 to NaN)
MCQ371c = currently_reducing_salt (convert 7 and 9 to NaN)
MCQ371d = currently_reducing_fat (convert 7 and 9 to NaN)
OSQ230 = metal_objects (convert 7 and 9 to NaN)

#Occupation
OCQ180 = hours_worked (convert 77777 and 99999 to NaN)
OCQ210 = over_35_hrs_worked (convert 7 and 9 to NaN)
OCQ670 = work_schedule (convert 7 and 9 to NaN)

#Oral Health
OHQ030 = last_dental_visit (convert 77 and 99 to NaN)
OHQ033 = last_dental_visit_reason (convert 7 and 9 to NaN)
OHQ770 = couldnt_get_dental_care (convert 7 and 9 to NaN)

#Osteoporosis
OSQ010a = broken_hip (convert 7 and 9 to NaN)
OSQ010b = broken_wrist (convert 7 and 9 to NaN)
OSQ010c = broken_spine (convert 7 and 9 to NaN)
OSQ080 = other_fracture (convert 7 and 9 to NaN)

#Physical Activity
PAQ605 = vigorous_work (convert 7 and 9 to NaN)
PAQ635 = walk_or_bicycle (convert 7 and 9 to NaN)
PAQ650 = vigorous_recreation (convert 7 and 9 to NaN)
PAQ665 = moderate_recreation (convert 7 and 9 to NaN)

#Reproductive Health
RHQ031 = regular_periods (convert 7 and 9 to NaN)
RHQ060 = age_last_period (convert 777 and 999 to NaN)
RHQ074 = try_pregnancy_1yr (convert 7 and 9 to NaN)
RHQ076 = see_dr_fertility (convert 7 and 9 to NaN)
RHQ078 = pelvic_infection (convert 7 and 9 to NaN)
RHD143 = pregnant_now (convert 7 and 9 to NaN)
RHQ160 = pregnancy_count (convert 77 and 99 to NaN)
RHQ162 = diabetes_pregnancy (convert 7 and 9 to NaN)
RHD167 = delivery_count (convert 77 and 99 to NaN)
RHQ171 = live_birth_count (convert 77 and 99 to NaN)
RHD180 = age_at_first_birth (convert 777 and 999 to NaN)
RHD190 = age_at_last_birth (convert 777 and 999 to NaN)
RHQ197 = months_since_birth (convert 777 and 999 to NaN)
RHQ540 = horomones_not_bc (convert 7 and 9 to NaN)

#Smoking
SMQ020 = smoked_100_cigs (convert 7 and 9 to NaN)
SMQ040 = currently_smoke (convert 7 and 9 to NaN)

#Weight History
WHD010 = height_in (convert 7777 and 9999 to NaN)
WHD020 = weight_lbs (convert 7777 and 9999 to NaN)
WHQ070 = attempt_weight_loss_1yr (convert 7 and 9 to NaN)

"""

#keep all original columns and create new sanitized columns with more descriptive names 

def sanitize_5(value):
    if value == 77777 or value == 99999:
        return np.nan
    else:
        return value
    
def sanitize_4(value):
    if value == 7777 or value == 9999:
        return np.nan
    else:
        return value

def sanitize_4_5(value):
    if value == 7777 or value == 9999 or value == 5555 or value == 6666:
        return np.nan
    else:
        return value
    
def sanitize_3(value):
    if value == 777 or value == 999:
        return np.nan
    else:
        return value
    
def sanitize_2(value):
    if value == 77 or value == 99:
        return np.nan
    else:
        return value
    
def sanitize_1(value):
    if value == 7 or value == 9:
        return np.nan
    else:
        return value

def sanitize_1_4(value):
    if value == 7 or value == 9 or value == 4:
        return np.nan
    else:
        return value
    
def convert_to_int(df):
    converted_columns = []
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = (df[col].fillna(0)
                      .astype(int)
                      .astype(object)
                      .where(df[col].notnull()))
            converted_columns.append(col)
    #if converted_columns == []:
    #    print('No columns were converted')
    #else:
    #    print('Converted columns ', converted_columns, ' to integer type')

#convert all columns to int
convert_to_int(df_cdc_joined_clean)

#Prior to week 7
df_cdc_joined_clean['age_with_angina_pectoris'] = df_cdc_joined_clean['MCD180D'].map(sanitize_5)
df_cdc_joined_clean['age_liver_condition'] = df_cdc_joined_clean['MCD180L'].map(sanitize_5)
df_cdc_joined_clean['age_range_first_menstrual_period'] = df_cdc_joined_clean['RHQ020'].map(sanitize_1)
df_cdc_joined_clean['annual_healthcare_visit_count'] = df_cdc_joined_clean['HUQ051'].map(sanitize_2)
df_cdc_joined_clean['have_liver_condition'] = df_cdc_joined_clean['MCQ170L'].map(sanitize_1)
df_cdc_joined_clean['type_of_work_done_last_week'] = df_cdc_joined_clean['OCD150'].map(sanitize_1)
df_cdc_joined_clean['weight_change_intentional'] = df_cdc_joined_clean['WHQ060'].map(sanitize_1)
df_cdc_joined_clean['days_nicotine_substitute_used'] = df_cdc_joined_clean['SMQ830'].map(sanitize_1)
df_cdc_joined_clean['pain_relief_from_cardio_recoverytime'] = df_cdc_joined_clean['CDQ006'].map(sanitize_1)
df_cdc_joined_clean = df_cdc_joined_clean.drop('BMIHT', axis=1)

#Alcohol Use
df_cdc_joined_clean['drank_alc'] = df_cdc_joined_clean['ALQ111'].map(sanitize_1)
df_cdc_joined_clean['alc_drinking_freq'] = df_cdc_joined_clean['ALQ121'].map(sanitize_2)
df_cdc_joined_clean['alc_per_day'] = df_cdc_joined_clean['ALQ130'].map(sanitize_3)
df_cdc_joined_clean['times_with_4or5_alc'] = df_cdc_joined_clean['ALQ142'].map(sanitize_2)
df_cdc_joined_clean['times_with_8plus_alc'] = df_cdc_joined_clean['ALQ280'].map(sanitize_2)
df_cdc_joined_clean['times_with_12plus_alc'] = df_cdc_joined_clean['ALQ290'].map(sanitize_2)
df_cdc_joined_clean['4plus_alc_daily'] = df_cdc_joined_clean['ALQ151'].map(sanitize_1)
df_cdc_joined_clean['days_4plus_drinks_occasion'] = df_cdc_joined_clean['ALQ170'].map(sanitize_3)

#Blood Pressure & Cholesterol
df_cdc_joined_clean['high_bp'] = df_cdc_joined_clean['BPQ020'].map(sanitize_1)
df_cdc_joined_clean['age_hypertension'] = df_cdc_joined_clean['BPD035'].map(sanitize_3)
df_cdc_joined_clean['hypertension_prescription'] = df_cdc_joined_clean['BPQ040A'].map(sanitize_1)
df_cdc_joined_clean['high_bp_prescription'] = df_cdc_joined_clean['BPQ050A'].map(sanitize_1)
df_cdc_joined_clean['high_cholesterol'] = df_cdc_joined_clean['BPQ080'].map(sanitize_1)
df_cdc_joined_clean['cholesterol_prescription'] = df_cdc_joined_clean['BPQ090D'].map(sanitize_1)

#Cardiovascular Health
df_cdc_joined_clean['chest_discomfort'] = df_cdc_joined_clean['CDQ001'].map(sanitize_1)

#Dermatology
# df_cdc_joined_clean['stay_in_shade'] = df_cdc_joined_clean['DEQ034A'].map(sanitize_2)
# df_cdc_joined_clean['wear_long_sleeves'] = df_cdc_joined_clean['DEQ034C'].map(sanitize_1)
# df_cdc_joined_clean['use_sunscreen'] = df_cdc_joined_clean['DEQ034D'].map(sanitize_1)
# df_cdc_joined_clean['min_outdoors_workday'] = df_cdc_joined_clean['DED120'].map(sanitize_4)
# df_cdc_joined_clean['min_outdoors_nonworkday'] = df_cdc_joined_clean['DED125'].map(sanitize_4)

#Diabetes
# df_cdc_joined_clean['diabetes'] = df_cdc_joined_clean['DIQ010SAS'].map(sanitize_1)
# df_cdc_joined_clean['diabetes_age'] = df_cdc_joined_clean['DID040'].map(sanitize_3)
# df_cdc_joined_clean['prediabetes'] = df_cdc_joined_clean['DIQ160'].map(sanitize_1)
# df_cdc_joined_clean['blood_tested'] = df_cdc_joined_clean['DIQ180'].map(sanitize_1)
# df_cdc_joined_clean['taking_insulin'] = df_cdc_joined_clean['DIQ050'].map(sanitize_1)
# df_cdc_joined_clean['diabetes_pills'] = df_cdc_joined_clean['DIQ070'].map(sanitize_1)
# df_cdc_joined_clean['time_since_diabetes_specialist'] = df_cdc_joined_clean['DIQ230'].map(sanitize_1)
# df_cdc_joined_clean['have_dr_for_diabetes'] = df_cdc_joined_clean['DIQ240'].map(sanitize_1)

#Diet Behavior & Nutrition
df_cdc_joined_clean['breastfed'] = df_cdc_joined_clean['DBQ010'].map(sanitize_1)
df_cdc_joined_clean['milk_consumption_freq'] = df_cdc_joined_clean['DBQ197'].map(sanitize_1_4)
df_cdc_joined_clean['govmnt_meal_delivery'] = df_cdc_joined_clean['DBQ301'].map(sanitize_1)
df_cdc_joined_clean['nonhomemade_meals'] = df_cdc_joined_clean['DBD895'].map(sanitize_4_5)
df_cdc_joined_clean['fastfood_meals'] = df_cdc_joined_clean['DBD900'].map(sanitize_4_5)
df_cdc_joined_clean['readytoeat_meals'] = df_cdc_joined_clean['DBD905'].map(sanitize_4_5)
df_cdc_joined_clean['frozen_pizza'] = df_cdc_joined_clean['DBD910'].map(sanitize_4_5)

#Early Childhood
# df_cdc_joined_clean['mothers_age_at_birth'] = df_cdc_joined_clean['ECD010'].map(sanitize_4)
# df_cdc_joined_clean['mother_smoke_during_preg'] = df_cdc_joined_clean['ECQ020'].map(sanitize_1)
# df_cdc_joined_clean['weight_at_birth'] = df_cdc_joined_clean['ECD070A'].map(sanitize_4)

#Food Security
df_cdc_joined_clean['emergency_food_received'] = df_cdc_joined_clean['FSD151'].map(sanitize_1)
df_cdc_joined_clean['food_stamps_used'] = df_cdc_joined_clean['FSQ165'].map(sanitize_1)
df_cdc_joined_clean['wic_benefit_used'] = df_cdc_joined_clean['FSQ653'].map(sanitize_1)

#Hospital Utilization & Access to Care
df_cdc_joined_clean['general_health'] = df_cdc_joined_clean['HUQ010'].map(sanitize_1)
df_cdc_joined_clean['regular_healthcare_place'] = df_cdc_joined_clean['HUQ030'].map(sanitize_1)
df_cdc_joined_clean['time_since_last_healthcare'] = df_cdc_joined_clean['HUD062'].map(sanitize_2)
df_cdc_joined_clean['overnight_in_hospital'] = df_cdc_joined_clean['HUQ071'].map(sanitize_1)
df_cdc_joined_clean['seen_mental_health_professional'] = df_cdc_joined_clean['HUQ090'].map(sanitize_1)

#Health Insurance
df_cdc_joined_clean['have_health_insurance'] = df_cdc_joined_clean['HIQ011'].map(sanitize_1)
df_cdc_joined_clean['have_private_insurance'] = df_cdc_joined_clean['HIQ032A'].map(sanitize_2)
df_cdc_joined_clean['plan_cover_prescriptions'] = df_cdc_joined_clean['HIQ270'].map(sanitize_1)

#Income
df_cdc_joined_clean['family_poverty_level'] = df_cdc_joined_clean['INDFMMPI']
df_cdc_joined_clean['family_poverty_level_category'] = df_cdc_joined_clean['INDFMMPC'].map(sanitize_1)

#Medical Conditions
df_cdc_joined_clean['asthma'] = df_cdc_joined_clean['MCQ010'].map(sanitize_1)
# df_cdc_joined_clean['hay_fever'] = df_cdc_joined_clean['AGQ030SAS'].map(sanitize_1)
df_cdc_joined_clean['anemia_treatment'] = df_cdc_joined_clean['MCQ053'].map(sanitize_1)
df_cdc_joined_clean['blood_transfusion'] = df_cdc_joined_clean['MCQ092'].map(sanitize_1)
df_cdc_joined_clean['arthritis'] = df_cdc_joined_clean['MCQ160A'].map(sanitize_1)
df_cdc_joined_clean['heart_failure'] = df_cdc_joined_clean['MCQ160B'].map(sanitize_1)
df_cdc_joined_clean['coronary_heart_disease'] = df_cdc_joined_clean['MCQ160C'].map(sanitize_1)
df_cdc_joined_clean['angina_pectoris'] = df_cdc_joined_clean['MCQ160D'].map(sanitize_1)
df_cdc_joined_clean['heart_attack'] = df_cdc_joined_clean['MCQ160E'].map(sanitize_1)
df_cdc_joined_clean['stroke'] = df_cdc_joined_clean['MCQ160F'].map(sanitize_1)
df_cdc_joined_clean['thyroid_issues'] = df_cdc_joined_clean['MCQ160M'].map(sanitize_1)
df_cdc_joined_clean['respiratory_issues'] = df_cdc_joined_clean['MCQ160P'].map(sanitize_1)
df_cdc_joined_clean['abdominal_pain'] = df_cdc_joined_clean['MCQ520'].map(sanitize_1)
df_cdc_joined_clean['gallstones'] = df_cdc_joined_clean['MCQ550'].map(sanitize_1)
df_cdc_joined_clean['gallbladder_surgery'] = df_cdc_joined_clean['MCQ560'].map(sanitize_1)
df_cdc_joined_clean['cancer'] = df_cdc_joined_clean['MCQ220'].map(sanitize_1)
df_cdc_joined_clean['dr_recommend_lose_weight'] = df_cdc_joined_clean['MCQ366A'].map(sanitize_1)
df_cdc_joined_clean['dr_recommend_exercise'] = df_cdc_joined_clean['MCQ366B'].map(sanitize_1)
df_cdc_joined_clean['dr_recommend_reduce_salt'] = df_cdc_joined_clean['MCQ366C'].map(sanitize_1)
df_cdc_joined_clean['dr_recommend_reduce_fat'] = df_cdc_joined_clean['MCQ366D'].map(sanitize_1)
df_cdc_joined_clean['currently_losing_weight'] = df_cdc_joined_clean['MCQ371A'].map(sanitize_1)
df_cdc_joined_clean['currently_increase_exercise'] = df_cdc_joined_clean['MCQ371B'].map(sanitize_1)
df_cdc_joined_clean['currently_reducing_salt'] = df_cdc_joined_clean['MCQ371C'].map(sanitize_1)
df_cdc_joined_clean['currently_reducing_fat'] = df_cdc_joined_clean['MCQ371D'].map(sanitize_1)
df_cdc_joined_clean['metal_objects'] = df_cdc_joined_clean['OSQ230'].map(sanitize_1)

#Occupation
df_cdc_joined_clean['hours_worked'] = df_cdc_joined_clean['OCQ180'].map(sanitize_5)
df_cdc_joined_clean['over_35_hrs_worked'] = df_cdc_joined_clean['OCQ210'].map(sanitize_1)
df_cdc_joined_clean['work_schedule'] = df_cdc_joined_clean['OCQ670'].map(sanitize_1)

#Oral Health
# df_cdc_joined_clean['last_dental_visit'] = df_cdc_joined_clean['OHQ030'].map(sanitize_2)
# df_cdc_joined_clean['last_dental_visit_reason'] = df_cdc_joined_clean['OHQ033'].map(sanitize_1)
# df_cdc_joined_clean['couldnt_get_dental_care'] = df_cdc_joined_clean['OHQ770'].map(sanitize_1)

#Osteoporosis
# df_cdc_joined_clean['broken_hip'] = df_cdc_joined_clean['OSQ010A'].map(sanitize_1)
# df_cdc_joined_clean['broken_wrist'] = df_cdc_joined_clean['OSQ010B'].map(sanitize_1)
# df_cdc_joined_clean['broken_spine'] = df_cdc_joined_clean['OSQ010C'].map(sanitize_1)
# df_cdc_joined_clean['other_fracture'] = df_cdc_joined_clean['OSQ080'].map(sanitize_1)

#Physical Activity
df_cdc_joined_clean['vigorous_work'] = df_cdc_joined_clean['PAQ605'].map(sanitize_1)
df_cdc_joined_clean['walk_or_bicycle'] = df_cdc_joined_clean['PAQ635'].map(sanitize_1)
df_cdc_joined_clean['vigorous_recreation'] = df_cdc_joined_clean['PAQ650'].map(sanitize_1)
df_cdc_joined_clean['moderate_recreation'] = df_cdc_joined_clean['PAQ665'].map(sanitize_1)

#Reproductive Health
df_cdc_joined_clean['regular_periods'] = df_cdc_joined_clean['RHQ031'].map(sanitize_1)
df_cdc_joined_clean['age_last_period'] = df_cdc_joined_clean['RHQ060'].map(sanitize_3)
df_cdc_joined_clean['try_pregnancy_1yr'] = df_cdc_joined_clean['RHQ074'].map(sanitize_1)
df_cdc_joined_clean['see_dr_fertility'] = df_cdc_joined_clean['RHQ076'].map(sanitize_1)
df_cdc_joined_clean['pelvic_infection'] = df_cdc_joined_clean['RHQ078'].map(sanitize_1)
df_cdc_joined_clean['pregnant_now'] = df_cdc_joined_clean['RHD143'].map(sanitize_1)
df_cdc_joined_clean['pregnancy_count'] = df_cdc_joined_clean['RHQ160'].map(sanitize_2)
df_cdc_joined_clean['diabetes_pregnancy'] = df_cdc_joined_clean['RHQ162'].map(sanitize_1)
df_cdc_joined_clean['delivery_count'] = df_cdc_joined_clean['RHD167'].map(sanitize_2)
df_cdc_joined_clean['live_birth_count'] = df_cdc_joined_clean['RHQ171'].map(sanitize_2)
df_cdc_joined_clean['age_at_first_birth'] = df_cdc_joined_clean['RHD180'].map(sanitize_3)
df_cdc_joined_clean['age_at_last_birth'] = df_cdc_joined_clean['RHD190'].map(sanitize_3)
df_cdc_joined_clean['months_since_birth'] = df_cdc_joined_clean['RHQ197'].map(sanitize_3)
df_cdc_joined_clean['horomones_not_bc'] = df_cdc_joined_clean['RHQ540'].map(sanitize_1)

#Smoking
df_cdc_joined_clean['smoked_100_cigs'] = df_cdc_joined_clean['SMQ020'].map(sanitize_1)
df_cdc_joined_clean['currently_smoke'] = df_cdc_joined_clean['SMQ040'].map(sanitize_1)

#Weight History
df_cdc_joined_clean['height_in'] = df_cdc_joined_clean['WHD010'].map(sanitize_4)
df_cdc_joined_clean['weight_lbs'] = df_cdc_joined_clean['WHD020'].map(sanitize_4)
df_cdc_joined_clean['attempt_weight_loss_1yr'] = df_cdc_joined_clean['WHQ070'].map(sanitize_1)

In [37]:
# total positive cases by file

df_cdc_joined_clean.groupby(['folder'])['MDD'].sum()

folder
2013_2014         404
2015_2016         392
2017_march2020    716
Name: MDD, dtype: int64

In [38]:
df_cdc_joined_clean.to_csv(
    path_or_buf = "../data/cdc_nhanes_survey_responses_clean.csv",
    index = False
)