In [1]:
import pandas as pd
import numpy as np
import requests
import os
import glob
from bs4 import BeautifulSoup

# get variable names

In [2]:
def get_variable_description(url):

    # get the html page with the variable codebook for all xpt files
    # use this to rename the columns to descriptive cols
    page = requests.get(url)

    # beautiful soup is good for parsing html
    soup = BeautifulSoup(page.content, 'html.parser')

    # this is the id for the table on the variable codebook bage
    tbl = soup.find("table",{"id":"GridView1"})

    # get the table and load into a df
    df_var_mapping = pd.read_html(str(tbl))[0]
    return df_var_mapping


questionnaire_var_desc = get_variable_description(
    url = 'https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component=Questionnaire&Cycle=2017-2020'
)

demo_var_desc = get_variable_description(
    url = 'https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component=Demographics&Cycle=2017-2020'
)

exam_var_desc = get_variable_description(
    url = 'https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component=Examination&Cycle=2017-2020'
)

df_var_mapping = pd.concat([questionnaire_var_desc,demo_var_desc,exam_var_desc])
#df_var_mapping

df_var_mapping.to_csv("df_var_mapping.csv")

In [3]:
# create a key value mapping between the variable name and description
var_map = dict(zip(
    df_var_mapping['Variable Name'], 
    df_var_mapping['Variable Description']
))

#var_map

# Read XPT Data

In [4]:
def read_xpt_files(folder_path):
    """Input folder path to read multiple xpt files in folder
       Returns dictionary with key: file name, 
                               value: df"""
    df_dict = {}
    for file_name in os.listdir(folder_path):
        if 'XPT' in os.path.splitext(file_name)[1]:
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_sas(file_path, format='xport')
            name = os.path.splitext(file_name)[0]
            if df.empty:
                raise Exception (f'Empty dataframe from file: {name}')
            df_dict[name] = df
        else:
            print(f'not loading file {file_name}')
    return df_dict



def full_outer_join(dataframes):
    joined_df = None
    for df in dataframes.values():
        if joined_df is None:
            joined_df = df
        else:
            joined_df = pd.merge(joined_df, df, on='SEQN', how='outer')
    return joined_df

## 2017 - March 2020 Data

In [5]:
# all data except for prescription medicine

folder_path = './xpt_data/2017_march2020'
dataframes_dict = read_xpt_files(folder_path)
df_cdc_joined_pre = full_outer_join(dataframes_dict)
# df_cdc_joined_pre

not loading file prescription_med
not loading file .ipynb_checkpoints


In [6]:
# prescription medicine survey - need to dedup

folder_path = './xpt_data/2017_march2020/prescription_med'
dataframes_dict = read_xpt_files(folder_path)
prescript_med = full_outer_join(dataframes_dict)

prescript_med['RXDRSD1']=prescript_med['RXDRSD1'].apply(lambda x: x.decode())
prescript_med['RXDRSD2']=prescript_med['RXDRSD2'].apply(lambda x: x.decode())
prescript_med['RXDRSD3']=prescript_med['RXDRSD3'].apply(lambda x: x.decode())

conditions = [
    (prescript_med["RXDRSD1"] == "Major depressive disorder, single episode, unspecified"),
    (prescript_med["RXDRSD1"] == "Major depressive disorder, recurrent, unspecified"),
    (prescript_med["RXDRSD2"] == "Major depressive disorder, single episode, unspecified"),
    (prescript_med["RXDRSD2"] == "Major depressive disorder, recurrent, unspecified"),
    (prescript_med["RXDRSD3"] == "Major depressive disorder, single episode, unspecified"),
    (prescript_med["RXDRSD3"] == "Major depressive disorder, recurrent, unspecified")
    ]

values = [1,1,1,1,1,1]
prescript_med['MDD'] = np.select(conditions, values)


# Sort by SEQN and MDD for any instances where the responder has MDD == 1, it will precede any rows where MDD == 0
# Only keep first row of each responder to retain rows where MDD == 1 if possible and MDD == 0 where not
prescript_med.sort_values(by=["SEQN"], axis = 0, ascending =[True], inplace=True)
prescript_med.sort_values(by=["MDD"], axis = 0, ascending =[False], inplace=True)
pm_dropped = prescript_med.drop_duplicates(subset = ['SEQN'], keep = 'first', inplace = False) 
pm_dropped = pm_dropped[['SEQN','MDD']]
# pm_dropped

In [7]:
# merge into 1 

df_cdc_joined = pd.merge(df_cdc_joined_pre, pm_dropped, on='SEQN', how='left')
df_cdc_joined

Unnamed: 0,SEQN,ALQ111,ALQ121,ALQ130,ALQ142,ALQ270,ALQ280,ALQ290,ALQ151,ALQ170,...,SMQ078,SMD641,SMD650,SMD100FL,SMD100MN,SMQ670,SMQ621,SMD630,SMAQUEX2,MDD
0,109266.0,1.0,1.000000e+01,1.0,5.397605e-79,,,,2.0,5.397605e-79,...,,,,,,,,,1.0,0
1,109271.0,1.0,5.397605e-79,,,,,,1.0,,...,2.0,30.0,20.0,,,1.0,,,1.0,0
2,109273.0,1.0,5.397605e-79,,,,,,2.0,,...,1.0,30.0,15.0,1.0,1.0,1.0,,,1.0,0
3,109274.0,1.0,4.000000e+00,2.0,5.000000e+00,7.0,5.397605e-79,,2.0,5.397605e-79,...,,,,,,,,,1.0,0
4,109282.0,1.0,5.397605e-79,,,,,,2.0,,...,,,,,,,,,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15555,121170.0,,,,,,,,,,...,,,,,,,,,,0
15556,121709.0,,,,,,,,,,...,,,,,,,,,,0
15557,122778.0,,,,,,,,,,...,,,,,,,,,,0
15558,122879.0,,,,,,,,,,...,,,,,,,,,,0


# Sanitize Data

In [149]:
# convert bytes and strip whitespace
df_cdc_joined_clean = df_cdc_joined.apply(lambda x: x.str.decode('utf-8').str.strip() if x.dtype == "object" else x)
# replace empty strings with nan
df_cdc_joined_clean = df_cdc_joined_clean.replace('', np.nan)
df_cdc_joined_clean

Unnamed: 0,SEQN,ALQ111,ALQ121,ALQ130,ALQ142,ALQ270,ALQ280,ALQ290,ALQ151,ALQ170,...,SMQ078,SMD641,SMD650,SMD100FL,SMD100MN,SMQ670,SMQ621,SMD630,SMAQUEX2,MDD
0,109266.0,1.0,1.000000e+01,1.0,5.397605e-79,,,,2.0,5.397605e-79,...,,,,,,,,,1.0,0
1,109271.0,1.0,5.397605e-79,,,,,,1.0,,...,2.0,30.0,20.0,,,1.0,,,1.0,0
2,109273.0,1.0,5.397605e-79,,,,,,2.0,,...,1.0,30.0,15.0,1.0,1.0,1.0,,,1.0,0
3,109274.0,1.0,4.000000e+00,2.0,5.000000e+00,7.0,5.397605e-79,,2.0,5.397605e-79,...,,,,,,,,,1.0,0
4,109282.0,1.0,5.397605e-79,,,,,,2.0,,...,,,,,,,,,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15555,121170.0,,,,,,,,,,...,,,,,,,,,,0
15556,121709.0,,,,,,,,,,...,,,,,,,,,,0
15557,122778.0,,,,,,,,,,...,,,,,,,,,,0
15558,122879.0,,,,,,,,,,...,,,,,,,,,,0


## Clean up ordinal variables

In [242]:
def clean_ordinal_col(x, column, max_number):
    
    col_value = round(x[column])
    acceptable_values = list(range(max_number+1))
    
    if col_value in acceptable_values:
        new_val = col_value
    else:
        new_val = None

    return new_val

ordinal_column_key = {
    'DPQ010':('little_interest_in_doing_things',3),
    'DPQ020':('feeling_down_depressed_hopeless',3),
    'DPQ030':('trouble_falling_or_staying_asleep',3),
    'DPQ040':('feeling_tired_or_having_little_energy',3),
    'DPQ050':('poor_appetitie_or_overeating',3),
    'DPQ060':('feeling_bad_about_yourself',3),
    'DPQ070':('trouble_concentrating',3),
    'DPQ080':('moving_or_speaking_to_slowly_or_fast',3),
    'DPQ090':('thoughts_you_would_be_better_off_dead',3),
    'DPQ100':('difficult_doing_daytoday_tasks',3),
    'ALQ121':('alcoholic_drinks_past_12mo',10),
    'DBQ700':('how_healthy_is_your_diet',5),
    'DID250':('count_days_seen_doctor_12mo',40),
    'PAQ670':('count_days_moderate_recreational_activity',7),   
    'PAD675':('count_minutes_moderate_recreational_activity',600),
    'PAD680':('count_minutes_moderate_sedentary_activity',1320),
    'PAQ706':('count_days_physical_activity_youth',7),   
    'WHQ225':('count_lost_10plus_pounds',4),
    'WHQ520':('count_tried_to_lose_weight_youth',3),
    'FSDHH':('food_security_level_household',4),   
    'FSDAD':('food_security_level_adult',4),
    'HUQ010':('general_health_condition',5),
    'HUD062':('duration_last_healthcare_visit',4),      
    'INDFMMPC':('monthly_poverty_index_category',3),
    'OCQ180':('count_hours_worked_last_week',80),
    'RIDAGEYR':('age_in_years',80),   
    'DMDEDUC2':('education_level',5)        
}

In [244]:
# for each ordinal variable create a new column that removes all the "missing response" or "don't know" so
# it can be treated as a range.

# For example, for column "DPQ010" values 7 = "Refused" & values 9 = "Don't know".
# The max_number for that column is equal to 3 so we set all values > 3 as null.

for key in ordinal_column_key.keys():
    
    col_name = key
    new_col_name = ordinal_column_key[key][0]
    max_num = ordinal_column_key[key][1]
    
    df_cdc_joined_clean[new_col_name] = df_cdc_joined_clean.apply(
        lambda x: clean_ordinal_col(x = x, column = col_name, max_number = max_num), axis = 1
    )

## Clean up category variables

In [246]:
def clean_category_col(x, column):
    
    col_value = x[column]
    
    if col_value == 1:
        new_val = 1
    elif col_value == 2:
        new_val = 0
    else:
        new_val = None

    return new_val

category_column_key = {
    'HIQ011':'has_health_insurance',
    'HIQ210':'has_health_insurance_gap'
    'SMQ681':'has_smoked_tabacco_last_5days',
    'RIAGENDR':'is_male',
    'DMDBORN4':'is_usa_born',
    'DIQ010':'has_diabetes',
    'MCQ080':'has_overweight_diagnosis',    
    'WHQ070':'has_tried_to_lose_weight_12mo'
    'RHQ131':'has_been_pregnant'
}

In [248]:
for key in category_column_key.keys():
    
    col_name = key
    new_col_name = category_column_key[key]
    
    df_cdc_joined_clean[new_col_name] = df_cdc_joined_clean.apply(
        lambda x: clean_category_col(x = x, column = col_name), axis = 1
    )

## one-off columns

In [None]:
df_cdc_joined_clean['monthly_poverty_index'] = df_cdc_joined_clean['INDFMMPI']

In [None]:
df_cdc_joined_clean

In [135]:
df_cdc_joined_clean.to_csv("df_cdc_joined_clean.csv")

# Misc QA & Sanitizations

In [107]:
# QA records in additional year folders

folder_path = './xpt_data/2015-2016'
dataframes_dict = read_xpt_files(folder_path)
df_cdc_joined = full_outer_join(dataframes_dict)
len(df_cdc_joined['SEQN'].unique())

5735

In [108]:
# QA records in additional year folders

folder_path = './xpt_data/2013-2014'
dataframes_dict = read_xpt_files(folder_path)
df_cdc_joined = full_outer_join(dataframes_dict)
len(df_cdc_joined['SEQN'].unique())

5924

In [109]:
dataframes_dict.keys()

dict_keys(['DPQ_H'])

## QA data

In [79]:
XPT_files = pd.DataFrame(columns = ['file_name', 'count_seqn', 'count_seqn_duplicate'])
SEQN_list = []


for key in dataframes_dict.keys():
    file_name = key
    count_seqn = len(dataframes_dict.get(key)['SEQN'].unique())
    count_seqn_duplicate = sum(dataframes_dict.get(key)['SEQN'].duplicated())
    XPT_file = pd.DataFrame([[file_name,count_seqn,count_seqn_duplicate]], columns=['file_name', 'count_seqn', 'count_seqn_duplicate'])
    XPT_files = XPT_files.append(XPT_file)
    SEQN_list.extend(dataframes_dict.get(key)['SEQN'].unique())
    

In [88]:
print(f"There is a total of {len(dataframes_dict.keys())} files")

There is a total of 22 files


In [89]:
print(f"There is a total of {len(np.unique(SEQN_list))} respondents")

There is a total of 15560 respondents


In [86]:
XPT_files

Unnamed: 0,file_name,count_seqn,count_seqn_duplicate
0,P_ALQ,8965,0
0,P_RHQ,5314,0
0,P_BMX,14300,0
0,P_PAQ,9693,0
0,P_DPQ,8965,0
0,P_OCQ,10195,0
0,P_WHQ,10195,0
0,P_WHQMEC,2211,0
0,P_DIQ,14986,0
0,P_RXQ_RX,15560,17402


# filter to columns we care about

In [7]:
cols_to_keep = '''SEQN
FSD652CW
HUQ010
HUQ030
HUQ090
DPQ010
DPQ020
DPQ030
DPQ040
DPQ050
DPQ060
DPQ070
DPQ080
DPQ090
DPQ100
RXDUSE
RXDDAYS
RXDRSC1
RXDRSC2
RXDRSC3
RXDRSD1
RXDRSD2
RXDRSD3
RHQ074
RHQ076
RHD167
RHQ171'''.split()
cols_to_keep

['SEQN',
 'FSD652CW',
 'HUQ010',
 'HUQ030',
 'HUQ090',
 'DPQ010',
 'DPQ020',
 'DPQ030',
 'DPQ040',
 'DPQ050',
 'DPQ060',
 'DPQ070',
 'DPQ080',
 'DPQ090',
 'DPQ100',
 'RXDUSE',
 'RXDDAYS',
 'RXDRSC1',
 'RXDRSC2',
 'RXDRSC3',
 'RXDRSD1',
 'RXDRSD2',
 'RXDRSD3',
 'RHQ074',
 'RHQ076',
 'RHD167',
 'RHQ171']

# rename based on the mapping obtained from cdc codebook

In [8]:
df_cdc_joined_clean_trim = df_cdc_joined_clean[cols_to_keep]
df_cdc_joined_clean_trim

Unnamed: 0,SEQN,FSD652CW,HUQ010,HUQ030,HUQ090,DPQ010,DPQ020,DPQ030,DPQ040,DPQ050,...,RXDRSC1,RXDRSC2,RXDRSC3,RXDRSD1,RXDRSD2,RXDRSD3,RHQ074,RHQ076,RHD167,RHQ171
0,109264.0,,2.0,1.0,1.0,,,,,,...,,,,,,,,,,
1,109266.0,,3.0,1.0,2.0,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,...,,,,,,,2.0,2.0,,
2,109277.0,,3.0,1.0,2.0,,,,,,...,,,,,,,,,,
3,109279.0,,3.0,1.0,1.0,,,,,,...,F90,R41,,Attention-deficit hyperactivity disorders,Other symptoms and signs involving cognitive f...,,,,,
4,109279.0,,3.0,1.0,1.0,,,,,,...,G47.9,,,"Sleep disorder, unspecified",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32957,124806.0,,1.0,1.0,2.0,,,,,,...,L20.9,,,"Atopic dermatitis, unspecified",,,,,,
32958,124808.0,,2.0,1.0,1.0,,,,,,...,Z79.3,,,Long term (current) use of hormonal contracept...,,,,,,
32959,124816.0,,1.0,1.0,,,,,,,...,,,,,,,,,,
32960,124819.0,,3.0,1.0,,,,,,,...,,,,,,,,,,


# rename columns with var descriptions

In [34]:
df_cdc_joined_clean_trim.rename(columns=var_map, inplace=True)
df_cdc_joined_clean_trim

Unnamed: 0,Respondent sequence number.,"After your {last} child was born, did you use WIC benefits to buy food for yourself?",{First/Next} I have some general questions about {your/SP's} health. Would you say {your/SP's} health in general is . . .,Is there a place that {you/SP} usually {go/goes} when {you are/he/she is} sick or {you/s/he} need{s} advice about {your/his/her} health?,"During the past 12 months, that is since {DISPLAY CURRENT MONTH} of {DISPLAY LAST YEAR}, {have you/has SP} seen or talked to a mental health professional such as a psychologist, psychiatrist, psychiatric nurse or clinical social worker about {your/his/her} health?","Over the last 2 weeks, how often have you been bothered by the following problems: little interest or pleasure in doing things? Would you say...","[Over the last 2 weeks, how often have you been bothered by the following problems:] feeling down, depressed, or hopeless?","[Over the last 2 weeks, how often have you been bothered by the following problems:] trouble falling or staying asleep, or sleeping too much?","[Over the last 2 weeks, how often have you been bothered by the following problems:] feeling tired or having little energy?","[Over the last 2 weeks, how often have you been bothered by the following problems:] poor appetite or overeating?",...,ICD-10-CM code 1.,ICD-10-CM code 2.,ICD-10-CM code 3.,ICD-10-CM code 1 description.,ICD-10-CM code 2 description.,ICD-10-CM code 3 description.,The next questions are about {your/SP's} pregnancy history. {Have you/Has SP} ever attempted to become pregnant over a period of at least a year without becoming pregnant?,{Have you/Has SP} ever been to a doctor or other medical provider because {you have/she has} been unable to become pregnant?,How many deliveries {have you/has SP} had? (Please count all vaginal and Cesarean deliveries and count stillbirths as well as live births.),How many of {your/her} deliveries resulted {Did {your/her} delivery result} in a live birth?
0,109264.0,,2.0,1.0,1.0,,,,,,...,,,,,,,,,,
1,109266.0,,3.0,1.0,2.0,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,...,,,,,,,2.0,2.0,,
2,109277.0,,3.0,1.0,2.0,,,,,,...,,,,,,,,,,
3,109279.0,,3.0,1.0,1.0,,,,,,...,F90,R41,,Attention-deficit hyperactivity disorders,Other symptoms and signs involving cognitive f...,,,,,
4,109279.0,,3.0,1.0,1.0,,,,,,...,G47.9,,,"Sleep disorder, unspecified",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32957,124806.0,,1.0,1.0,2.0,,,,,,...,L20.9,,,"Atopic dermatitis, unspecified",,,,,,
32958,124808.0,,2.0,1.0,1.0,,,,,,...,Z79.3,,,Long term (current) use of hormonal contracept...,,,,,,
32959,124816.0,,1.0,1.0,,,,,,,...,,,,,,,,,,
32960,124819.0,,3.0,1.0,,,,,,,...,,,,,,,,,,


In [10]:

with pd.option_context('display.max_columns', None, 'display.float_format', lambda x: '%.3f' % x): # also can do all rows using 'display.max_rows', None, 
    display(df_cdc_joined_clean_trim.describe())

Unnamed: 0,Respondent sequence number.,"After your {last} child was born, did you use WIC benefits to buy food for yourself?",{First/Next} I have some general questions about {your/SP's} health. Would you say {your/SP's} health in general is . . .,Is there a place that {you/SP} usually {go/goes} when {you are/he/she is} sick or {you/s/he} need{s} advice about {your/his/her} health?,"During the past 12 months, that is since {DISPLAY CURRENT MONTH} of {DISPLAY LAST YEAR}, {have you/has SP} seen or talked to a mental health professional such as a psychologist, psychiatrist, psychiatric nurse or clinical social worker about {your/his/her} health?","Over the last 2 weeks, how often have you been bothered by the following problems: little interest or pleasure in doing things? Would you say...","[Over the last 2 weeks, how often have you been bothered by the following problems:] feeling down, depressed, or hopeless?","[Over the last 2 weeks, how often have you been bothered by the following problems:] trouble falling or staying asleep, or sleeping too much?","[Over the last 2 weeks, how often have you been bothered by the following problems:] feeling tired or having little energy?","[Over the last 2 weeks, how often have you been bothered by the following problems:] poor appetite or overeating?","[Over the last 2 weeks, how often have you been bothered by the following problems:] feeling bad about yourself - or that you are a failure or have let yourself or your family down?","[Over the last 2 weeks, how often have you been bothered by the following problems:] trouble concentrating on things, such as reading the newspaper or watching TV?","[Over the last 2 weeks, how often have you been bothered by the following problems:] moving or speaking so slowly that other people could have noticed? Or the opposite - being so fidgety or restless that you have been moving around a lot more than usual?","[Over the last 2 weeks, how often have you been bothered by the following problems:] Thoughts that you would be better off dead or of hurting yourself in some way?","How difficult have these problems made it for you to do your work, take care of things at home, or get along with people?","In the past 30 days, have you used or taken medication for which a prescription is needed? Do not include prescription vitamins or minerals you may have already told me about.",For how long have you been using or taking {PRODUCT NAME}?,The next questions are about {your/SP's} pregnancy history. {Have you/Has SP} ever attempted to become pregnant over a period of at least a year without becoming pregnant?,{Have you/Has SP} ever been to a doctor or other medical provider because {you have/she has} been unable to become pregnant?,How many deliveries {have you/has SP} had? (Please count all vaginal and Cesarean deliveries and count stillbirths as well as live births.),How many of {your/her} deliveries resulted {Did {your/her} delivery result} in a live birth?
count,32962.0,379.0,32962.0,32962.0,31113.0,22468.0,22450.0,22450.0,22447.0,22447.0,22446.0,22446.0,22446.0,22433.0,16304.0,32962.0,24031.0,5600.0,5600.0,10231.0,9777.0
mean,117047.003,1.865,2.891,1.092,1.848,0.527,0.508,0.841,0.959,0.527,0.345,0.378,0.256,0.089,0.446,1.267,4175.267,1.899,1.926,2.697,2.682
std,4489.493,1.313,1.151,0.335,0.387,0.925,0.907,1.119,1.073,0.934,0.829,0.838,0.758,0.482,0.783,0.465,14172.014,0.399,0.387,2.829,1.811
min,109263.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,113128.0,1.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,365.0,2.0,2.0,2.0,2.0
50%,117020.5,2.0,3.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1095.0,2.0,2.0,2.0,2.0
75%,120928.75,2.0,4.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,3650.0,2.0,2.0,4.0,4.0
max,124822.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,99999.0,9.0,9.0,99.0,77.0
