# Data Pipeline

In [None]:
import pandas as pd
import numpy as np
import requests
import os
import glob
from bs4 import BeautifulSoup

# get all xpt file names

In [None]:

# xpt_file_list = glob.glob('./xpt_data/*.XPT')
# print(os.path.basename(xpt_file_list[0]).split('.')[0])

# get variable names

In [None]:
# get the html page with the variable codebook for all xpt files
# use this to rename the columns to descriptive cols
from io import StringIO

url = 'https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component=Questionnaire&Cycle=2017-2020'
# a timeout keeps the notebook from hanging indefinitely if the site stalls
page = requests.get(url, timeout=60)
# fail here on a 4xx/5xx response instead of trying to parse an error page
page.raise_for_status()

# beautiful soup is good for parsing html
soup = BeautifulSoup(page.content, 'html.parser')

# this is the id for the table on the variable codebook page
tbl = soup.find("table", {"id": "GridView1"})
if tbl is None:
    # without this check str(None) would be handed to read_html,
    # which fails with an unrelated "no tables found" message
    raise ValueError(f'Could not find table with id "GridView1" at {url}')

# get the table and load into a df
# read_html expects a file-like object; passing literal html is deprecated
df_var_mapping = pd.read_html(StringIO(str(tbl)))[0]
df_var_mapping

In [None]:
# build a lookup from each variable name to its description
var_names = df_var_mapping['Variable Name']
var_descriptions = df_var_mapping['Variable Description']
var_map = dict(zip(var_names, var_descriptions))

var_map

In [None]:

def read_xpt_files(folder_path):
    """Load every SAS transport (XPT) file in a folder into a DataFrame.

    Files are selected by a case-insensitive ``.xpt`` extension and read in
    sorted file-name order, so the returned dictionary (and anything built
    by iterating over it) is the same on every run and platform.

    Args:
        folder_path: directory containing the XPT files.

    Returns:
        Dictionary with key: file name without extension,
                        value: df read from that file.

    Raises:
        ValueError: if a file is read successfully but yields an empty df.
    """
    df_dict = {}
    # os.listdir returns entries in arbitrary order; sort for reproducibility
    for file_name in sorted(os.listdir(folder_path)):
        name, extension = os.path.splitext(file_name)
        # exact, case-insensitive match: accepts .XPT and .xpt,
        # rejects look-alikes such as editor backups (.XPT~)
        if extension.lower() != '.xpt':
            print(f'not loading file {file_name}')
            continue
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_sas(file_path, format='xport')
        if df.empty:
            raise ValueError(f'Empty dataframe from file: {name}')
        df_dict[name] = df
    return df_dict



def full_outer_join(dataframes, on='SEQN'):
    """Outer-join every DataFrame in a dictionary on a shared key column.

    The frames are merged left to right in the dictionary's iteration order.

    Args:
        dataframes: dictionary whose values are the DataFrames to join
            (the keys are not used).
        on: name of the key column present in every frame. Defaults to
            'SEQN', the key this notebook joins on.

    Returns:
        The merged DataFrame. With a single frame that same object is
        returned unchanged; with an empty dictionary the result is None.
    """
    joined_df = None
    for df in dataframes.values():
        if joined_df is None:
            # first frame seeds the result; nothing to merge yet
            joined_df = df
        else:
            # how='outer' keeps rows whose key appears in either side
            joined_df = pd.merge(joined_df, df, on=on, how='outer')
    return joined_df


# folder holding the XPT files, relative to the current working directory
folder_path = './xpt_data/'
# load each XPT file into its own df, keyed by file name
dataframes_dict = read_xpt_files(folder_path)
# combine all loaded files into one table with an outer join on SEQN
df_cdc_joined = full_outer_join(dataframes_dict)
df_cdc_joined

In [None]:
# list each column's dtype; object columns are the ones decoded from bytes in the next step
df_cdc_joined.dtypes

# convert bytes and clean columns

In [None]:
def _decode_and_strip(column):
    """Decode an object column from utf-8 bytes and strip whitespace;
       any other column is passed through untouched"""
    if column.dtype == "object":
        return column.str.decode('utf-8').str.strip()
    return column


# decode/strip column by column, then turn empty strings into nan
df_cdc_joined_clean = df_cdc_joined.apply(_decode_and_strip).replace('', np.nan)
df_cdc_joined_clean

# filter to columns we care about

In [None]:
# variable codes to keep, written out as an explicit list
cols_to_keep = [
    'SEQN',
    'FSD652CW',
    'HUQ010', 'HUQ030', 'HUQ090',
    'DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050',
    'DPQ060', 'DPQ070', 'DPQ080', 'DPQ090', 'DPQ100',
    'RXDUSE', 'RXDDAYS',
    'RXDRSC1', 'RXDRSC2', 'RXDRSC3',
    'RXDRSD1', 'RXDRSD2', 'RXDRSD3',
    'RHQ074', 'RHQ076', 'RHD167', 'RHQ171',
]
cols_to_keep

# trim to the selected columns (they are renamed from the cdc codebook mapping in the next step)

In [None]:
# keep only the selected columns; raises KeyError if any of them is missing.
# the explicit copy makes this an independent frame, so modifying it later
# cannot raise SettingWithCopyWarning or touch df_cdc_joined_clean
df_cdc_joined_clean_trim = df_cdc_joined_clean[cols_to_keep].copy()
df_cdc_joined_clean_trim

# rename columns with var descriptions

In [None]:
# swap variable codes for their codebook descriptions; codes that are not
# in var_map keep their original name.
# reassign the returned frame instead of using inplace=True, which can raise
# SettingWithCopyWarning on a frame produced by column selection
df_cdc_joined_clean_trim = df_cdc_joined_clean_trim.rename(columns=var_map)
df_cdc_joined_clean_trim

# quick check on stats of selected cols

In [None]:
# summary statistics for the selected columns
selected_col_stats = df_cdc_joined_clean_trim.describe()

# override the pandas display options only inside the with block:
# show every column and suppress sci notation,
# leaving the pandas output settings everywhere else untouched
# (all rows can be shown too by adding 'display.max_rows', None)
with pd.option_context(
    'display.max_columns', None,
    'display.float_format', lambda value: '%.3f' % value,
):
    display(selected_col_stats)