In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
import json
from time import gmtime, strftime, localtime
import glob
from sqlalchemy import create_engine
import mysql.connector

In [3]:
base_cdc_url = 'https://wwwn.cdc.gov'

In [7]:
def get_table_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    table = soup.find(lambda tag: tag.has_attr('id') and tag['id']=="GridView1")
    
# Lambda expression for all links that end with XPT
    link_list = table.findAll(lambda tag: tag.name=='a' and tag['href'].endswith(".XPT"))
    links_only = [link.get('href') for link in link_list]
    
    return links_only


def get_multi_year(data_type, base_url):
    datatype_dict = {'demographics':'Demographics', 'dietary':'Dietary',
                     'examination':'Examination', 'laboratory':'Laboratory', 
                     'questionnaire':'Questionnaire'}
    # Can add years as future years are added
    year_list = [1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015]
    data_links = []
    for year in year_list:
        url = f"{base_url}/nchs/nhanes/search/datapage.aspx?Component={datatype_dict[data_type]}&CycleBeginYear={year}"
        temp_data_links = get_table_links(url)
        for data in temp_data_links:
            if data not in data_links:
                data_links.append(data)
                print(f"Added {data} from {year}")
        time.sleep(1)

    return data_links

def get_column_labels(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    
    # Codebook section of documentation
    # TODO -- take section or pdf htm pages
    codebook_links = soup.findAll('div', id='CodebookLinks')[0].findAll('a')
    
    dictionary = {link.string.split('-')[0].strip() : link.string.split('-')[1].strip() for link in codebook_links}
    return dictionary

def download_data(data_type, link_list, base_url):
    cwd = os.getcwd()
    try:
        os.mkdir(data_type)
        print(f'Created {data_type} folder')
    except:
        print(f'{data_type} folder exists')
    for link in link_list:
        item_name = link.split('/')[-1]
        exists = os.path.isfile(f'{cwd}/{data_type}/{item_name}')
        if exists:
            print(f'{item_name} already exists')
        else:
            current_time = time.time()
            print(f'Downloading {item_name} at {strftime("%a, %d %b %Y %H:%M:%S", localtime())}')
            r = requests.get(base_url + link, allow_redirects=True)
            open(f'{cwd}/{data_type}/{item_name}', 'wb').write(r.content)
            time_elapsed = time.time() - current_time
            print(f'Downloaded {item_name} at {time_elapsed}s')

def create_xpt_dict(data_type):
    original_file_names = {}
    group_file_names = []
    for file in glob.glob(f'{data_type}/*'):
        xpt_file = file.split('/')[1]
        if len(xpt_file.split('_'))== 1:
            original_file_names[xpt_file.split('.')[0]] = [xpt_file]
    for file in glob.glob(f'{data_type}/*'):
        xpt_file = file.split('/')[1]
        if len(xpt_file.split('_'))> 1:
            try:
                xpt_name = xpt_file.split('_')[0]
                original_file_names[f'{xpt_name}'].append(xpt_file)
            except KeyError as e:
                xpt_name = xpt_file.split('_')[0]
                original_file_names[f'{xpt_name}'] = [xpt_file]               
    return original_file_names
        
        
def combine_tables(data_type, xpt_dict):
    temp_df_list = []
    cwd = os.getcwd()
    for keys, values in xpt_dict[data_type].items():
        for value in values:
            print(f'Trying {cwd}/{data_type}/{value}')
            temp_df_list.append(pd.read_sas(f'{cwd}/{data_type}/{value}'))
            print(f'{cwd}/{data_type}/{value} appended')
    return pd.concat(temp_df_list)
                  
def grab_empty_files(data_type, base_url):
    empty_list = []
    cwd = os.getcwd()
    for file in glob.glob(f'{cwd}/{data_type}/*'):
        if os.stat(file).st_size == 0:
            empty_list.append(file)
            os.remove(file)
    if len(empty_list) == 0:
        print("There are no empty files in this folder")
    else:
        print(f"Now re-downloading {len(empty_list)} files")
        download_data(data_type, empty_list, base_url)


In [None]:
demographic_links = get_multi_year('demographics', base_cdc_url)
dietary_links = get_multi_year('dietary', base_cdc_url)
examination_links = get_multi_year('examination', base_cdc_url)
laboratory_links = get_multi_year('laboratory', base_cdc_url)
questionnaire_links = get_multi_year('questionnaire', base_cdc_url)

link_dictionary = {'demographics':demographic_links, 'dietary':dietary_links, 
                   'examination':examination_links, 'laboratory':laboratory_links,
                  'questionnaire':questionnaire_links}
with open('xpt_link_dict.json', 'w') as f:
    json.dump(link_dictionary, f)
    
# Create the xpt_file_dict json for individual table creation and anticipation of merged tabes
# xpt_file_dict = {}
# for keys in link_dictionary:
#     xpt_file_dict[keys] = create_xpt_dict(keys)
    
# with open('xpt_file_dict.json', 'w') as f:
#     json.dump(xpt_file_dict, f)
    

    
# # Download data - 
# download_data('demographics', xpt_link_dictionary['demographics'], base_cdc_url)
# download_data('dietary', xpt_link_dictionary['dietary'], base_cdc_url)
# download_data('examination', xpt_link_dictionary['examination'], base_cdc_url)
# download_data('laboratory', xpt_link_dictionary['laboratory'], base_cdc_url)
# download_data('questionnaire', xpt_link_dictionary['questionnaire'], base_cdc_url)


#Ensure DB max_allowed_packet is set to 1G
def send_to_db(user,password,host,port,database, data_type, link_dict, file_dict):
    engine = create_engine(f'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}', 
                           echo=False)
    
    cwd = os.getcwd()
    counter = 0
    db_name_dict = create_db_names(data_type, link_dict, file_dict)
    for file in glob.glob(f'{cwd}/{data_type}/*'):
        file_name = file.split('/')[-1]
        print(f"Creating dataframe from {file}")
        temp_df = pd.read_sas(file, encoding='ISO-8859-1')
        print(f'Sending to MySQL Server as {db_name_dict[file_name][1]}')
        try:
            temp_df.to_sql(name=f'{db_name_dict[file_name][1]}', con=engine, if_exists='fail', index=False)
            counter += 1
        except ValueError as e:
            print(file_name + "is present")
            print(e)
            
        print('Now cleaning up db')
        del temp_df
    print(f'Added {counter} databases')



#   This will create file names that append the start year last 2 digits ie. 99 for 1999 and prefix DIET, DEMO, LAB, EXAM, QUEST for the respective filename. It will use the base file name ie. DEMO from DEMO_H.XPT as the filename
def create_db_names(data_type, link_dict, file_dict):
    
#   Exludes a DEMO preview because single tables do not need DEMO_DEMO
    prefix_dict = {'demographics': '', 'dietary': 'DIET_', 'examination': 'EXAM_', 
                   'laboratory': 'LAB_', 'questionnaire': 'QUEST_'}
    
    temp_dict = {}
    
#   Create temp_dict[filename:['2digit year']]
    for link in xpt_link_dictionary[data_type]:
            temp_dict[link.split('/')[-1]] = [link.split('/')[-2][2:4]]
            
#   Add prefix and DB name to temp_dict[xpt_filename: ['2digit year', 'DB Name example DIET_DSBI_99']]
    for key, values in xpt_file_dictionary[data_type].items():
        for value in values:
            if len(value.split('_')) > 2:
                #If there are multiple for same  year in sequence for instance lipids second value
                temp_dict[value].append(f'{prefix_dict[data_type]}'+ value[:-6] + "_" + temp_dict[value][0])
            else:
                temp_dict[value].append(f'{prefix_dict[data_type]}'+ key + "_" + temp_dict[value][0])
    
    return temp_dict

#Setting UTF8 and latin1 encoding errors 
#DSII does not play nice with UTF and encoding errors row '\xC2\x92S MU...' for column 'DSDSUPP' at row 74590
#DSPI Incorrect string value: '\xC2\x92S MU...' for column 'DSDSUPP' at row 6913


In [137]:
xpt_link_dictionary = json.loads(open('xpt_link_dict.json').read())
xpt_file_dictionary = json.loads(open('xpt_file_dict.json').read())

In [161]:
demo_combo_df = combine_tables('demographics', xpt_file_dict)

Trying /Users/tomnahass/development/nhanes/demographics/DEMO.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_G.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_G.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_F.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_F.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_D.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_D.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_E.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_E.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_B.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_B.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_C.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_C.XPT appended
Trying /Users/tomnahass/development/nhanes/demograph

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [3]:
df99 = pd.read_sas("demographics/DEMO.XPT")
df01 = pd.read_sas('demographics/DEMO_B.XPT')
df03 = pd.read_sas('demographics/DEMO_C.XPT')
df05 = pd.read_sas('demographics/DEMO_D.XPT')
df07 = pd.read_sas('demographics/DEMO_E.XPT')
df09 = pd.read_sas('demographics/DEMO_F.XPT')
df11 = pd.read_sas('demographics/DEMO_G.XPT')
df13 = pd.read_sas('demographics/DEMO_H.XPT')
df15 = pd.read_sas('demographics/DEMO_I.XPT')

In [20]:
df13.columns

Index(['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN',
       'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM', 'DMQMILIZ', 'DMQADFC',
       'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL',
       'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY',
       'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ',
       'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND', 'DMDHRAGE',
       'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR',
       'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2', 'INDFMPIR'],
      dtype='object')

In [26]:
df11.columns

Index(['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN',
       'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGY', 'RIDEXAGM', 'DMQMILIZ',
       'DMQADFC', 'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2',
       'DMDMARTL', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG',
       'FIAPROXY', 'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA',
       'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2',
       'INDFMPIR', 'DMDHHSIZ', 'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE',
       'DMDHRGND', 'DMDHRAGE', 'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU'],
      dtype='object')

In [28]:
df99to15 = pd.concat([df99, df01, df03,df05,df07,df09,df11,df13,df15], sort=False)

In [66]:
# Removed WTIRE and WTMREP 52 columns x2, left with 66 columns
df99to15_cleaned = df99to15[df99to15.columns[~df99to15.columns.str.match('(WTIRE|WTMREP)')]]

In [28]:
engine = create_engine('mysql+mysqlconnector://tom:password@localhost:3306/nhanes', echo=False)

In [72]:
df99to15_clean = df99to15_cleaned.set_index('SEQN')

In [77]:
df99to15_clean.index.value_counts()

1

In [74]:
df99to15_clean.to_csv('demographics_1999-2016.csv')

In [7]:
df99.to_sql(name='demo_99', con=engine, if_exists='append', index=False)


In [61]:
send_to_db('tom', 'password', 'localhost','3306', 'nhanes', 'dietary', xpt_link_dictionary, xpt_file_dictionary)

Creating dataframe from /home/tom/development/nhanes-download/dietary/DR2IFF_D.XPT
Sending to MySQL Server as DIET_DR2IFF_05
DR2IFF_D.XPTis present
Table 'DIET_DR2IFF_05' already exists.
Now cleaning up db
Creating dataframe from /home/tom/development/nhanes-download/dietary/DR2TOT_E.XPT
Sending to MySQL Server as DIET_DR2TOT_07
DR2TOT_E.XPTis present
Table 'DIET_DR2TOT_07' already exists.
Now cleaning up db
Creating dataframe from /home/tom/development/nhanes-download/dietary/DR2TOT_F.XPT
Sending to MySQL Server as DIET_DR2TOT_09
DR2TOT_F.XPTis present
Table 'DIET_DR2TOT_09' already exists.
Now cleaning up db
Creating dataframe from /home/tom/development/nhanes-download/dietary/DR2TOT_H.XPT
Sending to MySQL Server as DIET_DR2TOT_13
DR2TOT_H.XPTis present
Table 'DIET_DR2TOT_13' already exists.
Now cleaning up db
Creating dataframe from /home/tom/development/nhanes-download/dietary/DSQ2_B.XPT
Sending to MySQL Server as DIET_DSQ2_01
DSQ2_B.XPTis present
Table 'DIET_DSQ2_01' already exist

In [24]:
create_db_names('dietary', xpt_link_dictionary, xpt_file_dictionary)

{'DRXIFF.XPT': ['99', 'DIET_DRXIFF_99'],
 'DRXTOT.XPT': ['99', 'DIET_DRXTOT_99'],
 'DRXFMT.XPT': ['99', 'DIET_DRXFMT_99'],
 'DSBI.XPT': ['99', 'DIET_DSBI_99'],
 'DSII.XPT': ['99', 'DIET_DSII_99'],
 'DSPI.XPT': ['99', 'DIET_DSPI_99'],
 'DSQFILE1.XPT': ['99', 'DIET_DSQFILE1_99'],
 'DSQFILE2.XPT': ['99', 'DIET_DSQFILE2_99'],
 'DRXIFF_B.XPT': ['01', 'DIET_DRXIFF_01'],
 'DRXTOT_B.XPT': ['01', 'DIET_DRXTOT_01'],
 'DRXFMT_B.XPT': ['01', 'DIET_DRXFMT_01'],
 'DSQ1_B.XPT': ['01', 'DIET_DSQ1_01'],
 'DSQ2_B.XPT': ['01', 'DIET_DSQ2_01'],
 'DR1IFF_C.XPT': ['03', 'DIET_DR1IFF_03'],
 'DR2IFF_C.XPT': ['03', 'DIET_DR2IFF_03'],
 'DR1TOT_C.XPT': ['03', 'DIET_DR1TOT_03'],
 'DR2TOT_C.XPT': ['03', 'DIET_DR2TOT_03'],
 'DRXFCD_C.XPT': ['03', 'DIET_DRXFCD_03'],
 'DRXMCD_C.XPT': ['03', 'DIET_DRXMCD_03'],
 'DSQ1_C.XPT': ['03', 'DIET_DSQ1_03'],
 'DSQ2_C.XPT': ['03', 'DIET_DSQ2_03'],
 'FOODLK_C.XPT': ['03', 'DIET_FOODLK_03'],
 'VARLK_C.XPT': ['03', 'DIET_VARLK_03'],
 'FFQDC_C.XPT': ['03', 'DIET_FFQDC_03'],
 'FFQRAW

In [51]:
df = pd.read_sas('dietary/DSII.XPT', encoding='ISO-8859-1')

In [None]:
'DSDSUPP' at row 74590

In [52]:
list(df.loc[74590:74599, 'DSDSUPP'])

['DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL',
 'DISNEY WINNIE THE POOH NUTRI-STIX MULTIVITAMIN CHILDREN\x92S MULTIPLE VITAMIN & MINERAL']