In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
from time import gmtime, strftime, localtime
import glob

In [3]:
base_cdc_url = 'https://wwwn.cdc.gov'

In [7]:
def get_table_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    table = soup.find(lambda tag: tag.has_attr('id') and tag['id']=="GridView1")
    
# Lambda expression for all links that end with XPT
    link_list = table.findAll(lambda tag: tag.name=='a' and tag['href'].endswith(".XPT"))
    links_only = [link.get('href') for link in link_list]
    
    return links_only


def get_multi_year(data_type, base_url):
    datatype_dict = {'demographics':'Demographics', 'dietary':'Dietary',
                     'examination':'Examination', 'laboratory':'Laboratory', 
                     'questionnaire':'Questionnaire'}
    # Can add years as future years are added
    year_list = [1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015]
    data_links = []
    for year in year_list:
        url = f"{base_url}/nchs/nhanes/search/datapage.aspx?Component={datatype_dict[data_type]}&CycleBeginYear={year}"
        temp_data_links = get_table_links(url)
        for data in temp_data_links:
            if data not in data_links:
                data_links.append(data)
                print(f"Added {data} from {year}")
        time.sleep(1)

    return data_links

def get_column_labels(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    
    # Codebook section of documentation
    # TODO -- take section or pdf htm pages
    codebook_links = soup.findAll('div', id='CodebookLinks')[0].findAll('a')
    
    dictionary = {link.string.split('-')[0].strip() : link.string.split('-')[1].strip() for link in codebook_links}
    return dictionary

def download_data(data_type, link_list, base_url):
    cwd = os.getcwd()
    try:
        os.mkdir(data_type)
        print(f'Created {data_type} folder')
    except:
        print(f'{data_type} folder exists')
    for link in link_list:
        item_name = link.split('/')[-1]
        exists = os.path.isfile(f'{cwd}/{data_type}/{item_name}')
        if exists:
            print(f'{item_name} already exists')
        else:
            current_time = time.time()
            print(f'Downloading {item_name} at {strftime("%a, %d %b %Y %H:%M:%S", localtime())}')
            r = requests.get(base_url + link, allow_redirects=True)
            open(f'{cwd}/{data_type}/{item_name}', 'wb').write(r.content)
            time_elapsed = time.time() - current_time
            print(f'Downloaded {item_name} at {time_elapsed}s')

def create_xpt_dict(data_type):
    original_file_names = {}
    group_file_names = []
    for file in glob.glob(f'{data_type}/*'):
        xpt_file = file.split('/')[1]
        if len(xpt_file.split('_'))== 1:
            original_file_names[xpt_file.split('.')[0]] = [xpt_file]
    for file in glob.glob(f'{data_type}/*'):
        xpt_file = file.split('/')[1]
        if len(xpt_file.split('_'))> 1:
            try:
                xpt_name = xpt_file.split('_')[0]
                original_file_names[f'{xpt_name}'].append(xpt_file)
            except KeyError as e:
                xpt_name = xpt_file.split('_')[0]
                original_file_names[f'{xpt_name}'] = [xpt_file]               
    return original_file_names
        
        
def combine_tables(data_type, xpt_dict):
    temp_df_list = []
    cwd = os.getcwd()
    for keys, values in xpt_dict[data_type].items():
        for value in values:
            print(f'Trying {cwd}/{data_type}/{value}')
            temp_df_list.append(pd.read_sas(f'{cwd}/{data_type}/{value}'))
            print(f'{cwd}/{data_type}/{value} appended')
    return pd.concat(temp_df_list)
                  
def grab_empty_files(data_type, base_url):
    empty_list = []
    cwd = os.getcwd()
    for file in glob.glob(f'{cwd}/{data_type}/*'):
        if os.stat(file).st_size == 0:
            empty_list.append(file)
            os.remove(file)
    if len(empty_list) == 0:
        print("There are no empty files in this folder")
    else:
        print(f"Now re-downloading {len(empty_list)} files")
        download_data(data_type, empty_list, base_url)


In [6]:
demographic_links = get_multi_year('demographics', base_cdc_url)
dietary_links = get_multi_year('dietary', base_cdc_url)
examination_links = get_multi_year('examination', base_cdc_url)
laboratory_links = get_multi_year('laboratory', base_cdc_url)
questionnaire_links = get_multi_year('questionnaire', base_cdc_url)

Added /Nchs/Nhanes/1999-2000/DEMO.XPT from 1999
Added /Nchs/Nhanes/2001-2002/DEMO_B.XPT from 2001
Added /Nchs/Nhanes/2003-2004/DEMO_C.XPT from 2003
Added /Nchs/Nhanes/2005-2006/DEMO_D.XPT from 2005
Added /Nchs/Nhanes/2007-2008/DEMO_E.XPT from 2007
Added /Nchs/Nhanes/2009-2010/DEMO_F.XPT from 2009
Added /Nchs/Nhanes/2011-2012/DEMO_G.XPT from 2011
Added /Nchs/Nhanes/2013-2014/DEMO_H.XPT from 2013
Added /Nchs/Nhanes/2015-2016/DEMO_I.XPT from 2015
Added /Nchs/Nhanes/1999-2000/DRXIFF.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DRXTOT.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DRXFMT.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSBI.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSII.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSPI.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSQFILE1.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSQFILE2.XPT from 1999
Added /Nchs/Nhanes/2001-2002/DRXIFF_B.XPT from 2001
Added /Nchs/Nhanes/2001-2002/DRXTOT_B.XPT from 2001
Added /Nchs/Nhanes/2001-2002/DRXFMT_B.XPT from 200

In [8]:
link_dictionary = {'demographics':demographic_links, 'dietary':dietary_links, 
                   'examination':examination_links, 'laboratory':laboratory_links,
                  'questionnaire':questionnaire_links}
import json

with open('xpt_link_dict.json', 'w') as f:
    json.dump(link_dictionary, f)

In [102]:
# Create the xpt_file_dict json for individual table creation and anticipation of merged tabes
xpt_file_dict = {}
for keys in link_dictionary:
    xpt_file_dict[keys] = create_xpt_dict(keys)
    
with open('xpt_file_dict.json', 'w') as f:
    json.dump(xpt_file_dict, f)

In [137]:
xpt_link_dictionary = json.loads(open('xpt_file_dict.json').read())

download_data('demographics', xpt_link_dictionary['demographics'], base_cdc_url)
download_data('dietary', xpt_link_dictionary['dietary'], base_cdc_url)
download_data('examination', xpt_link_dictionary['examination'], base_cdc_url)
download_data('laboratory', xpt_link_dictionary['laboratory'], base_cdc_url)
download_data('questionnaire', xpt_link_dictionary['questionnaire'], base_cdc_url)

In [161]:
demo_combo_df = combine_tables('demographics', xpt_file_dict)

Trying /Users/tomnahass/development/nhanes/demographics/DEMO.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_G.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_G.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_F.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_F.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_D.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_D.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_E.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_E.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_B.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_B.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_C.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_C.XPT appended
Trying /Users/tomnahass/development/nhanes/demograph

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [5]:
df99 = pd.read_sas("demographics/DEMO.XPT")
df01 = pd.read_sas('demographics/DEMO_B.XPT')
df03 = pd.read_sas('demographics/DEMO_C.XPT')
df05 = pd.read_sas('demographics/DEMO_D.XPT')
df07 = pd.read_sas('demographics/DEMO_E.XPT')
df09 = pd.read_sas('demographics/DEMO_F.XPT')
df11 = pd.read_sas('demographics/DEMO_G.XPT')
df13 = pd.read_sas('demographics/DEMO_H.XPT')
df15 = pd.read_sas('demographics/DEMO_I.XPT')

In [9]:
pd.concat([df13,df15]).SEQN

0       73557.0
1       73558.0
2       73559.0
3       73560.0
4       73561.0
5       73562.0
6       73563.0
7       73564.0
8       73565.0
9       73566.0
10      73567.0
11      73568.0
12      73569.0
13      73570.0
14      73571.0
15      73572.0
16      73573.0
17      73574.0
18      73575.0
19      73576.0
20      73577.0
21      73578.0
22      73579.0
23      73580.0
24      73581.0
25      73582.0
26      73583.0
27      73584.0
28      73585.0
29      73586.0
         ...   
9941    93673.0
9942    93674.0
9943    93675.0
9944    93676.0
9945    93677.0
9946    93678.0
9947    93679.0
9948    93680.0
9949    93681.0
9950    93682.0
9951    93683.0
9952    93684.0
9953    93685.0
9954    93686.0
9955    93687.0
9956    93688.0
9957    93689.0
9958    93690.0
9959    93691.0
9960    93692.0
9961    93693.0
9962    93694.0
9963    93695.0
9964    93696.0
9965    93697.0
9966    93698.0
9967    93699.0
9968    93700.0
9969    93701.0
9970    93702.0
Name: SEQN, Length: 2014

In [21]:
from sqlalchemy import create_engine
import mysql.connector

In [28]:
engine = create_engine('mysql+mysqlconnector://tom:password@localhost:3306/nhanes', echo=False)

In [7]:
df99.to_sql(name='demo_99', con=engine, if_exists='append', index=False)


In [35]:
#Ensure DB max_allowed_packet is set to 1G
def send_to_db(user,password,host,port,database, data_type, link_dict, file_dict):
    engine = create_engine(f'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}', 
                           echo=False)
    
    cwd = os.getcwd()
    counter = 0
    db_name_dict = create_db_names(data_type, link_dict, file_dict)
    for file in glob.glob(f'{cwd}/{data_type}/*'):
        file_name = file.split('/')[-1]
        print(f"Creating dataframe from {file}")
        temp_df = pd.read_sas(file)
        print(f'Sending to MySQL Server as {db_name_dict[file_name][1]}')
        temp_df.to_sql(name=f'{db_name_dict[file_name][1]}', con=engine, if_exists='append', index=False)
        counter += 1
        print('Now cleaning up db')
        del temp_df
    print(f'Added {counter} databases')



#   This will create file names that append the start year last 2 digits ie. 99 for 1999 and prefix DIET, DEMO, LAB, EXAM, QUEST for the respective filename. It will use the base file name ie. DEMO from DEMO_H.XPT as the filename
def create_db_names(data_type, link_dict, file_dict):
    
#   Exludes a DEMO preview because single tables do not need DEMO_DEMO
    prefix_dict = {'demographics': '', 'dietary': 'DIET_', 'examination': 'EXAM_', 
                   'laboratory': 'LAB_', 'questionnaire': 'QUEST_'}
    
    temp_dict = {}
    
#   Create temp_dict[filename:['2digit year']]
    for link in xpt_link_dictionary[data_type]:
            temp_dict[link.split('/')[-1]] = [link.split('/')[-2][2:4]]
            
#   Add prefix and DB name to temp_dict[xpt_filename: ['2digit year', 'DB Name example DIET_DSBI_99']]
    for key, values in xpt_file_dictionary[data_type].items():
        for value in values:
            temp_dict[value].append(f'{prefix_dict[data_type]}'+ key + "_" + temp_dict[value][0])
    
    return temp_dict
    

In [None]:
send_to_db('tom', 'password', 'localhost','3306', 'nhanes', 'dietary', xpt_link_dictionary, xpt_file_dictionary)

Creating dataframe from /home/tom/development/nhanes-download/dietary/DR2IFF_D.XPT
Sending to MySQL Server as DIET_DR2IFF_05


In [3]:
import json
xpt_link_dictionary = json.loads(open('xpt_link_dict.json').read())
xpt_file_dictionary = json.loads(open('xpt_file_dict.json').read())

In [34]:
xpt_link_dictionary['demographics'][0].split('/')[-2:]

['1999-2000', 'DEMO.XPT']

In [22]:
create_file_names('questionnaire', xpt_link_dictionary, xpt_file_dictionary)

{'ACQ.XPT': ['99', 'QUEST_ACQ_99'],
 'ALQ.XPT': ['99', 'QUEST_ALQ_99'],
 'RXQ_ANA.XPT': ['99', 'QUEST_RXQ_99'],
 'AUQ.XPT': ['99', 'QUEST_AUQ_99'],
 'BAQ.XPT': ['99', 'QUEST_BAQ_99'],
 'BPQ.XPT': ['99', 'QUEST_BPQ_99'],
 'CDQ.XPT': ['99', 'QUEST_CDQ_99'],
 'CFQ.XPT': ['99', 'QUEST_CFQ_99'],
 'HSQ.XPT': ['99', 'QUEST_HSQ_99'],
 'DEQ.XPT': ['99', 'QUEST_DEQ_99'],
 'DIQ.XPT': ['99', 'QUEST_DIQ_99'],
 'DBQ.XPT': ['99', 'QUEST_DBQ_99'],
 'DUQ.XPT': ['99', 'QUEST_DUQ_99'],
 'ECQ.XPT': ['99', 'QUEST_ECQ_99'],
 'FSQ.XPT': ['99', 'QUEST_FSQ_99'],
 'HIQ.XPT': ['99', 'QUEST_HIQ_99'],
 'HUQ.XPT': ['99', 'QUEST_HUQ_99'],
 'HOQ.XPT': ['99', 'QUEST_HOQ_99'],
 'IMQ.XPT': ['99', 'QUEST_IMQ_99'],
 'KIQ.XPT': ['99', 'QUEST_KIQ_99'],
 'MCQ.XPT': ['99', 'QUEST_MCQ_99'],
 'CIQMDEP.XPT': ['99', 'QUEST_CIQMDEP_99'],
 'CIQGAD.XPT': ['99', 'QUEST_CIQGAD_99'],
 'CIQPANIC.XPT': ['99', 'QUEST_CIQPANIC_99'],
 'MPQ.XPT': ['99', 'QUEST_MPQ_99'],
 'OCQ.XPT': ['99', 'QUEST_OCQ_99'],
 'OHQ.XPT': ['99', 'QUEST_OHQ_99'],


In [7]:
temp_dict = {}


DIET_DSBI_99
DIET_DSII_99
DIET_DSPI_99
DIET_DRXIFF_99
DIET_DRXIFF_01
DIET_DRXFMT_99
DIET_DRXFMT_01
DIET_DSQFILE2_99
DIET_DRXTOT_99
DIET_DRXTOT_01
DIET_DSQFILE1_99
DIET_DS1IDS_13
DIET_DS1IDS_11
DIET_DS1IDS_09
DIET_DS1IDS_07
DIET_DR1TOT_03
DIET_DR1TOT_05
DIET_DR1TOT_07
DIET_DR1TOT_11
DIET_DR1TOT_09
DIET_DR1TOT_13
DIET_DR1TOT_15
DIET_VARLK_03
DIET_DSQ1_01
DIET_DSQ1_03
DIET_DSQ1_05
DIET_DS2TOT_13
DIET_DS2TOT_11
DIET_DS2TOT_09
DIET_DS2TOT_07
DIET_DSQIDS_13
DIET_DSQIDS_11
DIET_DSQIDS_09
DIET_DSQIDS_07
DIET_DR2IFF_13
DIET_DR2IFF_15
DIET_DR2IFF_11
DIET_DR2IFF_09
DIET_DR2IFF_05
DIET_DR2IFF_07
DIET_DR2IFF_03
DIET_DRXFCD_13
DIET_DRXFCD_15
DIET_DRXFCD_03
DIET_DRXFCD_11
DIET_DRXFCD_09
DIET_DRXFCD_05
DIET_DRXFCD_07
DIET_DSQTOT_07
DIET_DSQTOT_11
DIET_DSQTOT_09
DIET_DSQTOT_13
DIET_DS2IDS_07
DIET_DS2IDS_11
DIET_DS2IDS_09
DIET_DS2IDS_13
DIET_FFQDC_05
DIET_FFQDC_03
DIET_DR1IFF_03
DIET_DR1IFF_05
DIET_DR1IFF_07
DIET_DR1IFF_11
DIET_DR1IFF_09
DIET_DR1IFF_13
DIET_DR1IFF_15
DIET_DSQ2_03
DIET_DSQ2_01
DIET_DSQ2_

In [42]:
temp_dict = {}
for link in xpt_link_dictionary['demographics']:
    temp_dict[link.split('/')[-1]] = link.split('/')[-2][2:4]
for file in glob.glob('demographics/*'):
    print(file.split('/')[-1])
temp_dict    

DEMO_D.XPT
DEMO_B.XPT
DEMO_E.XPT
DEMO.XPT
DEMO_G.XPT
DEMO_C.XPT
DEMO_F.XPT
DEMO_H.XPT
DEMO_I.XPT


{'DEMO.XPT': '99',
 'DEMO_B.XPT': '01',
 'DEMO_C.XPT': '03',
 'DEMO_D.XPT': '05',
 'DEMO_E.XPT': '07',
 'DEMO_F.XPT': '09',
 'DEMO_G.XPT': '11',
 'DEMO_H.XPT': '13',
 'DEMO_I.XPT': '15'}

In [23]:
combo1315 = pd.concat([df11,df13,df15])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [24]:
combo1315.INDFMIN2.value_counts(dropna=False).sort_index()

 1.0     1146
 2.0     1418
 3.0     2077
 4.0     2071
 5.0     2337
 6.0     3349
 7.0     2758
 8.0     2159
 9.0     1527
 10.0    1271
 12.0     688
 13.0     406
 14.0    2474
 15.0    4638
 77.0     744
 99.0     336
NaN       503
Name: INDFMIN2, dtype: int64

In [26]:
combo1315.WTINT2YR

0       102641.406474
1        15457.736897
2         7397.684828
3       127351.373299
4        12209.744980
5        60593.636684
6         5024.464768
7         5897.024603
8        14391.778470
9         7794.526990
10       22768.423624
11       26960.774346
12       11401.934012
13       24912.668432
14       26980.605125
15       53830.599426
16        7879.750437
17       12291.154515
18       16590.074977
19       20457.614917
20       13822.148996
21       63069.107216
22       10138.004540
23       15600.678771
24       18635.323223
25       11224.041366
26       10118.363218
27       11793.948458
28       17983.231494
29       20419.465237
            ...      
9941     12118.265490
9942     59708.217967
9943     26328.589736
9944     32944.356566
9945    136319.798518
9946     17823.018496
9947     21037.256891
9948     11528.871825
9949      8862.399321
9950     23924.202819
9951     12977.535438
9952     29880.639492
9953     22441.858682
9954     25564.294841
9955     1