In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
from time import gmtime, strftime, localtime

In [2]:
base_cdc_url = 'https://wwwn.cdc.gov'

In [7]:
def get_table_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    table = soup.find(lambda tag: tag.has_attr('id') and tag['id']=="GridView1")
    
# Lambda expression for all links that end with XPT
    link_list = table.findAll(lambda tag: tag.name=='a' and tag['href'].endswith(".XPT"))
    links_only = [link.get('href') for link in link_list]
    
    return links_only


def get_multi_year(data_type, base_url):
    datatype_dict = {'demographics':'Demographics', 'dietary':'Dietary',
                     'examination':'Examination', 'laboratory':'Laboratory', 
                     'questionnaire':'Questionnaire'}
    # Can add years as future years are added
    year_list = [1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015]
    data_links = []
    for year in year_list:
        url = f"{base_url}/nchs/nhanes/search/datapage.aspx?Component={datatype_dict[data_type]}&CycleBeginYear={year}"
        temp_data_links = get_table_links(url)
        for data in temp_data_links:
            if data not in data_links:
                data_links.append(data)
                print(f"Added {data} from {year}")
        time.sleep(1)

    return data_links

def get_column_labels(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    
    # Codebook section of documentation
    # TODO -- take section or pdf htm pages
    codebook_links = soup.findAll('div', id='CodebookLinks')[0].findAll('a')
    
    dictionary = {link.string.split('-')[0].strip() : link.string.split('-')[1].strip() for link in codebook_links}
    return dictionary


In [6]:
demographic_links = get_multi_year('demographics', base_cdc_url)
dietary_links = get_multi_year('dietary', base_cdc_url)
examination_links = get_multi_year('examination', base_cdc_url)
laboratory_links = get_multi_year('laboratory', base_cdc_url)
questionnaire_links = get_multi_year('questionnaire', base_cdc_url)

Added /Nchs/Nhanes/1999-2000/DEMO.XPT from 1999
Added /Nchs/Nhanes/2001-2002/DEMO_B.XPT from 2001
Added /Nchs/Nhanes/2003-2004/DEMO_C.XPT from 2003
Added /Nchs/Nhanes/2005-2006/DEMO_D.XPT from 2005
Added /Nchs/Nhanes/2007-2008/DEMO_E.XPT from 2007
Added /Nchs/Nhanes/2009-2010/DEMO_F.XPT from 2009
Added /Nchs/Nhanes/2011-2012/DEMO_G.XPT from 2011
Added /Nchs/Nhanes/2013-2014/DEMO_H.XPT from 2013
Added /Nchs/Nhanes/2015-2016/DEMO_I.XPT from 2015
Added /Nchs/Nhanes/1999-2000/DRXIFF.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DRXTOT.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DRXFMT.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSBI.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSII.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSPI.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSQFILE1.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSQFILE2.XPT from 1999
Added /Nchs/Nhanes/2001-2002/DRXIFF_B.XPT from 2001
Added /Nchs/Nhanes/2001-2002/DRXTOT_B.XPT from 2001
Added /Nchs/Nhanes/2001-2002/DRXFMT_B.XPT from 200

In [8]:
link_dictionary = {'demographics':demographic_links, 'dietary':dietary_links, 
                   'examination':examination_links, 'laboratory':laboratory_links,
                  'questionnaire':questionnaire_links}

In [9]:
import json

with open('xpt_link_dict.json', 'w') as f:
    json.dump(link_dictionary, f)


In [49]:
temp_df = pd.read_html("https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination&CycleBeginYear=1999",
            match="Data File Name", attrs = {'id': 'GridView1'})[0]

In [115]:
get_table_links("https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination&CycleBeginYear=1999")

['/Nchs/Nhanes/1999-2000/AUX1.XPT',
 '/Nchs/Nhanes/1999-2000/AUXAR.XPT',
 '/Nchs/Nhanes/1999-2000/AUXTYM.XPT',
 '/Nchs/Nhanes/1999-2000/BAX.XPT',
 '/Nchs/Nhanes/1999-2000/BIX.XPT',
 '/Nchs/Nhanes/1999-2000/BPX.XPT',
 '/Nchs/Nhanes/1999-2000/BMX.XPT',
 '/Nchs/Nhanes/1999-2000/CVX.XPT',
 '/Nchs/Nhanes/1999-2000/LEXABPI.XPT',
 '/Nchs/Nhanes/1999-2000/LEXPN.XPT',
 '/Nchs/Nhanes/1999-2000/MSX.XPT',
 '/Nchs/Nhanes/1999-2000/OHXDENT.XPT',
 '/Nchs/Nhanes/1999-2000/OHXPERIO.XPT',
 '/Nchs/Nhanes/1999-2000/OHXREF.XPT',
 '/Nchs/Nhanes/1999-2000/SEQ.XPT',
 '/Nchs/Nhanes/1999-2000/TB.XPT',
 '/Nchs/Nhanes/1999-2000/VIX.XPT']

In [124]:
demographic_links[0][:-4]+'.htm'

'/Nchs/Nhanes/1999-2000/DEMO.htm'

In [125]:
r = requests.get(base_cdc_url + demographic_links[0][:-4]+'.htm')

In [129]:
data = r.text

In [138]:
div_links = BeautifulSoup(data).findAll('div', id="CodebookLinks")[0]

In [150]:
div_links.findAll('a')[0].string.split('-')[1].strip()

'Respondent sequence number'

In [154]:
column_dict = get_column_labels(base_cdc_url + demographic_links[0][:-4]+'.htm')

In [156]:
test_df = pd.read_sas('Test.xpt')

In [158]:
test_df.rename(columns=column_dict)

Unnamed: 0,Respondent sequence number,Data Release Number,Interview/Examination Status,Six month time period,Gender,Age at Screening Adjudicated,Age in Months,Exam Age in Months,Race/Ethnicity,Linked NH3 Race/Ethnicity,...,Interview Weight Jack Knife Replicate 43,Interview Weight Jack Knife Replicate 44,Interview Weight Jack Knife Replicate 45,Interview Weight Jack Knife Replicate 46,Interview Weight Jack Knife Replicate 47,Interview Weight Jack Knife Replicate 48,Interview Weight Jack Knife Replicate 49,Interview Weight Jack Knife Replicate 50,Interview Weight Jack Knife Replicate 51,Interview Weight Jack Knife Replicate 52
0,1.0,1.0,2.0,2.0,2.0,2.000000e+00,29.0,31.0,4.0,2.0,...,10094.017100,9912.461855,9727.078709,1.004152e+04,9.953956e+03,9857.381983,9.865152e+03,1.032799e+04,9.809165e+03,1.032332e+04
1,2.0,1.0,2.0,2.0,1.0,7.700000e+01,926.0,926.0,3.0,1.0,...,27186.728682,27324.345051,28099.663528,2.775707e+04,2.804929e+04,26716.602006,2.687770e+04,2.726803e+04,2.740638e+04,2.698481e+04
2,3.0,1.0,2.0,1.0,2.0,1.000000e+01,125.0,126.0,3.0,1.0,...,43993.193099,44075.386428,46642.563799,4.496768e+04,4.457248e+04,44087.945688,4.483137e+04,4.448099e+04,4.538911e+04,4.378191e+04
3,4.0,1.0,2.0,2.0,1.0,1.000000e+00,22.0,23.0,4.0,2.0,...,10702.307249,10531.444441,10346.119327,1.063606e+04,5.397605e-79,10533.108939,1.065475e+04,1.085102e+04,1.056498e+04,1.101253e+04
4,5.0,1.0,2.0,2.0,1.0,4.900000e+01,597.0,597.0,3.0,1.0,...,93164.782430,92119.608772,95388.490406,9.413138e+04,9.529781e+04,91325.082461,9.164059e+04,9.281793e+04,9.428286e+04,9.199325e+04
5,6.0,1.0,2.0,2.0,2.0,1.900000e+01,230.0,230.0,5.0,4.0,...,36948.534618,36855.281734,38998.473111,3.802201e+04,3.754149e+04,36871.278832,3.752068e+04,3.715187e+04,3.801650e+04,3.671024e+04
6,7.0,1.0,2.0,2.0,2.0,5.900000e+01,712.0,712.0,4.0,2.0,...,23096.205218,22705.366233,22352.088620,2.260011e+04,2.327240e+04,22956.903156,2.314706e+04,2.280701e+04,2.308579e+04,2.391840e+04
7,8.0,1.0,2.0,1.0,1.0,1.300000e+01,159.0,160.0,3.0,1.0,...,31904.891113,32634.978618,33640.727499,3.254396e+04,3.276514e+04,31939.471692,3.251975e+04,3.221750e+04,3.267858e+04,3.165622e+04
8,9.0,1.0,2.0,2.0,2.0,1.100000e+01,133.0,133.0,4.0,2.0,...,7704.500150,7734.334276,7529.435502,7.602184e+03,7.783768e+03,7581.275523,7.866193e+03,7.974498e+03,7.783064e+03,8.034182e+03
9,10.0,1.0,2.0,2.0,1.0,4.300000e+01,518.0,518.0,4.0,2.0,...,21569.526048,21308.984171,21071.164059,2.176243e+04,2.266320e+04,21281.981862,2.167752e+04,2.172516e+04,2.162508e+04,2.253903e+04


In [159]:
demographic_links

['/Nchs/Nhanes/1999-2000/DEMO.XPT',
 '/Nchs/Nhanes/2001-2002/DEMO_B.XPT',
 '/Nchs/Nhanes/2003-2004/DEMO_C.XPT',
 '/Nchs/Nhanes/2005-2006/DEMO_D.XPT',
 '/Nchs/Nhanes/2007-2008/DEMO_E.XPT',
 '/Nchs/Nhanes/2009-2010/DEMO_F.XPT',
 '/Nchs/Nhanes/2011-2012/DEMO_G.XPT',
 '/Nchs/Nhanes/2013-2014/DEMO_H.XPT',
 '/Nchs/Nhanes/2015-2016/DEMO_I.XPT']

In [160]:
link_dictionary = {'demographics':demographic_links, 'dietary':dietary_links, 
                   'examination':examination_links, 'laboratory':laboratory_links,
                  'questionnaire':questionnaire_links}

In [168]:
list(link_dictionary.keys())

['demographics', 'dietary', 'examination', 'laboratory', 'questionnaire']

In [186]:
import os

In [31]:
def download_data(data_type, link_list, base_url):
    cwd = os.getcwd()
    try:
        os.mkdir(data_type)
        print(f'Created {data_type} folder')
    except:
        print(f'{data_type} folder exists')
    for link in link_list:
        item_name = link.split('/')[-1]
        exists = os.path.isfile(f'{cwd}/{data_type}/{item_name}')
        if exists:
            print(f'{item_name} already exists')
        else:
            current_time = time.time()
            print(f'Downloading {item_name} at {strftime("%a, %d %b %Y %H:%M:%S", localtime())}')
            r = requests.get(base_url + link, allow_redirects=True)
            open(f'{cwd}/{data_type}/{item_name}', 'wb').write(r.content)
            time_elapsed = time.time() - current_time
            print(f'Downloaded {item_name} at {time_elapsed}s')
            

In [30]:
download_data('dietary', link_dictionary['dietary'], base_cdc_url)

dietary folder exists
DRXIFF.XPT already exists
DRXTOT.XPT already exists
DRXFMT.XPT already exists
DSBI.XPT already exists
DSII.XPT already exists
DSPI.XPT already exists
DSQFILE1.XPT already exists
DSQFILE2.XPT already exists
DRXIFF_B.XPT already exists
DRXTOT_B.XPT already exists
DRXFMT_B.XPT already exists
DSQ1_B.XPT already exists
DSQ2_B.XPT already exists
DR1IFF_C.XPT already exists
DR2IFF_C.XPT already exists
DR1TOT_C.XPT already exists
DR2TOT_C.XPT already exists
DRXFCD_C.XPT already exists
DRXMCD_C.XPT already exists
DSQ1_C.XPT already exists
DSQ2_C.XPT already exists
FOODLK_C.XPT already exists
VARLK_C.XPT already exists
FFQDC_C.XPT already exists
FFQRAW_C.XPT already exists
DR1IFF_D.XPT already exists
DR2IFF_D.XPT already exists
DR1TOT_D.XPT already exists
DR2TOT_D.XPT already exists
DRXFCD_D.XPT already exists
DRXMCD_D.XPT already exists
DSQ1_D.XPT already exists
DSQ2_D.XPT already exists
FFQDC_D.XPT already exists
FFQRAW_D.XPT already exists
DR1IFF_E.XPT already exists
DR2

In [32]:
download_data('examination', link_dictionary['examination'], base_cdc_url)

Created examination folder
Downloading AUX1.XPT at Tue, 13 Nov 2018 09:27:07
Downloaded AUX1.XPT at 1.4642879962921143s
Downloading AUXAR.XPT at Tue, 13 Nov 2018 09:27:08
Downloaded AUXAR.XPT at 3.775470018386841s
Downloading AUXTYM.XPT at Tue, 13 Nov 2018 09:27:12
Downloaded AUXTYM.XPT at 2.509443998336792s
Downloading BAX.XPT at Tue, 13 Nov 2018 09:27:14
Downloaded BAX.XPT at 0.9057416915893555s
Downloading BIX.XPT at Tue, 13 Nov 2018 09:27:15
Downloaded BIX.XPT at 4.971536874771118s
Downloading BPX.XPT at Tue, 13 Nov 2018 09:27:20
Downloaded BPX.XPT at 2.957737922668457s
Downloading BMX.XPT at Tue, 13 Nov 2018 09:27:23
Downloaded BMX.XPT at 4.179272890090942s
Downloading CVX.XPT at Tue, 13 Nov 2018 09:27:28
Downloaded CVX.XPT at 3.6596717834472656s
Downloading LEXABPI.XPT at Tue, 13 Nov 2018 09:27:31
Downloaded LEXABPI.XPT at 0.47780609130859375s
Downloading LEXPN.XPT at Tue, 13 Nov 2018 09:27:32
Downloaded LEXPN.XPT at 0.7453854084014893s
Downloading MSX.XPT at Tue, 13 Nov 2018 09:

In [33]:
download_data('laboratory', link_dictionary['laboratory'], base_cdc_url)

Created laboratory folder
Downloading SSAFB_A.XPT at Tue, 13 Nov 2018 09:34:24
Downloaded SSAFB_A.XPT at 1.2824079990386963s
Downloading LAB16.XPT at Tue, 13 Nov 2018 09:34:25
Downloaded LAB16.XPT at 0.49591708183288574s
Downloading SSAMH_A.XPT at Tue, 13 Nov 2018 09:34:25
Downloaded SSAMH_A.XPT at 0.20039081573486328s
Downloading SSANA_A.XPT at Tue, 13 Nov 2018 09:34:26
Downloaded SSANA_A.XPT at 0.8934462070465088s
Downloading LAB06.XPT at Tue, 13 Nov 2018 09:34:26
Downloaded LAB06.XPT at 1.9645698070526123s
Downloading LAB05.XPT at Tue, 13 Nov 2018 09:34:28
Downloaded LAB05.XPT at 0.25772595405578613s
Downloading LAB13AM.XPT at Tue, 13 Nov 2018 09:34:29
Downloaded LAB13AM.XPT at 0.35291600227355957s
Downloading LAB13.XPT at Tue, 13 Nov 2018 09:34:29
Downloaded LAB13.XPT at 0.414931058883667s
Downloading LAB25.XPT at Tue, 13 Nov 2018 09:34:29
Downloaded LAB25.XPT at 1.1470999717712402s
Downloading LAB11.XPT at Tue, 13 Nov 2018 09:34:31
Downloaded LAB11.XPT at 0.5150983333587646s
Downl

In [34]:
download_data('questionnaire', link_dictionary['questionnaire'], base_cdc_url)

Created questionnaire folder
Downloading ACQ.XPT at Tue, 13 Nov 2018 09:41:40
Downloaded ACQ.XPT at 0.772728681564331s
Downloading ALQ.XPT at Tue, 13 Nov 2018 09:41:41
Downloaded ALQ.XPT at 0.41146397590637207s
Downloading RXQ_ANA.XPT at Tue, 13 Nov 2018 09:41:41
Downloaded RXQ_ANA.XPT at 0.4474611282348633s
Downloading AUQ.XPT at Tue, 13 Nov 2018 09:41:42
Downloaded AUQ.XPT at 0.9824011325836182s
Downloading BAQ.XPT at Tue, 13 Nov 2018 09:41:43
Downloaded BAQ.XPT at 0.6523001194000244s
Downloading BPQ.XPT at Tue, 13 Nov 2018 09:41:43
Downloaded BPQ.XPT at 1.391211986541748s
Downloading CDQ.XPT at Tue, 13 Nov 2018 09:41:45
Downloaded CDQ.XPT at 0.3655972480773926s
Downloading CFQ.XPT at Tue, 13 Nov 2018 09:41:45
Downloaded CFQ.XPT at 0.32801008224487305s
Downloading HSQ.XPT at Tue, 13 Nov 2018 09:41:45
Downloaded HSQ.XPT at 0.57784104347229s
Downloading DEQ.XPT at Tue, 13 Nov 2018 09:41:46
Downloaded DEQ.XPT at 2.665371894836426s
Downloading DIQ.XPT at Tue, 13 Nov 2018 09:41:49
Downloa

In [36]:
cd ..

/Users/tomnahass/development


In [37]:
ls

[1m[36mAAXtoMP3[m[m/          [1m[36mcards_stats[m[m/       [1m[36mnode_modules[m[m/      [1m[36msite[m[m/
Untitled.ipynb     [1m[36mflutter[m[m/           package-lock.json
[1m[36mblog[m[m/              [1m[36mnhanes[m[m/            [1m[36mpersonal-blog[m[m/


In [38]:
cd nhanes

/Users/tomnahass/development/nhanes


In [40]:
ls demographics/

DEMO.XPT    DEMO_C.XPT  DEMO_E.XPT  DEMO_G.XPT  DEMO_I.XPT
DEMO_B.XPT  DEMO_D.XPT  DEMO_F.XPT  DEMO_H.XPT


In [41]:
import glob

In [66]:
for file in glob.glob('demographics/*_[A-Z]*'):
    print(file.split('/')[1].split("_"))

['DEMO', 'G.XPT']
['DEMO', 'F.XPT']
['DEMO', 'D.XPT']
['DEMO', 'E.XPT']
['DEMO', 'B.XPT']
['DEMO', 'C.XPT']
['DEMO', 'H.XPT']
['DEMO', 'I.XPT']


In [62]:
os.getcwd() + "/demographics/"

'/Users/tomnahass/development/nhanes/demographics/'

In [84]:
def create_xpt_dict(data_type):
    original_file_names = {}
    group_file_names = []
    for file in glob.glob(f'{data_type}/*'):
        xpt_file = file.split('/')[1]
        if len(xpt_file.split('_'))== 1:
            original_file_names[file.split('/')[1].split('.')[0]] = []
    for file in glob.glob(f'{data_type}/*'):
        xpt_file = file.split('/')[1]
        if len(xpt_file.split('_'))> 1:
            try:
                for xpt in xpt_file.split('_'):
                    original_file_names[f'{xpt}'].append(xpt_file)
            except KeyError as e:
                for xpt in xpt_file.split('_'):
                    original_file_names[f'{xpt}'].append(xpt_file)
                
    return original_file_names
        
        
    

In [85]:
create_xpt_dict('dietary')

KeyError: 'DS1IDS'

In [74]:
for file in glob.glob('dietary/*'):
    print(file)

dietary/DS1IDS_H.XPT
dietary/DR1TOT_C.XPT
dietary/VARLK_C.XPT
dietary/DSQ1_B.XPT
dietary/DSQ1_C.XPT
dietary/DS2TOT_H.XPT
dietary/DSQIDS_H.XPT
dietary/DR1TOT_D.XPT
dietary/DR2IFF_H.XPT
dietary/DR2IFF_I.XPT
dietary/DRXFMT_B.XPT
dietary/DR1TOT_E.XPT
dietary/DSBI.XPT
dietary/DR1TOT_G.XPT
dietary/DSQ1_D.XPT
dietary/DR1TOT_F.XPT
dietary/DRXFCD_H.XPT
dietary/DSQTOT_E.XPT
dietary/DRXFCD_I.XPT
dietary/DSII.XPT
dietary/DS2IDS_E.XPT
dietary/FFQDC_D.XPT
dietary/DR1IFF_C.XPT
dietary/DSQ2_C.XPT
dietary/DS2IDS_G.XPT
dietary/DSQTOT_G.XPT
dietary/DSPI.XPT
dietary/DSQTOT_F.XPT
dietary/DS2IDS_F.XPT
dietary/DSQ2_B.XPT
dietary/DR2TOT_H.XPT
dietary/FFQDC_C.XPT
dietary/DR1IFF_D.XPT
dietary/DRXIFF.XPT
dietary/DR1IFF_E.XPT
dietary/DR2TOT_I.XPT
dietary/DS1TOT_E.XPT
dietary/DS1TOT_G.XPT
dietary/DR1IFF_G.XPT
dietary/DR1IFF_F.XPT
dietary/DSQ2_D.XPT
dietary/DS1TOT_F.XPT
dietary/DR2TOT_G.XPT
dietary/DRXTOT_B.XPT
dietary/DR2TOT_F.XPT
dietary/DRXFMT.XPT
dietary/DS1TOT_H.XPT
dietary/DR2TOT_D.XPT
dietary/DR1IFF_H.XPT
di