In [105]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
from time import gmtime, strftime, localtime
import glob

In [2]:
base_cdc_url = 'https://wwwn.cdc.gov'

In [7]:
def get_table_links(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    table = soup.find(lambda tag: tag.has_attr('id') and tag['id']=="GridView1")
    
# Lambda expression for all links that end with XPT
    link_list = table.findAll(lambda tag: tag.name=='a' and tag['href'].endswith(".XPT"))
    links_only = [link.get('href') for link in link_list]
    
    return links_only


def get_multi_year(data_type, base_url):
    datatype_dict = {'demographics':'Demographics', 'dietary':'Dietary',
                     'examination':'Examination', 'laboratory':'Laboratory', 
                     'questionnaire':'Questionnaire'}
    # Can add years as future years are added
    year_list = [1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015]
    data_links = []
    for year in year_list:
        url = f"{base_url}/nchs/nhanes/search/datapage.aspx?Component={datatype_dict[data_type]}&CycleBeginYear={year}"
        temp_data_links = get_table_links(url)
        for data in temp_data_links:
            if data not in data_links:
                data_links.append(data)
                print(f"Added {data} from {year}")
        time.sleep(1)

    return data_links

def get_column_labels(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    
    # Codebook section of documentation
    # TODO -- take section or pdf htm pages
    codebook_links = soup.findAll('div', id='CodebookLinks')[0].findAll('a')
    
    dictionary = {link.string.split('-')[0].strip() : link.string.split('-')[1].strip() for link in codebook_links}
    return dictionary


In [6]:
demographic_links = get_multi_year('demographics', base_cdc_url)
dietary_links = get_multi_year('dietary', base_cdc_url)
examination_links = get_multi_year('examination', base_cdc_url)
laboratory_links = get_multi_year('laboratory', base_cdc_url)
questionnaire_links = get_multi_year('questionnaire', base_cdc_url)

Added /Nchs/Nhanes/1999-2000/DEMO.XPT from 1999
Added /Nchs/Nhanes/2001-2002/DEMO_B.XPT from 2001
Added /Nchs/Nhanes/2003-2004/DEMO_C.XPT from 2003
Added /Nchs/Nhanes/2005-2006/DEMO_D.XPT from 2005
Added /Nchs/Nhanes/2007-2008/DEMO_E.XPT from 2007
Added /Nchs/Nhanes/2009-2010/DEMO_F.XPT from 2009
Added /Nchs/Nhanes/2011-2012/DEMO_G.XPT from 2011
Added /Nchs/Nhanes/2013-2014/DEMO_H.XPT from 2013
Added /Nchs/Nhanes/2015-2016/DEMO_I.XPT from 2015
Added /Nchs/Nhanes/1999-2000/DRXIFF.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DRXTOT.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DRXFMT.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSBI.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSII.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSPI.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSQFILE1.XPT from 1999
Added /Nchs/Nhanes/1999-2000/DSQFILE2.XPT from 1999
Added /Nchs/Nhanes/2001-2002/DRXIFF_B.XPT from 2001
Added /Nchs/Nhanes/2001-2002/DRXTOT_B.XPT from 2001
Added /Nchs/Nhanes/2001-2002/DRXFMT_B.XPT from 200

In [8]:
link_dictionary = {'demographics':demographic_links, 'dietary':dietary_links, 
                   'examination':examination_links, 'laboratory':laboratory_links,
                  'questionnaire':questionnaire_links}

In [9]:
import json

with open('xpt_link_dict.json', 'w') as f:
    json.dump(link_dictionary, f)


In [49]:
temp_df = pd.read_html("https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination&CycleBeginYear=1999",
            match="Data File Name", attrs = {'id': 'GridView1'})[0]

In [115]:
get_table_links("https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination&CycleBeginYear=1999")

['/Nchs/Nhanes/1999-2000/AUX1.XPT',
 '/Nchs/Nhanes/1999-2000/AUXAR.XPT',
 '/Nchs/Nhanes/1999-2000/AUXTYM.XPT',
 '/Nchs/Nhanes/1999-2000/BAX.XPT',
 '/Nchs/Nhanes/1999-2000/BIX.XPT',
 '/Nchs/Nhanes/1999-2000/BPX.XPT',
 '/Nchs/Nhanes/1999-2000/BMX.XPT',
 '/Nchs/Nhanes/1999-2000/CVX.XPT',
 '/Nchs/Nhanes/1999-2000/LEXABPI.XPT',
 '/Nchs/Nhanes/1999-2000/LEXPN.XPT',
 '/Nchs/Nhanes/1999-2000/MSX.XPT',
 '/Nchs/Nhanes/1999-2000/OHXDENT.XPT',
 '/Nchs/Nhanes/1999-2000/OHXPERIO.XPT',
 '/Nchs/Nhanes/1999-2000/OHXREF.XPT',
 '/Nchs/Nhanes/1999-2000/SEQ.XPT',
 '/Nchs/Nhanes/1999-2000/TB.XPT',
 '/Nchs/Nhanes/1999-2000/VIX.XPT']

In [124]:
demographic_links[0][:-4]+'.htm'

'/Nchs/Nhanes/1999-2000/DEMO.htm'

In [125]:
r = requests.get(base_cdc_url + demographic_links[0][:-4]+'.htm')

In [129]:
data = r.text

In [138]:
div_links = BeautifulSoup(data).findAll('div', id="CodebookLinks")[0]

In [150]:
div_links.findAll('a')[0].string.split('-')[1].strip()

'Respondent sequence number'

In [154]:
column_dict = get_column_labels(base_cdc_url + demographic_links[0][:-4]+'.htm')

In [156]:
test_df = pd.read_sas('Test.xpt')

In [158]:
test_df.rename(columns=column_dict)

Unnamed: 0,Respondent sequence number,Data Release Number,Interview/Examination Status,Six month time period,Gender,Age at Screening Adjudicated,Age in Months,Exam Age in Months,Race/Ethnicity,Linked NH3 Race/Ethnicity,...,Interview Weight Jack Knife Replicate 43,Interview Weight Jack Knife Replicate 44,Interview Weight Jack Knife Replicate 45,Interview Weight Jack Knife Replicate 46,Interview Weight Jack Knife Replicate 47,Interview Weight Jack Knife Replicate 48,Interview Weight Jack Knife Replicate 49,Interview Weight Jack Knife Replicate 50,Interview Weight Jack Knife Replicate 51,Interview Weight Jack Knife Replicate 52
0,1.0,1.0,2.0,2.0,2.0,2.000000e+00,29.0,31.0,4.0,2.0,...,10094.017100,9912.461855,9727.078709,1.004152e+04,9.953956e+03,9857.381983,9.865152e+03,1.032799e+04,9.809165e+03,1.032332e+04
1,2.0,1.0,2.0,2.0,1.0,7.700000e+01,926.0,926.0,3.0,1.0,...,27186.728682,27324.345051,28099.663528,2.775707e+04,2.804929e+04,26716.602006,2.687770e+04,2.726803e+04,2.740638e+04,2.698481e+04
2,3.0,1.0,2.0,1.0,2.0,1.000000e+01,125.0,126.0,3.0,1.0,...,43993.193099,44075.386428,46642.563799,4.496768e+04,4.457248e+04,44087.945688,4.483137e+04,4.448099e+04,4.538911e+04,4.378191e+04
3,4.0,1.0,2.0,2.0,1.0,1.000000e+00,22.0,23.0,4.0,2.0,...,10702.307249,10531.444441,10346.119327,1.063606e+04,5.397605e-79,10533.108939,1.065475e+04,1.085102e+04,1.056498e+04,1.101253e+04
4,5.0,1.0,2.0,2.0,1.0,4.900000e+01,597.0,597.0,3.0,1.0,...,93164.782430,92119.608772,95388.490406,9.413138e+04,9.529781e+04,91325.082461,9.164059e+04,9.281793e+04,9.428286e+04,9.199325e+04
5,6.0,1.0,2.0,2.0,2.0,1.900000e+01,230.0,230.0,5.0,4.0,...,36948.534618,36855.281734,38998.473111,3.802201e+04,3.754149e+04,36871.278832,3.752068e+04,3.715187e+04,3.801650e+04,3.671024e+04
6,7.0,1.0,2.0,2.0,2.0,5.900000e+01,712.0,712.0,4.0,2.0,...,23096.205218,22705.366233,22352.088620,2.260011e+04,2.327240e+04,22956.903156,2.314706e+04,2.280701e+04,2.308579e+04,2.391840e+04
7,8.0,1.0,2.0,1.0,1.0,1.300000e+01,159.0,160.0,3.0,1.0,...,31904.891113,32634.978618,33640.727499,3.254396e+04,3.276514e+04,31939.471692,3.251975e+04,3.221750e+04,3.267858e+04,3.165622e+04
8,9.0,1.0,2.0,2.0,2.0,1.100000e+01,133.0,133.0,4.0,2.0,...,7704.500150,7734.334276,7529.435502,7.602184e+03,7.783768e+03,7581.275523,7.866193e+03,7.974498e+03,7.783064e+03,8.034182e+03
9,10.0,1.0,2.0,2.0,1.0,4.300000e+01,518.0,518.0,4.0,2.0,...,21569.526048,21308.984171,21071.164059,2.176243e+04,2.266320e+04,21281.981862,2.167752e+04,2.172516e+04,2.162508e+04,2.253903e+04


In [159]:
demographic_links

['/Nchs/Nhanes/1999-2000/DEMO.XPT',
 '/Nchs/Nhanes/2001-2002/DEMO_B.XPT',
 '/Nchs/Nhanes/2003-2004/DEMO_C.XPT',
 '/Nchs/Nhanes/2005-2006/DEMO_D.XPT',
 '/Nchs/Nhanes/2007-2008/DEMO_E.XPT',
 '/Nchs/Nhanes/2009-2010/DEMO_F.XPT',
 '/Nchs/Nhanes/2011-2012/DEMO_G.XPT',
 '/Nchs/Nhanes/2013-2014/DEMO_H.XPT',
 '/Nchs/Nhanes/2015-2016/DEMO_I.XPT']

In [160]:
link_dictionary = {'demographics':demographic_links, 'dietary':dietary_links, 
                   'examination':examination_links, 'laboratory':laboratory_links,
                  'questionnaire':questionnaire_links}

In [168]:
list(link_dictionary.keys())

['demographics', 'dietary', 'examination', 'laboratory', 'questionnaire']

In [186]:
import os

In [31]:
def download_data(data_type, link_list, base_url):
    cwd = os.getcwd()
    try:
        os.mkdir(data_type)
        print(f'Created {data_type} folder')
    except:
        print(f'{data_type} folder exists')
    for link in link_list:
        item_name = link.split('/')[-1]
        exists = os.path.isfile(f'{cwd}/{data_type}/{item_name}')
        if exists:
            print(f'{item_name} already exists')
        else:
            current_time = time.time()
            print(f'Downloading {item_name} at {strftime("%a, %d %b %Y %H:%M:%S", localtime())}')
            r = requests.get(base_url + link, allow_redirects=True)
            open(f'{cwd}/{data_type}/{item_name}', 'wb').write(r.content)
            time_elapsed = time.time() - current_time
            print(f'Downloaded {item_name} at {time_elapsed}s')
            

In [32]:
download_data('demographics', link_dictionary['demographics'], base_cdc_url)
download_data('dietary', link_dictionary['dietary'], base_cdc_url)
download_data('examination', link_dictionary['examination'], base_cdc_url)
download_data('laboratory', link_dictionary['laboratory'], base_cdc_url)
download_data('questionnaire', link_dictionary['questionnaire'], base_cdc_url)

Created examination folder
Downloading AUX1.XPT at Tue, 13 Nov 2018 09:27:07
Downloaded AUX1.XPT at 1.4642879962921143s
Downloading AUXAR.XPT at Tue, 13 Nov 2018 09:27:08
Downloaded AUXAR.XPT at 3.775470018386841s
Downloading AUXTYM.XPT at Tue, 13 Nov 2018 09:27:12
Downloaded AUXTYM.XPT at 2.509443998336792s
Downloading BAX.XPT at Tue, 13 Nov 2018 09:27:14
Downloaded BAX.XPT at 0.9057416915893555s
Downloading BIX.XPT at Tue, 13 Nov 2018 09:27:15
Downloaded BIX.XPT at 4.971536874771118s
Downloading BPX.XPT at Tue, 13 Nov 2018 09:27:20
Downloaded BPX.XPT at 2.957737922668457s
Downloading BMX.XPT at Tue, 13 Nov 2018 09:27:23
Downloaded BMX.XPT at 4.179272890090942s
Downloading CVX.XPT at Tue, 13 Nov 2018 09:27:28
Downloaded CVX.XPT at 3.6596717834472656s
Downloading LEXABPI.XPT at Tue, 13 Nov 2018 09:27:31
Downloaded LEXABPI.XPT at 0.47780609130859375s
Downloading LEXPN.XPT at Tue, 13 Nov 2018 09:27:32
Downloaded LEXPN.XPT at 0.7453854084014893s
Downloading MSX.XPT at Tue, 13 Nov 2018 09:

In [99]:
def create_xpt_dict(data_type):
    original_file_names = {}
    group_file_names = []
    for file in glob.glob(f'{data_type}/*'):
        xpt_file = file.split('/')[1]
        if len(xpt_file.split('_'))== 1:
            original_file_names[xpt_file.split('.')[0]] = [xpt_file]
    for file in glob.glob(f'{data_type}/*'):
        xpt_file = file.split('/')[1]
        if len(xpt_file.split('_'))> 1:
            try:
                xpt_name = xpt_file.split('_')[0]
                original_file_names[f'{xpt_name}'].append(xpt_file)
            except KeyError as e:
                xpt_name = xpt_file.split('_')[0]
                original_file_names[f'{xpt_name}'] = [xpt_file]               
    return original_file_names
        
        
    

In [102]:
# Create the xpt_file_dict json for individual table creation and anticipation of merged tabes
xpt_file_dict = {}
for keys in link_dictionary:
    xpt_file_dict[keys] = create_xpt_dict(keys)
    
with open('xpt_file_dict.json', 'w') as f:
    json.dump(xpt_file_dict, f)

In [103]:
xpt_file_dict

{'demographics': {'DEMO': ['DEMO.XPT',
   'DEMO_G.XPT',
   'DEMO_F.XPT',
   'DEMO_D.XPT',
   'DEMO_E.XPT',
   'DEMO_B.XPT',
   'DEMO_C.XPT',
   'DEMO_H.XPT',
   'DEMO_I.XPT']},
 'dietary': {'DSBI': ['DSBI.XPT'],
  'DSII': ['DSII.XPT'],
  'DSPI': ['DSPI.XPT'],
  'DRXIFF': ['DRXIFF.XPT', 'DRXIFF_B.XPT'],
  'DRXFMT': ['DRXFMT.XPT', 'DRXFMT_B.XPT'],
  'DSQFILE2': ['DSQFILE2.XPT'],
  'DRXTOT': ['DRXTOT.XPT', 'DRXTOT_B.XPT'],
  'DSQFILE1': ['DSQFILE1.XPT'],
  'DS1IDS': ['DS1IDS_H.XPT', 'DS1IDS_G.XPT', 'DS1IDS_F.XPT', 'DS1IDS_E.XPT'],
  'DR1TOT': ['DR1TOT_C.XPT',
   'DR1TOT_D.XPT',
   'DR1TOT_E.XPT',
   'DR1TOT_G.XPT',
   'DR1TOT_F.XPT',
   'DR1TOT_H.XPT',
   'DR1TOT_I.XPT'],
  'VARLK': ['VARLK_C.XPT'],
  'DSQ1': ['DSQ1_B.XPT', 'DSQ1_C.XPT', 'DSQ1_D.XPT'],
  'DS2TOT': ['DS2TOT_H.XPT', 'DS2TOT_G.XPT', 'DS2TOT_F.XPT', 'DS2TOT_E.XPT'],
  'DSQIDS': ['DSQIDS_H.XPT', 'DSQIDS_G.XPT', 'DSQIDS_F.XPT', 'DSQIDS_E.XPT'],
  'DR2IFF': ['DR2IFF_H.XPT',
   'DR2IFF_I.XPT',
   'DR2IFF_G.XPT',
   'DR2IFF_F.XPT'

In [104]:
with open('xpt_file_dict.json', 'w') as f:
    json.dump(xpt_file_dict, f)

In [107]:
xpt_file_dict['demographics']

{'DEMO': ['DEMO.XPT',
  'DEMO_G.XPT',
  'DEMO_F.XPT',
  'DEMO_D.XPT',
  'DEMO_E.XPT',
  'DEMO_B.XPT',
  'DEMO_C.XPT',
  'DEMO_H.XPT',
  'DEMO_I.XPT']}

In [118]:
demo_df = pd.read_sas('demographics/DEMO.xpt')
demo_df2 = pd.read_sas('demographics/DEMO_B.xpt')
demo_df3 = pd.read_sas('demographics/DEMO_C.xpt')

In [112]:
pd.concat([demo_df, demo_df2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,DMAETHN,DMARACE,DMDBORN,DMDCITZN,DMDEDUC,DMDEDUC2,DMDEDUC3,DMDHHSIZ,DMDHRAGE,DMDHRBRN,...,WTMREP43,WTMREP44,WTMREP45,WTMREP46,WTMREP47,WTMREP48,WTMREP49,WTMREP50,WTMREP51,WTMREP52
0,,,1.0,1.0,,,,3.0,27.0,1.0,...,11431.548739,11197.627101,10976.246652,11372.168411,1.132356e+04,11145.175931,1.099080e+04,11728.073781,10927.615791,11655.082962
1,,,1.0,1.0,3.0,5.0,,1.0,77.0,1.0,...,28924.957080,29016.385749,29874.143718,29559.795821,2.987449e+04,28364.509723,2.852559e+04,28965.388416,29143.509564,28705.128975
2,,,3.0,2.0,1.0,,3.000000e+00,4.0,37.0,3.0,...,46579.639702,46707.762423,49637.450326,47746.234623,4.672778e+04,46686.213166,4.750412e+04,47206.143524,48250.803538,46363.581943
3,,,1.0,1.0,,,,7.0,34.0,1.0,...,10601.255337,10425.580214,10245.050923,10542.918217,5.397605e-79,10431.037828,1.053015e+04,10753.405668,10479.950791,10910.423979
4,,,1.0,1.0,3.0,5.0,,3.0,42.0,3.0,...,102254.001874,99399.906199,104410.645039,103032.836121,1.016651e+05,99740.419843,1.000501e+05,101733.345824,103321.768919,100585.329837
5,,,1.0,1.0,3.0,,1.500000e+01,2.0,19.0,1.0,...,40223.145839,40081.731488,42429.396120,41344.983842,4.099348e+04,40051.112545,4.079848e+04,40328.920746,41468.066786,39965.248048
6,,,1.0,1.0,1.0,2.0,,1.0,59.0,1.0,...,26392.652712,25904.332418,25491.018084,26192.014475,2.569643e+04,26198.686935,2.645714e+04,26159.883475,26873.942227,27184.432305
7,,,1.0,1.0,1.0,,5.000000e+00,7.0,30.0,1.0,...,31817.044652,32532.991406,33513.553288,32457.911321,3.274574e+04,31847.180906,3.241737e+04,32126.999287,32540.407730,31564.022812
8,,,1.0,1.0,1.0,,5.000000e+00,4.0,37.0,1.0,...,7756.977231,7784.633585,7575.598399,7640.006476,7.853751e+03,7628.108795,7.830905e+03,8033.926407,7838.359295,8069.217840
9,,,1.0,1.0,2.0,3.0,,1.0,43.0,1.0,...,23023.867096,22720.497501,22425.429409,23269.099373,2.431686e+04,22318.530784,2.306857e+04,23183.065911,23093.888741,24148.668691


In [137]:
def combine_tables(data_type, xpt_dict):
    temp_df_list = []
    cwd = os.getcwd()
    for keys, values in xpt_dict[data_type].items():
        for value in values:
            print(f'Trying {cwd}/{data_type}/{value}')
            temp_df_list.append(pd.read_sas(f'{cwd}/{data_type}/{value}'))
            print(f'{cwd}/{data_type}/{value} appended')
    return pd.concat(temp_df_list)

In [161]:
demo_combo_df = combine_tables('demographics', xpt_file_dict)

Trying /Users/tomnahass/development/nhanes/demographics/DEMO.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_G.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_G.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_F.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_F.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_D.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_D.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_E.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_E.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_B.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_B.XPT appended
Trying /Users/tomnahass/development/nhanes/demographics/DEMO_C.XPT
/Users/tomnahass/development/nhanes/demographics/DEMO_C.XPT appended
Trying /Users/tomnahass/development/nhanes/demograph

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [165]:
df99 = pd.read_sas("demographics/DEMO.XPT")
df01 = pd.read_sas('demographics/DEMO_B.XPT')
df03 = pd.read_sas('demographics/DEMO_C.XPT')
df05 = pd.read_sas('demographics/DEMO_D.XPT')
df07 = pd.read_sas('demographics/DEMO_E.XPT')
df09 = pd.read_sas('demographics/DEMO_F.XPT')
df11 = pd.read_sas('demographics/DEMO_G.XPT')
df13 = pd.read_sas('demographics/DEMO_H.XPT')
df15 = pd.read_sas('demographics/DEMO_I.XPT')

In [142]:
cwd = os.getcwd()
pd.read_table('demographics/DEMO_G.XPT')

EmptyDataError: No columns to parse from file

In [176]:
.columns

NameError: name 'df17' is not defined

In [116]:
demo_df2.columns

Index(['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIDEXMON', 'RIAGENDR', 'RIDAGEYR',
       'RIDAGEMN', 'RIDAGEEX', 'RIDRETH1', 'RIDRETH2', 'DMQMILIT', 'DMDBORN',
       'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDEDUC', 'DMDSCHOL',
       'DMDMARTL', 'DMDHHSIZ', 'INDHHINC', 'INDFMINC', 'INDFMPIR', 'RIDEXPRG',
       'RIDPREG', 'DMDHRGND', 'DMDHRAGE', 'DMDHRBRN', 'DMDHREDU', 'DMDHRMAR',
       'DMDHSEDU', 'WTINT2YR', 'WTINT4YR', 'WTMEC2YR', 'WTMEC4YR', 'SDMVPSU',
       'SDMVSTRA'],
      dtype='object')

In [117]:
demo_df.columns

Index(['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIDEXMON', 'RIAGENDR', 'RIDAGEYR',
       'RIDAGEMN', 'RIDAGEEX', 'RIDRETH1', 'RIDRETH2',
       ...
       'WTIREP43', 'WTIREP44', 'WTIREP45', 'WTIREP46', 'WTIREP47', 'WTIREP48',
       'WTIREP49', 'WTIREP50', 'WTIREP51', 'WTIREP52'],
      dtype='object', length=144)

In [119]:
demo_df3.columns

Index(['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIDEXMON', 'RIAGENDR', 'RIDAGEYR',
       'RIDAGEMN', 'RIDAGEEX', 'RIDRETH1', 'RIDRETH2', 'DMQMILIT', 'DMDBORN',
       'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDEDUC', 'DMDSCHOL',
       'DMDMARTL', 'DMDHHSIZ', 'INDHHINC', 'INDFMINC', 'INDFMPIR', 'RIDEXPRG',
       'DMDHRGND', 'DMDHRAGE', 'DMDHRBRN', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU',
       'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY', 'FIAINTRP',
       'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANG', 'WTINT2YR', 'WTMEC2YR',
       'SDMVPSU', 'SDMVSTRA'],
      dtype='object')

In [120]:
pd.concat([demo_df2,demo_df3])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,AIALANG,DMDBORN,DMDCITZN,DMDEDUC,DMDEDUC2,DMDEDUC3,DMDHHSIZ,DMDHRAGE,DMDHRBRN,DMDHREDU,...,SDMVPSU,SDMVSTRA,SEQN,SIAINTRP,SIALANG,SIAPROXY,WTINT2YR,WTINT4YR,WTMEC2YR,WTMEC4YR
0,,1.0,1.0,3.0,4.0,,2.0,39.0,1.0,4.0,...,2.0,22.0,9966.0,,,,85045.160060,42497.504017,9.135299e+04,4.675312e+04
1,,3.0,1.0,3.0,4.0,,5.0,48.0,3.0,3.0,...,1.0,24.0,9967.0,,,,29465.456810,13514.790582,2.945668e+04,1.359495e+04
2,,1.0,1.0,1.0,2.0,,1.0,84.0,1.0,2.0,...,2.0,20.0,9968.0,,,,20658.109377,10069.359476,2.750814e+04,1.583396e+04
3,,1.0,1.0,3.0,5.0,,3.0,55.0,1.0,5.0,...,2.0,18.0,9969.0,,,,75077.431586,43769.053390,7.853632e+04,4.530426e+04
4,,1.0,1.0,1.0,,1.000000e+01,3.0,41.0,1.0,5.0,...,2.0,27.0,9970.0,,,,32563.194542,17881.930025,3.405998e+04,1.897820e+04
5,,3.0,1.0,1.0,,9.000000e+00,5.0,46.0,3.0,2.0,...,2.0,14.0,9971.0,,,,6759.477348,3380.162395,6.968237e+03,3.560386e+03
6,,1.0,1.0,2.0,3.0,,2.0,31.0,1.0,4.0,...,1.0,26.0,9972.0,,,,93545.001858,48456.532043,9.355893e+04,5.075880e+04
7,,1.0,1.0,1.0,2.0,,1.0,63.0,1.0,2.0,...,2.0,24.0,9973.0,,,,7108.817624,3688.245542,8.634478e+03,4.373134e+03
8,,1.0,1.0,1.0,,6.000000e+00,4.0,55.0,1.0,2.0,...,1.0,24.0,9974.0,,,,5649.685460,3193.463297,5.821589e+03,3.354029e+03
9,,1.0,1.0,3.0,4.0,,1.0,80.0,1.0,4.0,...,1.0,18.0,9975.0,,,,11858.353595,6713.068326,5.397605e-79,5.397605e-79


In [144]:
os.stat(cwd+'/demographics/DEMO_G.XPT').st_size

0

In [145]:
empty_list = []
for file in glob.glob('demographics/*'):
    if os.stat(file).st_size == 0:
        empty_list.append(file)
empty_list
    

['demographics/DEMO_G.XPT',
 'demographics/DEMO_H.XPT',
 'demographics/DEMO_I.XPT']

In [150]:
def grab_empty_files(data_type, base_url):
    empty_list = []
    cwd = os.getcwd()
    for file in glob.glob(f'{cwd}/{data_type}/*'):
        if os.stat(file).st_size == 0:
            empty_list.append(file)
            os.remove(file)
    if len(empty_list) == 0:
        print("There are no empty files in this folder")
    else:
        print(f"Now re-downloading {len(empty_list)} files")
        download_data(data_type, empty_list, base_url)
    
    

In [157]:
grab_empty_files('questionnaire', base_cdc_url)

There are no empty files in this folder


In [159]:
download_data('demographics', link_dictionary['demographics'], base_cdc_url)

Created demographics folder
Downloading DEMO.XPT at Wed, 14 Nov 2018 07:03:18
Downloaded DEMO.XPT at 5.88158392906189s
Downloading DEMO_B.XPT at Wed, 14 Nov 2018 07:03:24
Downloaded DEMO_B.XPT at 1.8606090545654297s
Downloading DEMO_C.XPT at Wed, 14 Nov 2018 07:03:26
Downloaded DEMO_C.XPT at 1.8775739669799805s
Downloading DEMO_D.XPT at Wed, 14 Nov 2018 07:03:28
Downloaded DEMO_D.XPT at 1.8439769744873047s
Downloading DEMO_E.XPT at Wed, 14 Nov 2018 07:03:30
Downloaded DEMO_E.XPT at 1.8325190544128418s
Downloading DEMO_F.XPT at Wed, 14 Nov 2018 07:03:31
Downloaded DEMO_F.XPT at 1.9972810745239258s
Downloading DEMO_G.XPT at Wed, 14 Nov 2018 07:03:33
Downloaded DEMO_G.XPT at 2.0403356552124023s
Downloading DEMO_H.XPT at Wed, 14 Nov 2018 07:03:36
Downloaded DEMO_H.XPT at 1.9920167922973633s
Downloading DEMO_I.XPT at Wed, 14 Nov 2018 07:03:38
Downloaded DEMO_I.XPT at 1.9578208923339844s


In [160]:
download_data('dietary', link_dictionary['dietary'], base_cdc_url)

Created dietary folder
Downloading DRXIFF.XPT at Wed, 14 Nov 2018 07:06:52
Downloaded DRXIFF.XPT at 33.201632022857666s
Downloading DRXTOT.XPT at Wed, 14 Nov 2018 07:07:25
Downloaded DRXTOT.XPT at 4.820607900619507s
Downloading DRXFMT.XPT at Wed, 14 Nov 2018 07:07:30
Downloaded DRXFMT.XPT at 0.39957594871520996s
Downloading DSBI.XPT at Wed, 14 Nov 2018 07:07:30
Downloaded DSBI.XPT at 4.332566976547241s
Downloading DSII.XPT at Wed, 14 Nov 2018 07:07:35
Downloaded DSII.XPT at 27.412736892700195s
Downloading DSPI.XPT at Wed, 14 Nov 2018 07:08:02
Downloaded DSPI.XPT at 1.8557190895080566s
Downloading DSQFILE1.XPT at Wed, 14 Nov 2018 07:08:04
Downloaded DSQFILE1.XPT at 0.33965492248535156s
Downloading DSQFILE2.XPT at Wed, 14 Nov 2018 07:08:04
Downloaded DSQFILE2.XPT at 0.5185039043426514s
Downloading DRXIFF_B.XPT at Wed, 14 Nov 2018 07:08:05
Downloaded DRXIFF_B.XPT at 39.67218995094299s
Downloading DRXTOT_B.XPT at Wed, 14 Nov 2018 07:08:44
Downloaded DRXTOT_B.XPT at 5.677817106246948s
Downl