In [9]:
import os
import json
import pandas as pd

In [13]:
ebpage_url = 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component={}&CycleBeginYear={}'
data_attributes_file_path = "data_attributes.json"
file_directories_file_path = 'file_directories.json'

start_years = [1999, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015]
output_directory = 'data_test'

attribute_name_map_file_path = 'data_attribute_names_map.json'

with open(attribute_name_map_file_path, "r") as file:
        attribute_name_map = json.load(file)

In [49]:
def process_attributes(directory, year_range):
    with open(attribute_name_map_file_path, "r") as file:
        attribute_name_map = json.load(file)

    result_df = pd.DataFrame({'SEQN': []})
    for file_name in os.listdir(directory):
        data_file_path = os.path.join(directory, file_name)
        file_df = pd.read_sas(data_file_path, format='xport')
        
        # if 'SEQN' in file_df.columns:
        #     print('SEQN FOUND')

        
        extracted_columns = file_df.columns.intersection(attribute_name_map.keys())
        extracted_columns = list(extracted_columns) + ['SEQN']
        print(f'Length of dataset extracted from {file_name}: ', len(file_df))
        df_i = file_df[extracted_columns]
        df_i = df_i.rename(columns=attribute_name_map)

        result_df = result_df.merge(df_i, how='outer', on='SEQN')

    print('Length of resulting DF:', len(result_df))
    return result_df

In [52]:
dfs = []
for start_year in start_years:
    year_range = str(start_year) + '-' + str(start_year + 1)
    print(f'~~~~EXTRACTING {year_range} DATA~~~~')

    output_file_path = f'{output_directory}/{year_range}'
    print()

    # extract all of the desired attributes into the same dataframe
    dfs.append(process_attributes(output_file_path, year_range))

    # break
    print(f'~~~~DONE.~~~~\n')

pd.concat(dfs)


~~~~EXTRACTING 1999-2000 DATA~~~~

Length of dataset extracted from BMX.xpt:  9282
Length of dataset extracted from BPX.xpt:  9282
Length of dataset extracted from DEMO.xpt:  9965
Length of dataset extracted from DIQ.xpt:  9493
Length of dataset extracted from LAB10.xpt:  6758
Length of dataset extracted from LAB13.xpt:  8344
Length of dataset extracted from LAB18.xpt:  6758
Length of dataset extracted from LAB25.xpt:  8832
Length of dataset extracted from MCQ.xpt:  9493
Length of dataset extracted from PAQ.xpt:  9188
Length of resulting DF: 9965
~~~~DONE.~~~~

~~~~EXTRACTING 2001-2002 DATA~~~~

Length of dataset extracted from BMX_B.xpt:  10477
Length of dataset extracted from BPX_B.xpt:  10477
Length of dataset extracted from DEMO_B.xpt:  11039
Length of dataset extracted from DIQ_B.xpt:  10470
Length of dataset extracted from L10_2_B.xpt:  557
Length of dataset extracted from L10_B.xpt:  7445
Length of dataset extracted from L13_B.xpt:  9262
Length of dataset extracted from L25_B.xp

Unnamed: 0,SEQN,Weight,Body mass index,Systolic,Diastolic,Gender,Age,Diabetes,Glycohemoglobin,Cholesterol,...,Mean volume of platelets,Coronary heart disease,Blood related diabetes,Blood related stroke,Moderate-work,Vigorous-work,Glycohemoglobin_x,Glycohemoglobin_y,Glucose_x,Glucose_y
0,1.0,3.0,14.90,,,2.0,29.0,2.0,,,...,,,,,,,,,,
1,2.0,,24.90,106.0,58.0,1.0,926.0,2.0,4.7,5.56,...,7.7,2.0,2.0,2.0,,3.0,,,,
2,3.0,,17.63,110.0,60.0,2.0,125.0,2.0,,3.34,...,8.6,,,,,,,,,
3,4.0,,,,,1.0,22.0,2.0,,,...,7.8,,,,,,,,,
4,5.0,,29.10,122.0,82.0,1.0,597.0,2.0,5.5,7.21,...,10.4,2.0,2.0,2.0,17.0,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9966,93698.0,,,,,1.0,,2.0,,,...,7.0,,,,,,,,,
9967,93699.0,,18.20,,,2.0,,2.0,,4.71,...,7.3,,,,,,,,,
9968,93700.0,,26.00,104.0,62.0,1.0,,2.0,5.2,3.72,...,9.6,2.0,2.0,,2.0,2.0,,,,
9969,93701.0,,18.10,114.0,48.0,1.0,,2.0,,5.09,...,7.8,,,,,,,,,
