In [3]:
import pandas as pd
import os
import argparse
import requests
from bs4 import BeautifulSoup
import json

In [43]:
# Folder holding the JSON configuration files and folder receiving generated data
config_directory = "config"
data_directory = 'data'


def _load_json_file(json_file_path):
    """Parse and return the JSON document stored at ``json_file_path``.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale default
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


# Iterated by extract_codebook as: year range -> mapping whose values are lists of file names
file_directories_file_name = "file_directories.json"
file_directories_file_path = os.path.join(config_directory, file_directories_file_name)
file_directory_dictionary = _load_json_file(file_directories_file_path)

# Indexed by year range; the entry is tested for membership of codebook variable names
data_attributes_file_name = "data_attributes.json"
data_attributes_file_path = os.path.join(config_directory, data_attributes_file_name)
data_attribute_dict = _load_json_file(data_attributes_file_path)

# Looked up with a codebook variable name to obtain the name stored in the 'attribute_name' column
attribute_name_map_file_name = 'data_attribute_names_map.json'
attribute_name_map_file_path = os.path.join(config_directory, attribute_name_map_file_name)
attribute_name_map = _load_json_file(attribute_name_map_file_path)

In [50]:
import csv

def write_to_csv(file_path, new_row, header=False):
    """Write one record to a CSV file.

    Parameters
    ----------
    file_path : str or path-like
        Destination CSV file. Missing parent directories are created.
    new_row : iterable
        Values of the record to write.
    header : bool, default False
        When True, ``new_row`` starts a fresh file: any existing content at
        ``file_path`` is replaced. When False, the record is appended and the
        file is created if it does not exist yet.

    Raises
    ------
    OSError
        If the directory or file cannot be created or written.
    """
    # Append mode already creates a missing file; open() only fails with
    # FileNotFoundError when the parent directory is absent, so create it.
    parent_directory = os.path.dirname(file_path)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    # 'w' truncates, which replaces the earlier remove-then-append sequence
    mode = 'w' if header else 'a'

    # UTF-8 matches the default encoding pandas.read_csv expects
    with open(file_path, mode, newline='', encoding='utf-8') as file:
        csv.writer(file).writerow(new_row)


In [54]:
# URL template of a codebook page; extract_codebook fills the first slot with
# the start year of the year range and the second with the data file name.
codebook_url = 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{}/DataFiles/{}.htm#Codebook'

# Location of the CSV file that receives the extracted codebook records
file_name = 'codebook.csv'
file_path = os.path.join(data_directory, file_name)

# Column order of every record written to the codebook CSV
header = [
    'year_range',
    'attribute',
    'attribute_name',
    'description',
    'is_range',
    'start_range_value',
    'end_range_value',
    'acceptable_values',
]

def _codebook_page_name(data_file):
    """Return ``data_file`` without a trailing ``.xpt`` extension (any letter case)."""
    # str.strip('.xpt') would remove any of the characters '.', 'x', 'p', 't'
    # from both ends and could eat part of the stem; drop only the extension.
    stem, extension = os.path.splitext(data_file)
    return stem if extension.lower() == '.xpt' else data_file


def _build_codebook_row(year_range, variable_name, attribute_name, code_or_value, value_description):
    """Return the CSV record for one codebook value, or None if it describes missing data."""
    description = value_description.lower()
    if 'missing' in description:
        return None

    # year_range, attribute, attribute_name, description, is_range,
    # start_range_value, end_range_value, acceptable_values
    if 'range' in description:
        # Range codes look like '<start> to <end>'
        bounds = code_or_value.split('to')
        return [year_range, variable_name, attribute_name, value_description,
                True, float(bounds[0]), float(bounds[1]), None]

    return [year_range, variable_name, attribute_name, value_description,
            False, None, None, code_or_value]


def extract_codebook(request_timeout=30):
    """Download the codebook pages and store the value definitions in ``file_path``.

    The CSV at ``file_path`` is restarted with ``header``. Then, for every year
    range and data file listed in ``file_directory_dictionary``, the matching
    page built from ``codebook_url`` is fetched and parsed. For each variable
    listed in ``data_attribute_dict`` for that year range, one record per row
    of its values table is appended, skipping rows whose description mentions
    'missing'.

    Parameters
    ----------
    request_timeout : float, default 30
        Seconds to wait for the server before the request is aborted.

    Raises
    ------
    requests.RequestException
        If a page cannot be fetched (including timeouts and HTTP error statuses).
    KeyError
        If a year range has no entry in ``data_attribute_dict``.
    """
    write_to_csv(file_path, header, header=True)

    for year_range, file_dictionary in file_directory_dictionary.items():
        year_start = year_range.split('-')[0]
        # Loop-invariant for the whole year range
        wanted_attributes = data_attribute_dict[year_range]

        for file_list in file_dictionary.values():
            for data_file in file_list:
                url = codebook_url.format(year_start, _codebook_page_name(data_file))

                # Without a timeout a stalled connection would block forever
                response = requests.get(url, timeout=request_timeout)
                response.raise_for_status()  # Check for request errors

                soup = BeautifulSoup(response.text, 'html.parser')

                # Each 'div' with class 'pagebreak' is read as one variable's section
                for div in soup.find_all('div', class_='pagebreak'):
                    name_tag = div.find('dd', class_='info')
                    if name_tag is None:
                        continue
                    variable_name = name_tag.text.strip()

                    # We only care about storing the attributes we are interested in
                    if variable_name not in wanted_attributes:
                        continue

                    table = div.find('table', class_='values')
                    if table is None:
                        continue

                    attribute_name = attribute_name_map.get(variable_name, None)

                    for table_row in table.find_all('tr')[1:]:  # Skip the header row
                        columns = table_row.find_all('td')
                        if len(columns) < 2:
                            # Not a code/description row
                            continue

                        csv_row = _build_codebook_row(
                            year_range,
                            variable_name,
                            attribute_name,
                            columns[0].text.strip(),
                            columns[1].text.strip(),
                        )
                        if csv_row is not None:
                            write_to_csv(file_path, csv_row)

In [53]:
# Display the extracted codebook; reuse the path constant the writer uses
pd.read_csv(file_path)

Unnamed: 0,year_range,atttribute,is_range,start_range_value,end_range_value,acceptable_values
0,1999-2000,RIAGENDR,False,,,1.0
1,1999-2000,RIAGENDR,False,,,2.0
2,1999-2000,RIDAGEMN,True,0.0,1019.0,
3,1999-2000,BPXSY1,True,74.0,242.0,
4,1999-2000,BPXDI1,True,0.0,132.0,
...,...,...,...,...,...,...
538,2017-2018,DIQ010,False,,,1.0
539,2017-2018,DIQ010,False,,,2.0
540,2017-2018,DIQ010,False,,,3.0
541,2017-2018,DIQ010,False,,,7.0
