The program uses the functions updated by Charles Han on 3/15/2023

In [1]:
# Step 1: Importing libraries

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, re, Comment


In [2]:
# Step 2: Creating functions

# Find the option markup that lies between the "All data files" option and
# the closing </select> tag
def extractOptions(inputData):
    """Extract the markup that follows the "All data files" option.

    Parameters
    ----------
    inputData : str
        Markup text to search.

    Returns
    -------
    str
        The text between ``<option value="All">All data files</option>`` and
        the first ``</select>`` after it. When those markers are not present
        an empty string is returned (the same "no match" value that
        ``extractData`` uses), so callers can apply string methods such as
        ``splitlines()`` to the result without a ``None`` check.
    """
    regex = r'<option value=\"All\">All data files</option>(.*?)</select>'
    # re.S lets "." span newlines, since the options sit on separate lines.
    match = re.search(regex, inputData, flags=re.S)
    return match.group(1) if match else ''

# Pull the display text out of the first <option value=...> element
def extractData(inputData):
    """Return the text enclosed by the first ``<option value=...>`` element.

    An empty string is returned when ``inputData`` holds no complete
    ``<option value=...>...</option>`` element.
    """
    option_pattern = r"<option value=[^>]*>(.*?)<\/option>"
    first_option = re.search(option_pattern, inputData, flags=re.S)
    if first_option is None:
        return ''
    return first_option.group(1)


In [3]:

# Step 3: Constructing data file URLs (url1s) from the main website
#
# The download page is fetched and its HTML comment nodes are searched for the
# "pufnumber" <select>. Each option's text is filtered down to the HC entries
# of interest, saved to a text file, and turned into one detail-page URL per
# HC identifier found in those entries.
response = requests.get('https://meps.ahrq.gov/data_stats/download_data_files.jsp',
                        timeout=60)  # do not hang forever on a stalled connection
response.raise_for_status()  # stop here rather than parse an HTTP error page
soup = BeautifulSoup(response.text, "html.parser")
comments = soup.find_all(string=lambda text: isinstance(text, Comment))

unwanteds = ['IC Linked Data', 'CD-ROM', 'replaced', 'Population Characteristics']
st = "https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber="

# Defined before the loop so that later cells see (possibly empty) lists
# instead of raising NameError when the expected <select> is not found.
filtered = []
lines = []
list_id = []
url1s = []

for c in comments:
    if '<select id="pufnumber" size=1 name="cboPufNumber">' in c:
        # extractOptions may yield None when its markers are missing;
        # fall back to an empty string so splitlines() is always valid.
        options = extractOptions(c) or ''
        filtered = []
        for op in options.splitlines():  # one candidate option per line
            data = extractData(op)
            if data.startswith(('MEPS HC', 'HC')) and not \
            any(item in data for item in unwanteds):
                filtered.append(data)

        # Newline-terminated file name entries, saved for reference
        lines = [data + '\n' for data in filtered]
        with open(r'C:\Data\MEPS_fn_x.txt', 'w') as f:
            f.writelines(lines)

        # Extract the query strings from the entry text itself
        list_id = re.findall(r'\bHC-\w+\b', ''.join(lines))
        url1s = [st + x for x in list_id]  # Construct URLs (url1s)
        print('Number of items (file name entries):', len(lines))
        for url1 in url1s[:5]:  # Display the first 5 data file urls
            print(url1)

if not url1s:
    print('No data file entries were found on the download page.')

Number of items (file name entries): 366
https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-227
https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-226
https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-225
https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-224
https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-223


In [4]:
# Step 4: Constructing data file format-specific URLs (url2s) from individual file websites
#
# Every detail page in url1s is fetched; the text of its "OrangeBox" element
# is written to the output file as a heading line, followed by one absolute
# URL for each link whose text ends with 'ZIP'.

with open(r'C:\Data\urls_rev_x.markdown', 'w') as f:
    url2s = []
    for url1 in url1s:
        response = requests.get(url1, timeout=60)  # do not hang on a stalled connection
        response.raise_for_status()  # stop rather than parse an HTTP error page
        soup = BeautifulSoup(response.text, "html.parser")
        box = soup.find(class_="OrangeBox")
        # Fall back to the page URL as the heading when the box is absent,
        # instead of failing with AttributeError on None.
        li = box.text if box is not None else url1
        print(li, '(format-specific URLs below)', file=f)
        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href cannot yield a URL, so skip them.
            if href and link.text.endswith('ZIP'):
                # urljoin resolves relative hrefs (e.g. a leading "../")
                # against the page URL; str.strip('..') only removed stray
                # "." characters from both ends of the href.
                url2 = urljoin(url1, href)
                print(url2, file=f)
                url2s.append(url2)

# Read the saved file back so the heading lines and URLs can be previewed
with open(r'C:\Data\urls_rev_x.markdown') as f:
    fn_url2s = f.readlines()
print('Number of data file format-specific URLs (url2s):', f"{len(url2s):,d}")
for item in fn_url2s[:6]:  # One data file name & 5 url2s
    print(item, end='')  # each line already ends with a newline


Number of data file format-specific URLs (url2s): 1,071
MEPS HC-227: 2021 Jobs File (format-specific URLs below)

https://meps.ahrq.gov/data_files/pufs/h227/h227dat.zip

https://meps.ahrq.gov/data_files/pufs/h227/h227ssp.zip

https://meps.ahrq.gov/data_files/pufs/h227/h227v9.zip

https://meps.ahrq.gov/data_files/pufs/h227/h227dta.zip

https://meps.ahrq.gov/data_files/pufs/h227/h227xlsx.zip



In [5]:
# Step 5: Checking that every format-specific URL (url2s) answers with HTTP 200
count_ErrorFound = 0
count_200_ok = 0
for url2 in url2s:
    try:
        # stream=True defers downloading the response body, so the ZIP
        # contents are not fetched just to read the status code; the
        # with-block releases the connection afterwards.
        with requests.get(url2, stream=True, timeout=60) as response:
            status_ok = response.status_code == 200
    except requests.RequestException:
        # A URL that cannot be reached at all is counted as an error
        # instead of aborting the whole check.
        status_ok = False
    if status_ok:
        count_200_ok +=1
    else:
        count_ErrorFound +=1
print("url2s count_ErrorFound",count_ErrorFound)
print("url2s count_200_ok",count_200_ok)
       

url2s count_ErrorFound 0
url2s count_200_ok 1071
