In [1]:
# Import required Python libraries

from urllib.parse import urljoin

import pandas as pd
import requests
import xlsxwriter
from bs4 import BeautifulSoup, re, Comment

# Step 1: Scrape the main MEPS "download data files" page, find the data
# file names inside the <option> tags of the commented-out <select> block,
# and save them, one per line, to a .csv file

def extractOptions(inputData):
    """Return the raw markup that follows the "All data files" option.

    Parameters
    ----------
    inputData : str
        Markup to search (``main`` passes the text of an HTML comment).

    Returns
    -------
    str
        Everything after ``<option value="All">All data files</option>`` up
        to the last ``</select>`` in ``inputData`` (the capture is greedy and
        may span several lines), or ``''`` when the two markers are not found.
    """
    start_marker = re.escape('<option value="All">All data files</option>')
    end_marker = re.escape('</select>')
    # re.S lets "." match newlines, so a multi-line option list is captured.
    match = re.search(start_marker + "(.*)" + end_marker, inputData, flags=re.S)
    # Return '' (as extractData does) instead of falling through to None, so
    # callers can use string methods on the result unconditionally.
    return match.group(1) if match else ''

def extractData(inputData):
    """Pull the display text out of an ``<option ...>text</option>`` fragment.

    Returns the text between the first ``>`` and the last ``</option>`` found
    in ``inputData``, or an empty string when that pattern does not occur.
    """
    pattern = str(re.escape('>')) + "(.*)" + str(re.escape('</option>'))
    hit = re.search(pattern, inputData, flags=re.S)
    if hit is None:
        return ''
    return hit.group(1)

def main(base_url, out_path=r'C:/Data/MEPS_fn.csv'):
    """Scrape the MEPS data-file page and save the HC file names to a file.

    The page at ``base_url`` is downloaded and its HTML comments are searched
    for the commented-out ``<select id="pufnumber" ...>`` block. Each option
    label that starts with "MEPS HC" or "HC" and does not contain "-IC",
    "replaced" or "CD-ROM" is written to ``out_path``, one per line, and the
    number of names written is printed.

    Parameters
    ----------
    base_url : str
        URL of the MEPS "download data files" page.
    out_path : str, optional
        Destination file; defaults to the path the later cells read from.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server.
    response = requests.get(base_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    select_marker = '<select id="pufnumber" size=1 name="cboPufNumber">'
    excluded_terms = ("-IC", "replaced", "CD-ROM")

    for c in comments:
        if select_marker not in c:
            continue

        # "or ''" guards against extractOptions finding no option list.
        options = extractOptions(c) or ''
        names = []
        for op in options.splitlines():
            data = extractData(op)
            if data.startswith(("MEPS HC", "HC")) and not any(
                term in data for term in excluded_terms
            ):
                names.append(data)

        # Write UTF-8 explicitly: the platform default encoding varies, while
        # pandas.read_csv (used on this file in Step 2) assumes UTF-8.
        with open(out_path, 'w', encoding='utf-8') as fp:
            fp.writelines(name + '\n' for name in names)

        # Count what was written instead of re-reading the file; this also
        # reports 0 cleanly when no names matched.
        print(f"{len(names)}", 'file names listed in the MEPS website')
                
# Run Step 1 against the MEPS "download data files" page.
main('https://meps.ahrq.gov/data_stats/download_data_files.jsp')

402 file names listed in the MEPS website


In [2]:
# Step 2: Creating a Python Pandas DataFrame based on the .csv file created in the previous step

colname = ['file_name']
# The file holds one file name per line. sep='\t' is presumably used so that
# commas inside a name do not split it into several columns -- TODO confirm.
df1 = pd.read_csv(r'C:/Data/MEPS_fn.csv', sep='\t', names=colname).reset_index(drop=True)

# The PUF number is the first run of digits plus any capital letters that
# follow it (e.g. "226" in "MEPS HC-226: ..."). expand=False returns a Series
# directly, so no row-wise sum is needed to collapse a one-column DataFrame.
puf_number = df1["file_name"].str.extract(r"(\d+[A-Z]*)", expand=False)

# Fail early with the offending names rather than building an unusable URL.
assert puf_number.notna().all(), (
    "No PUF number found in: "
    + ", ".join(df1.loc[puf_number.isna(), "file_name"].astype(str))
)

df1["url1"] = (
    "https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-"
    + puf_number
)

print(f"There are {len(df1)} MEPS public-use filenames listed in the MEPS Data File Web Page.\n")


There are 402 MEPS public-use filenames listed in the MEPS Data File Web Page.



In [6]:
#Step 3: Scraping the MEPS websites and displaying the manipulated data
# For every detail-page URL in df1['url1'], print the file title and collect
# the absolute URL of each link whose text ends with 'ZIP'.
pd.set_option("display.max_colwidth", None)
url2_str_list = []

# One Session reuses the HTTP connection across the per-file requests.
with requests.Session() as session:
    for url1_str in df1['url1']:
        # A timeout keeps one stalled page from hanging the whole loop.
        response = session.get(url1_str, timeout=60)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # The title is read from the element with class "OrangeBox"; fall back
        # to a placeholder instead of crashing when the page has none.
        title_box = soup.find(class_="OrangeBox")
        file_title = title_box.text if title_box is not None else '(title not found)'

        print('URL for', file_title)
        print(url1_str)
        print('URLs for the data file in multiple formats, if available')
        for link in soup.find_all('a'):
            href = link.get('href')
            # Skip anchors without an href.
            if not href or not link.text.endswith('ZIP'):
                continue
            # Resolve the href against the page URL (the hrefs appear to be
            # relative, e.g. "../data_files/..."). str.strip('..') removes
            # every leading/trailing "." character rather than a "../"
            # prefix, so it only produced the right path by accident.
            url2_str = urljoin(url1_str, href)
            print(url2_str)
            url2_str_list.append(url2_str)

print('A total of', f"{len(url2_str_list):,d}", ' MEPS-HC data file format-specific URLs listed on the MEPS website')
  

URL for MEPS HC-226: MEPS Panel 23 Three-Year Longitudinal Data File
https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-226
URLs for the data file in multiple formats, if available
https://meps.ahrq.gov/data_files/pufs/h226/h226dat.zip
https://meps.ahrq.gov/data_files/pufs/h226/h226ssp.zip
https://meps.ahrq.gov/data_files/pufs/h226/h226v9.zip
https://meps.ahrq.gov/data_files/pufs/h226/h226dta.zip
https://meps.ahrq.gov/data_files/pufs/h226/h226xlsx.zip
URL for MEPS HC-225: MEPS Panel 24 Longitudinal Data File
https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-225
URLs for the data file in multiple formats, if available
https://meps.ahrq.gov/data_files/pufs/h225/h225dat.zip
https://meps.ahrq.gov/data_files/pufs/h225/h225ssp.zip
https://meps.ahrq.gov/data_files/pufs/h225/h225v9.zip
https://meps.ahrq.gov/data_files/pufs/h225/h225dta.zip
https://meps.ahrq.gov/data_files/pufs/h225/h225xlsx.zip
URL for MEPS HC-224: 2020 Full Year Conso