In [14]:
# Step 1: List all MEPS data file URLs 
import requests
from bs4 import BeautifulSoup

# Accumulator that get_links() appends to; re-running this cell empties it.
full_url_list = list()

# Link-text suffixes that get_links() keeps: the five data-file formats, plus
# a trailing '/' which marks a sub-directory to descend into.
desired_part = (
    'v9.zip',
    'ssp.zip',
    'dta.zip',
    'dat.zip',
    'xlsx.zip',
    '/',
)
def get_links(base_url, timeout=30):
    """Recursively collect data-file URLs reachable from a listing page.

    Fetches ``base_url``, inspects the text of every ``<a>`` element and keeps
    the ones whose text ends with a suffix in the module-level
    ``desired_part``. Text ending in ``/`` is treated as a sub-directory and
    crawled in turn; any other match is appended to the module-level
    ``full_url_list`` as ``base_url + text``.

    Args:
        base_url: URL of the listing page. Child URLs are built by plain
            string concatenation, so it is expected to end with ``/``.
        timeout: Value passed to ``requests.get`` as its ``timeout``; without
            one, a stalled server would block the cell indefinitely.

    Returns:
        None. Results accumulate in ``full_url_list``.
    """
    import warnings

    response = requests.get(base_url, timeout=timeout)
    if not response.ok:
        # An error page is not a listing; parsing it would quietly produce an
        # incomplete crawl, so make the gap visible instead.
        warnings.warn(f"Skipping {base_url}: HTTP status {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for anchor in soup.find_all('a'):
        link_text = anchor.get_text()
        if not link_text.endswith(desired_part):
            continue
        if link_text.endswith('/'):
            # '/', './' and '../' point back at this page or its parent;
            # following them would recurse without ever terminating.
            if link_text.rstrip('/') in ('', '.', '..'):
                continue
            get_links(base_url + link_text, timeout=timeout)
        else:
            full_url_list.append(base_url + link_text)
# Crawl the MEPS public-use-file tree, then report the total and a short sample.
get_links('https://meps.ahrq.gov/data_files/pufs/')

url_total = len(full_url_list)
print(f"There are {url_total:,} Full URLs for five format-specific data files.")
print('Listing of the first 5 URLs')
for sample_url in full_url_list[:5]:
    print(sample_url)


There are 1,349 Full URLs for five format-specific data files.
Listing of the first 5 URLs
https://meps.ahrq.gov/data_files/pufs/h01dat.zip
https://meps.ahrq.gov/data_files/pufs/h036/h36dta.zip
https://meps.ahrq.gov/data_files/pufs/h036/h36u19dat.zip
https://meps.ahrq.gov/data_files/pufs/h036/h36u19dta.zip
https://meps.ahrq.gov/data_files/pufs/h036/h36u19ssp.zip


In [6]:
# Step 2: Create a dataframe from the list
import pandas as pd  
import numpy as np

# Build a one-column frame of URLs, then tag every row with its file-format suffix.
col1 = "full_url"
df = pd.DataFrame({col1:full_url_list})
values = ['v9.zip', 'ssp.zip', 'dta.zip', 'dat.zip', 'xlsx.zip']
# Match each suffix literally at the END of the URL. str.contains would read
# the value as a regular expression (so '.' matches any character) and would
# accept a hit anywhere in the path, either of which can mislabel a URL.
sub_string = [df[col1].str.endswith(suffix) for suffix in values]
# np.select uses the first true condition per row; rows that match no suffix
# fall back to 'other'.
df['sub_string'] = np.select(sub_string, values, 'other')
print('There are', f"{len(df):,}", 'Full URLs for data files with extensions of interest.')
df['sub_string'].value_counts()

There are 1,349 Full URLs for data files with extensions of interest.


dat.zip     557
ssp.zip     557
dta.zip      79
v9.zip       78
xlsx.zip     78
Name: sub_string, dtype: int64

In [7]:
# Step 3: Create an Excel Workbook with multiple sheets
import pandas as pd 
import xlsxwriter 

# Write one worksheet per file-format suffix; each sheet holds only the URL column.
with pd.ExcelWriter('MEPS_urls_WB_Oct19.xlsx') as writer:
    for suffix, url_group in df.groupby('sub_string'):
        urls_only = url_group.drop(columns='sub_string')
        urls_only.to_excel(writer, sheet_name=suffix, index=False)

In [7]:
# Show the kernel's current working directory; relative paths such as the
# workbook filename used in Step 3 resolve against it.
%pwd

'C:\\Jupyter_Lab'