In [1]:
from urllib.parse import urljoin

import saspy
import requests
from bs4 import BeautifulSoup, re
# Open a saspy session using the configuration named 'winlocal'; `sas` is the
# handle later cells use to reach SAS libraries and data sets.
sas = saspy.SASsession(cfgname='winlocal')

SAS Connection established. Subprocess id is 12844



In [7]:
%%SAS
/* Download the MEPS data-files landing page with PROC HTTP and store the
   response body in the file behind the SOURCE fileref. */
options nodate nonumber nosource;

filename source 'C:\Data\web_file1.txt';

proc http
    url = 'https://meps.ahrq.gov/data_stats/download_data_files.jsp'
    out = source;
run;

In [9]:
%%SAS
/* Read the saved page source line by line and write one detail-page URL
   (url1s) to NEW.URL1_DATA for every line that references an HC- file id
   and does not mention one of the excluded phrases. */
options nodate nonumber nosource;

filename source 'C:\Data\web_file1.txt';
libname new 'C:\Data';

data new.url1_data (keep = url1s);
    length file_id $ 10;

    /* Each record is read whole; RECLEN carries its actual length. */
    infile source length = reclen lrecl = 32767;
    input string $varying32767. reclen;

    /* Non-zero when the line holds a double quote followed by HC-<word characters>. */
    position = prxmatch('m/"HC-\w+/i', string);

    /* Flag lines that mention any phrase marking an entry to be skipped. */
    /* NOTE(review): the modifier for IC Linked Data is written with a leading
       blank before the i, unlike the other three - confirm this is intended. */
    excluded = find(string, 'CD-ROM', 'i') ^= 0
             | find(string, 'IC Linked Data', ' i') ^= 0
             | find(string, 'replaced', 'i') ^= 0
             | find(string, 'Population Characteristics', 'i') ^= 0;

    if position ^= 0 and not excluded then do;
        /* The file id is the second token when the line is split on double quotes. */
        file_id = scan(string, 2, '"');
        st = 'https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=';
        url1s = cats(st, file_id);
        output;
    end;
run;

In [10]:
# Pull the SAS data set NEW.URL1_DATA into pandas, then flatten it to a plain
# Python list. Each element is a one-item list holding a single URL, which is
# why later code indexes url1[0].
sas.saslib(libref='new', path="C:\\Data")
df_url1s = sas.sd2df(table='url1_data', libref='new')
url1s = df_url1s.values.tolist()

print(type(url1s))
print("Number of PUF URLs (url1s):", len(url1s))

# Display the first 5 data file urls
for url1 in url1s[:5]:
    print(url1)


16                                                         The SAS System                               12:18 Tuesday, April 4, 2023

87         
88         libname new    'C:\Data'  ;
NOTE: Libref NEW was successfully assigned as follows: 
      Engine:        V9 
      Physical Name: C:\Data
89         
90         
91         

17                                                         The SAS System                               12:18 Tuesday, April 4, 2023

92         
<class 'list'>
Number of PUF URLs (url1s): 366
['https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-227']
['https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-226']
['https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-225']
['https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-224']
['https://meps.ahrq.gov/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-223']


In [5]:
# Constructing data file format-specific URLs (url2s) from individual file websites.
# For every PUF detail page in url1s, the page title is written to the listing
# file, followed by the absolute URL of each link whose text ends with 'ZIP'.
REQUEST_TIMEOUT = 60  # seconds; a stalled connection raises instead of hanging the cell
URL_LIST_PATH = r'C:\Data\urls_saspy.markdown'

url2s = []
# Explicit UTF-8 so a non-ASCII character in a page title cannot fail on the
# platform's default code page.
with open(URL_LIST_PATH, 'w', encoding='utf-8') as f:
    for url1 in url1s:
        response = requests.get(url1[0], timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
        soup = BeautifulSoup(response.text, "html.parser")

        title_box = soup.find(class_="OrangeBox")
        # Fall back to the page URL when the title element is missing, rather
        # than failing with AttributeError on None.
        puf_title = title_box.text if title_box is not None else url1[0]
        print(puf_title, '(format-specific URLs below)', file=f)

        # href=True skips anchors that carry no href attribute at all.
        for link in soup.find_all('a', href=True):
            if link.text.endswith('ZIP'):
                # Resolve the (apparently relative) href against the page URL
                # instead of stripping dots and prepending the host by hand.
                url2 = urljoin(response.url, link['href'].strip())
                print(url2, file=f)
                url2s.append(url2)

# Read the listing back; the context manager closes the file even on error.
with open(URL_LIST_PATH, encoding='utf-8') as f:
    fn_url2s = f.readlines()

print('Number of data file format-specific URLs (url2s):', f"{len(url2s):,d}")
# One data file name & 5 url2s
for item in fn_url2s[:6]:
    print(item)

Number of data file format-specific URLs (url2s): 1,071
MEPS HC-227: 2021 Jobs File (format-specific URLs below)

https://meps.ahrq.gov/data_files/pufs/h227/h227dat.zip

https://meps.ahrq.gov/data_files/pufs/h227/h227ssp.zip

https://meps.ahrq.gov/data_files/pufs/h227/h227v9.zip

https://meps.ahrq.gov/data_files/pufs/h227/h227dta.zip

https://meps.ahrq.gov/data_files/pufs/h227/h227xlsx.zip



In [6]:
# Tally how many format-specific URLs (url2s) answer with HTTP 200.
# stream=True defers downloading the response body, so the ZIP payloads are
# never transferred; the `with` block releases each connection afterwards.
REQUEST_TIMEOUT = 60  # seconds; a stalled connection raises instead of hanging the cell

count_ErrorFound = 0
count_200_ok = 0
for url2 in url2s:
    try:
        with requests.get(url2, stream=True, timeout=REQUEST_TIMEOUT) as response:
            status_ok = response.status_code == 200
    except requests.RequestException:
        # A connection failure or timeout counts as unsuccessful rather than
        # aborting the whole check.
        status_ok = False

    if status_ok:
        count_200_ok += 1
    else:
        count_ErrorFound += 1

print("HTTP response request (url2s), NOT SUCCESSFUL:",count_ErrorFound)
print("HTTP response request (url2s), SUCCESSFUL:",f"{count_200_ok:,d}")

HTTP response request (url2s), NOT SUCCESSFUL: 0
HTTP response request (url2s), SUCCESSFUL: 1,071
