In [158]:
import io #for input and output operations
import requests #to get() the PDFs or url

import pdfplumber #package for reading info from the PDFs
import re #regular expressions allows a user to search for strings
import pandas as pd
from string import digits

In [159]:
url = 'https://www.whitehouse.gov/wp-content/uploads/2023/04/Justice40-Covered-Programs-List_v1.4_04-20-2023.pdf' #website containing PDFs
r = requests.get(url) #retrieve url
f = io.BytesIO(r.content) #manipulate binary data in-memory; this functions the same as with open(file, 'rb') but serves as a temporary buffer

In [160]:
def extract_text_from_pdf(pdf):
    extracted_text = ""
    for page in pdf.pages:
        extracted_text += page.extract_text()
    return extracted_text

with pdfplumber.open(f) as pdf:
# Extract text from the entire PDF
    text = extract_text_from_pdf(pdf)



In [161]:

bureau_keywords = ["Army Corps", "Covered programs list for AmeriCorps", "Agricultural Research Service (ARS)","Farm Service Agency (FSA)","Forest Service","National Institute of Food and Agriculture (NIFA)","Natural Resources Conservation Service (NRCS)","Risk Management Agency (RMA)","Rural Business-Cooperative Service (RBCS)","Rural Housing Service (RHS)","Rural Utilities Service (RUS)","Departmentwide","National Oceanic and Atmospheric Administration (NOAA)",
"Economic Development Administration (EDA)","National Institute of Standards and Technology (NIST)","Advanced Research Projects Agency - Energy (ARPA-E)","Bonneville Power Administration (BPA)","Federal Energy Management Program (FEMP)","Grid Deployment Office (GDO)","Loan Programs Office (LPO)","Office of Clean Energy Demonstrations (OCED)","Office of Cybersecurity, Energy Security, and Emergency Response (CESER)","Office of Economic Impact and Diversity (ED)","Office of Electricity (OE)","Office of Energy Efficiency and Renewable Energy (EERE)",
"Office of Environmental Management (EM)","Office of Fossil Energy and Carbon Management (FECM)","Office of Indian Energy Policy and Programs (IE)","Office of Legacy Management (LM)","Office of Manufacturing and Energy Supply Chains (MSC)","Office of Nuclear Energy (NE)","Office of Safety, Infrastructure, and Operations (NNSA)","Office of Science (SC)","Office of State and Community Energy Programs (SCP)", "Office of Technology Transitions (OTT)","Southeastern Power Administration (SEPA)","Southwestern Power Administration (SWPA)",
"Western Area Power Administration (WAPA)","Centers for Disease Control and Prevention","National Institutes of Health","Administration for Children and Families","Assistant Secretary for Administration","Federal Emergency Management Agency","Covered programs list for HUD","Bureau of Land Management","Bureau of Reclamation", "Bureau of Indian Affairs", "Bureau of Indian Education", "Bureau of Ocean Energy Management", "Bureau of Safety and Environmental Enforcement", "Fish and Wildlife Service", "National Park Service", 
"Office of Surface Mining Reclamation and Enforcement", "Office of Insular Affairs", "Department-Wide Programs", "Employment and Training Administration", "International Commissions", "Federal Highway Administration (FHWA)", "Federal Railroad Administration (FRA)", "Federal Transportation Administration (FTA)","Maritime Administration (MARAD)","Office of the Secretary of Transportation (OST)","Veterans Benefits Administration","Covered programs list for EPA","Covered programs list for NASA", "Covered programs list for NSF",
"Appalachian Regional Commission","Delta Regional Authority","Denali Commission"]

agency_keywords = ["U.S. Army Corps of Engineers (Army Corps)","Corporation for National and Community Service (AmeriCorps)","Department of Agriculture","Department of Commerce","Department of Energy","Department of Health and Human Services","Department of Homeland Security","Department of Housing and Urban Development","Department of the Interior","Department of Labor","Department of State","Department of Transportation","Department of Veterans Affairs","Environmental Protection Agency",
"National Aeronautics and Space Administration","National Science Foundation","Appalachian Regional Commission","Delta Regional Authority","Denali Commission"]


In [167]:


bulk = text.replace('\n', ' ')
 

def extract_numbered_lists_between_keywords(text, start_keyword, end_keyword):
    # Create a regular expression pattern to match numbered lists
    list_pattern = r'\d+\.\s[^\n]*'

    # Create a regular expression pattern to match the start and end keywords
    keyword_pattern = re.escape(start_keyword) + r'(.*?)' + re.escape(end_keyword)

    # Find all occurrences of text between the start and end keywords
    keyword_matches = re.findall(keyword_pattern, text, re.DOTALL)

    extracted_lists = []
    for match in keyword_matches:
        # Find all numbered lists within the matched text
        lists = re.findall(list_pattern, match)
        extracted_lists.extend(lists)

    return extracted_lists



keywords = agency_keywords
df = pd.DataFrame(columns=['agency', 'program'])

agency_program ={}

for index, key in enumerate(keywords): 

    if index < (len(keywords) - 4): #condition to not go passed the bounds of the array
        start_keyword = keywords[index]
        end_keyword = keywords[index+1]
        extracted_lists = extract_numbered_lists_between_keywords(bulk, start_keyword, end_keyword)
        
        for x in extracted_lists:
            pattern = r'\d+\.\s(.*?)(?=\d+\.\s|\Z)'
            matches = re.findall(pattern, x)
            for y in matches:
                df=df.append({'agency':start_keyword,'program':y},ignore_index=True)

    else:
        for i in range(-3,0): #hardcoding the last 3 small agencies since the formatting is a bit wonky
            special_pattern = r"(?i)\b" + re.escape(keywords[i]) + r"\b(?:.*\n)*?((?:\d+\..*(?:\n|$))+)"

            matches = re.findall(special_pattern, text)
            extracted_lists = [item.strip() for sublist in matches for item in sublist.split("\n") if item.strip()]
            for x in extracted_lists:
                df=df.append({'agency':keywords[i],'program':x},ignore_index=True)



In [163]:
result_df = df.drop_duplicates(subset=['program'], keep='first').reset_index(drop=True)
result_df['agency'][result_df['agency'] == 'U.S. Army Corps of Engineers (Army Corps)'] = 'Army Corps of Engineers' #shorten agency for comparison, there is likely a workaround on the comparison end but I would handle the issue here
result_df

Unnamed: 0,agency,program
0,Army Corps of Engineers,Aquatic Ecosystem Restoration – Construction*
1,Army Corps of Engineers,Aquatic Ecosystem Restoration – Investigations*
2,Army Corps of Engineers,Flood and Storm Damage Reduction Program – Con...
3,Army Corps of Engineers,Flood and Storm Damage Reduction Program – Inv...
4,Army Corps of Engineers,Continuing Authorities Program*
...,...,...
460,National Aeronautics and Space Administration,Applied Sciences: Community Action
461,National Aeronautics and Space Administration,Applied Sciences: Climate Resilience
462,Appalachian Regional Commission,1. Appalachian Regional Commission Grants*
463,Delta Regional Authority,1. Delta Regional Authority Grants*


In [164]:
result_df['program'] = result_df['program'].str.lstrip(digits)
result_df['program'] = result_df['program'].str.replace('\W+', ' ') #strip all special characters

result_df.to_csv('J40covered.csv',index=False)

  result_df['program'] = result_df['program'].str.replace('\W+', ' ') #strip all special characters


In [165]:
#THIS WORKS DON'T DELETE IT!!!!!!!
'''
def extract_numbered_lists_between_keywords(text, start_keyword, end_keyword):
    extracted_lists = []
    is_extracting = False
    current_list = []
    

    for line in text.split("\n"):
        # Check if the current line contains the start keyword
        if start_keyword.lower() in line.lower():
            is_extracting = True
            continue

        # Check if the current line contains the end keyword
        if end_keyword.lower() in line.lower():
            if current_list:
                extracted_lists.append(current_list)
                current_list = []
            is_extracting = False
            continue

        # If we are currently extracting, check if the line starts with a number
        if is_extracting and re.match(r'\d+\.\s+(.*)', line.strip()):
            agency_program[start_keyword] = line.strip()
            current_list.append(line.strip())
            
          
            
    return extracted_lists

keywords = agency_keywords
df = pd.DataFrame(columns=['agency', 'program'])

agency_program ={}

for index, key in enumerate(keywords): 

    if index < (len(keywords) - 1): #condition to not go passed the bounds of the array
        start_keyword = keywords[index]
        end_keyword = keywords[index+1]
        extracted=extract_numbered_lists_between_keywords(text, start_keyword, end_keyword)
      
        extracted_list = [j for i in extracted for j in i] #unpacking list
        #print(mylist)
        for x in extracted_list:
            df=df.append({'agency':start_keyword,'program':x},ignore_index=True)
    
      
    else:
        for i in range(-3,0): #hardcoding the last 3 small agencies since the formatting is a bit wonky
            pattern = r"(?i)\b" + re.escape(keywords[i]) + r"\b(?:.*\n)*?((?:\d+\..*(?:\n|$))+)"
            matches = re.findall(pattern, text)
            extracted_list = [item.strip() for sublist in matches for item in sublist.split("\n") if item.strip()]
            for x in extracted_list:
                df=df.append({'agency':keywords[i],'program':x},ignore_index=True)

'''
    

'\ndef extract_numbered_lists_between_keywords(text, start_keyword, end_keyword):\n    extracted_lists = []\n    is_extracting = False\n    current_list = []\n    \n\n    for line in text.split("\n"):\n        # Check if the current line contains the start keyword\n        if start_keyword.lower() in line.lower():\n            is_extracting = True\n            continue\n\n        # Check if the current line contains the end keyword\n        if end_keyword.lower() in line.lower():\n            if current_list:\n                extracted_lists.append(current_list)\n                current_list = []\n            is_extracting = False\n            continue\n\n        # If we are currently extracting, check if the line starts with a number\n        if is_extracting and re.match(r\'\\d+\\.\\s+(.*)\', line.strip()):\n            agency_program[start_keyword] = line.strip()\n            current_list.append(line.strip())\n            \n          \n            \n    return extracted_lists\n\nkey

#### To Do ####
- longer named/double lined programs getting cut off