In [53]:
#import packages 
import io #for input and output operations
import requests #to get() the PDFs or url
from PyPDF2 import PdfReader #package for reading info from the PDFs
import pdfplumber #package for reading info from the PDFs
import re #regular expressions allows a user to search for strings
from itertools import chain # used to merge ranges
import pandas as pd
import numpy as np
from sklearn import preprocessing #for text cleaning
import matplotlib.pyplot as plt #for plotting
from sklearn.feature_extraction.text import TfidfVectorizer #for text representation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#model testing
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score


## Uploading & Clean-Up ##
- Upload BIL & IRA lists from whitehouse.gov
- Clean data to make it readable and transform to mimic AJC database

#### BIL data ####

In [81]:

url = f'https://www.whitehouse.gov/wp-content/uploads/2023/06/20230615-build-gov-csv.csv'  #BIL page: https://www.whitehouse.gov/build/guidebook/
BILraw = pd.read_csv(url)

BILraw.head()

Unnamed: 0,Program Name,Category,Agency Name,Bureau Name,Funding Amount,Funding Amount Notes,Period of Availability,Funding Mechanism,New Program (Yes/No),Recipients,Program Description,Eligible Uses,Next Program Milestone,sam.gov link,Statutory Location,Federal Cost Share Requirement
0,Regulatory Program,Resilience,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$160,000,000",,2022 - 2026,Direct Federal,No,Army Corps permitting activities.,Provides funds to administer the laws and regu...,Labor and related expenses for Army Corps staff.,To be determined,,"Division J, Title III",No cost share requirement. Applicants can elec...
1,Water-Related Environmental Infrastructure Ass...,Resilience,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$200,000,000",,Available until expended,Direct Federal,No,"In partnership with State, local, and Tribal g...",The program funds engineering and construction...,Authorized environmental infrastructure projects,To be determined,,"Division J, Title III","In general, unless otherwise specified in law,..."
2,Major Rehabilitation for Rivers and Harbors,Ports and Waterways,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$1,500,000,000","Including for $250 million for Small, Remote, ...",Available until expended,Direct Federal,No,"In partnership with port authorities, the Army...",Funds the construction of authorized coastal n...,Federal commercial navigation projects on coas...,To be determined,,"Division J, Title III","In general, unless otherwise specified in law,..."
3,Inland Waterways Projects,Ports and Waterways,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$2,500,000,000",,Available until expended,Direct Federal,No,The Army Corps will use this funding to constr...,Funds the replacement and expansion of inland ...,Federal commercial navigation projects on inla...,To be determined,,"Division J, Title III","In general, unless otherwise specified in law,..."
4,Continuing Authorities Program (Under Flood Co...,Resilience,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$465,000,000",,Available until expended,Direct Federal,No,"The Army Corps will use this funding to plan, ...","Funds eligible small Army Corps projects, incl...",A variety of water resources projects which mu...,To be determined,,"Division J, Title III","In general, unless otherwise specified in law:..."


In [82]:
BILclean = BILraw
BILclean.rename(columns={'Program Name':'program','Agency Name':'agency', 'Bureau Name':'bureau', 'Funding Amount':'funding_amt', 'Funding Amount Notes':'funding_amt_notes','Funding Mechanism':'funding_type', 'Recipients':'eligibility', 'Program Description':'description', 'Eligible Uses':'eligible_uses', 'Next Program Milestone':'next_milestone','sam.gov link':'link'}, inplace=True)
BILclean.head()

Unnamed: 0,program,Category,agency,bureau,funding_amt,funding_amt_notes,Period of Availability,funding_type,New Program (Yes/No),eligibility,description,eligible_uses,next_milestone,link,Statutory Location,Federal Cost Share Requirement
0,Regulatory Program,Resilience,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$160,000,000",,2022 - 2026,Direct Federal,No,Army Corps permitting activities.,Provides funds to administer the laws and regu...,Labor and related expenses for Army Corps staff.,To be determined,,"Division J, Title III",No cost share requirement. Applicants can elec...
1,Water-Related Environmental Infrastructure Ass...,Resilience,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$200,000,000",,Available until expended,Direct Federal,No,"In partnership with State, local, and Tribal g...",The program funds engineering and construction...,Authorized environmental infrastructure projects,To be determined,,"Division J, Title III","In general, unless otherwise specified in law,..."
2,Major Rehabilitation for Rivers and Harbors,Ports and Waterways,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$1,500,000,000","Including for $250 million for Small, Remote, ...",Available until expended,Direct Federal,No,"In partnership with port authorities, the Army...",Funds the construction of authorized coastal n...,Federal commercial navigation projects on coas...,To be determined,,"Division J, Title III","In general, unless otherwise specified in law,..."
3,Inland Waterways Projects,Ports and Waterways,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$2,500,000,000",,Available until expended,Direct Federal,No,The Army Corps will use this funding to constr...,Funds the replacement and expansion of inland ...,Federal commercial navigation projects on inla...,To be determined,,"Division J, Title III","In general, unless otherwise specified in law,..."
4,Continuing Authorities Program (Under Flood Co...,Resilience,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$465,000,000",,Available until expended,Direct Federal,No,"The Army Corps will use this funding to plan, ...","Funds eligible small Army Corps projects, incl...",A variety of water resources projects which mu...,To be determined,,"Division J, Title III","In general, unless otherwise specified in law:..."


In [83]:
BILclean.drop(columns=['Category', 'Statutory Location'], axis=1, inplace=True) #drop unnecessary info
dbColumns = {'category':0, 'category_label':1, 'agencyId':2, 'eligibilityId':3 ,'application':15 , 'pollution':17, 'technical':18}
for column, position in dbColumns.items(): 
    BILclean.insert(position, column, np.nan)

BILclean.head()

Unnamed: 0,category,category_label,agencyId,eligibilityId,program,agency,bureau,funding_amt,funding_amt_notes,Period of Availability,...,New Program (Yes/No),eligibility,description,eligible_uses,application,next_milestone,pollution,technical,link,Federal Cost Share Requirement
0,,,,,Regulatory Program,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$160,000,000",,2022 - 2026,...,No,Army Corps permitting activities.,Provides funds to administer the laws and regu...,Labor and related expenses for Army Corps staff.,,To be determined,,,,No cost share requirement. Applicants can elec...
1,,,,,Water-Related Environmental Infrastructure Ass...,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$200,000,000",,Available until expended,...,No,"In partnership with State, local, and Tribal g...",The program funds engineering and construction...,Authorized environmental infrastructure projects,,To be determined,,,,"In general, unless otherwise specified in law,..."
2,,,,,Major Rehabilitation for Rivers and Harbors,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$1,500,000,000","Including for $250 million for Small, Remote, ...",Available until expended,...,No,"In partnership with port authorities, the Army...",Funds the construction of authorized coastal n...,Federal commercial navigation projects on coas...,,To be determined,,,,"In general, unless otherwise specified in law,..."
3,,,,,Inland Waterways Projects,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$2,500,000,000",,Available until expended,...,No,The Army Corps will use this funding to constr...,Funds the replacement and expansion of inland ...,Federal commercial navigation projects on inla...,,To be determined,,,,"In general, unless otherwise specified in law,..."
4,,,,,Continuing Authorities Program (Under Flood Co...,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$465,000,000",,Available until expended,...,No,"The Army Corps will use this funding to plan, ...","Funds eligible small Army Corps projects, incl...",A variety of water resources projects which mu...,,To be determined,,,,"In general, unless otherwise specified in law:..."


In [101]:
#agencyId column update!
#-----------------------

#IRAclean['agency'].unique() #get unique values
abbr = {'Department of Transportation': 'dot', 'Department of Defense – Army Corps of Engineers' :'dod-ace', 'General Services Administration':'gsa', 'Department of Homeland Security':'dhs', 'Environmental Protection Agency':'epa', 'Department of Energy':'doe',
        'Department of the Interior':'doi', 'Department of Agriculture':'doa', 'Department of Health and Human Services':'dhhs', 'Department of Commerce':'doc', 'Federal Communications Commission':'fcc', 'Denali Commission':'denali', 'Appalachian Regional Commission':'arc',
       'Delta Regional Authority':'dra', 'Northern Border Regional Commission':'nbrc','Southeast Crescent Regional Commission':'scrc', 'Council on Environmental Quality':'ceq', 'Department of Housing and Urban Development':'hud','U.S. Postal Service':'usps'} #create dictionary of agency abbreviates (how to handle subbranch of dod?)

BILclean['agencyId'] = BILclean['agency'].map(abbr) #map abbreviations to agency names

BILclean.head()

['Council on Environmental Quality' 'Department of Agriculture'
 'Department of Commerce' 'Department of Energy'
 'Department of Homeland Security'
 'Department of Housing and Urban Development'
 'Department of the Interior' 'Department of Transportation'
 'Environmental Protection Agency' 'General Services Administration'
 'U.S. Postal Service']


Unnamed: 0,category,category_label,agencyId,eligibilityId,program,agency,bureau,funding_amt,funding_amt_notes,Period of Availability,...,New Program (Yes/No),eligibility,description,eligible_uses,application,next_milestone,pollution,technical,link,Federal Cost Share Requirement
0,pollution,Remediation and reduction of legacy pollution,dod-ace,,Regulatory Program,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$160,000,000",,2022 - 2026,...,No,Army Corps permitting activities.,Provides funds to administer the laws and regu...,Labor and related expenses for Army Corps staff.,,To be determined,,,,No cost share requirement. Applicants can elec...
1,water,Critical clean water and wastewater infrastruc...,dod-ace,".stateGov, .tribal, .localGov",Water-Related Environmental Infrastructure Ass...,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$200,000,000",,Available until expended,...,No,"In partnership with State, local, and Tribal g...",The program funds engineering and construction...,Authorized environmental infrastructure projects,,To be determined,,,,"In general, unless otherwise specified in law,..."
2,water,Critical clean water and wastewater infrastruc...,dod-ace,,Major Rehabilitation for Rivers and Harbors,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$1,500,000,000","Including for $250 million for Small, Remote, ...",Available until expended,...,No,"In partnership with port authorities, the Army...",Funds the construction of authorized coastal n...,Federal commercial navigation projects on coas...,,To be determined,,,,"In general, unless otherwise specified in law,..."
3,water,Critical clean water and wastewater infrastruc...,dod-ace,.indiv,Inland Waterways Projects,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$2,500,000,000",,Available until expended,...,No,The Army Corps will use this funding to constr...,Funds the replacement and expansion of inland ...,Federal commercial navigation projects on inla...,,To be determined,,,,"In general, unless otherwise specified in law,..."
4,water,Critical clean water and wastewater infrastruc...,dod-ace,,Continuing Authorities Program (Under Flood Co...,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$465,000,000",,Available until expended,...,No,"The Army Corps will use this funding to plan, ...","Funds eligible small Army Corps projects, incl...",A variety of water resources projects which mu...,,To be determined,,,,"In general, unless otherwise specified in law:..."


In [85]:
#eligibilityId column update
#---------------------------
#eligibility ID dictionary 
eligID = {'tribes':'.tribal', 'tribal':'.tribal', 'tribe':'.tribal',
          'states':'.stateGov', 'state':'.stateGov',
          'nonprofits':'.ngo', 'nongovernmental':'.ngo', 'non-profit':'.ngo',
          'local':'.localGov','town': '.localGov', 'townships': '.localGov','cities': '.localGov','counties': '.localGov','county': '.localGov',
          'community':'.communitygroup',
          'university':'.ed', 'education':'.ed', 'school':'.ed', 'academia':'.ed', 'higher-ed':'.ed', 'college':'.ed', 'colleges': '.ed', 'universities':'.ed',
          'agricultural':'.agricultural',
          'owner':'.indiv', 'corporations':'.indiv', 'companies':'.indiv', 'business':'.indiv', 'businesses':'.indiv', 'firm':'.indiv', 'commercial':'.indiv'} #what's the difference between stategov, localgov, and statescountiescities, how to code .statesCountiesCities and .indiv 

def addEligibilityId(pddf,dict):
    eligKeys = list(dict.keys()) # lists out all the keys of the key value pairs
    pddf['eligibility'] = pddf['eligibility'].fillna('N/A') #fill nan with N/A 


    for index, row in pddf.iterrows(): #goes through each row of the dataframe
        pattern = r'\b({})\b'.format('|'.join(map(re.escape, eligKeys)))
        # unique matches
        matches = set(re.findall(pattern, row['eligibility'].lower(), re.IGNORECASE))
        values = {dict[x] for x in matches}
        pddf.loc[index,"eligibilityId"] = ', '.join(values)
        
    return pddf




In [86]:

addEligibilityId(BILclean,eligID)
BILclean.head()


Unnamed: 0,category,category_label,agencyId,eligibilityId,program,agency,bureau,funding_amt,funding_amt_notes,Period of Availability,...,New Program (Yes/No),eligibility,description,eligible_uses,application,next_milestone,pollution,technical,link,Federal Cost Share Requirement
0,,,dod-ace,,Regulatory Program,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$160,000,000",,2022 - 2026,...,No,Army Corps permitting activities.,Provides funds to administer the laws and regu...,Labor and related expenses for Army Corps staff.,,To be determined,,,,No cost share requirement. Applicants can elec...
1,,,dod-ace,".stateGov, .tribal, .localGov",Water-Related Environmental Infrastructure Ass...,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$200,000,000",,Available until expended,...,No,"In partnership with State, local, and Tribal g...",The program funds engineering and construction...,Authorized environmental infrastructure projects,,To be determined,,,,"In general, unless otherwise specified in law,..."
2,,,dod-ace,,Major Rehabilitation for Rivers and Harbors,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$1,500,000,000","Including for $250 million for Small, Remote, ...",Available until expended,...,No,"In partnership with port authorities, the Army...",Funds the construction of authorized coastal n...,Federal commercial navigation projects on coas...,,To be determined,,,,"In general, unless otherwise specified in law,..."
3,,,dod-ace,.indiv,Inland Waterways Projects,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$2,500,000,000",,Available until expended,...,No,The Army Corps will use this funding to constr...,Funds the replacement and expansion of inland ...,Federal commercial navigation projects on inla...,,To be determined,,,,"In general, unless otherwise specified in law,..."
4,,,dod-ace,,Continuing Authorities Program (Under Flood Co...,Department of Defense – Army Corps of Engineers,Corps of Engineers – Civil Works,"$465,000,000",,Available until expended,...,No,"The Army Corps will use this funding to plan, ...","Funds eligible small Army Corps projects, incl...",A variety of water resources projects which mu...,,To be determined,,,,"In general, unless otherwise specified in law:..."


#### IRA data ####

In [93]:
url = f'https://www.whitehouse.gov/wp-content/uploads/2022/12/IRA-Guidebook-Funding-Programs.csv' #IRA page: https://www.whitehouse.gov/cleanenergy/inflation-reduction-act-guidebook/
columnsIncluded = ['Agency','Bureau','Program','Program Description','Funding Amount','Funding Mechanism','Period of Availability','Eligible Recipients','Tribal Eligibility?','Eligible Uses','New or Existing Program', 'Cost Share Requirement','SAM.gov Assistance Listing'] #columns tin include from raw data
IRAraw = pd.read_csv(url, usecols=columnsIncluded) #columns excluded include: Statutory Location, Formula Funding, Recent Announcements

IRAraw = IRAraw.head(-2)
IRAraw.head()

Unnamed: 0,Agency,Bureau,Program,Program Description,Funding Amount,Funding Mechanism,Period of Availability,Eligible Recipients,Tribal Eligibility?,Eligible Uses,New or Existing Program,Cost Share Requirement,SAM.gov Assistance Listing
0,Council on Environmental Quality,,Environmental and Climate Data Improvement,"To improve the quality, availability, and use ...","$32,500,000",Direct Federal Spending,"To remain available until September 30, 2026.",Direct Federal Spending,No,See program description.,Existing,,
1,Department of Agriculture,Rural Utilities Service,Powering Affordable Clean Energy (PACE),To provide partially forgivable loans to renew...,"$1,000,000,000","Loans, Loan Forgiveness","To remain available until September 30, 2031.",Eligible entities that generate electricity fo...,Yes,"Applicants can use PACE funds to finance wind,...",Existing,"Up to 50%, but may be waived at the discretion...",https://sam.gov/fal/779b429f23b04071acc7dcfd9e...
2,Department of Agriculture,Rural Business-Cooperative Service,Biofuel Infrastructure and Agriculture Product...,To provide grants through the Higher Blend Inf...,"$500,000,000",Grants,"To remain available until September 30, 2031.",Transportation fueling facilities including fu...,Yes,"(1) Fueling stations, convenience stores, hype...",Existing,25%,https://sam.gov/fal/23ed3d84a46f4e688c6f4223fc...
3,Department of Agriculture,Rural Utilities Service,Empowering Rural America (New ERA),To fund the construction of electric distribut...,"$9,700,000,000","Loans, modifications of loans, the cost of loa...","To remain available until September 30, 2031.","Rural electric cooperatives, including e xisti...",Yes,To make energy efficiency improvements to elig...,New,"Varies, depending on product",https://sam.gov/fal/779b429f23b04071acc7dcfd9e...
4,Department of Agriculture,Farm Service Agency,Assistance for Distressed Borrowers,To expedite assistance to distressed borrowers...,"$3,100,000,000",Direct Federal Spending,"To remain available until September 30, 2031.",Direct Federal Spending,Yes. Tribal governments as well as individual ...,Direct federal spending,New,,


In [94]:
IRAclean = IRAraw
IRAclean.rename(columns={'Program':'program','Agency':'agency', 'Bureau':'bureau', 'Funding Amount':'funding_amt', 'Funding Amount Notes':'funding_amt_notes','Funding Mechanism':'funding_type', 'Eligible Recipients':'eligibility', 'Program Description':'description', 'Eligible Uses':'eligible_uses', 'Next Program Milestone':'next_milestone','SAM.gov Assistance Listing':'link'}, inplace=True)
IRAclean.head()

Unnamed: 0,agency,bureau,program,description,funding_amt,funding_type,Period of Availability,eligibility,Tribal Eligibility?,eligible_uses,New or Existing Program,Cost Share Requirement,link
0,Council on Environmental Quality,,Environmental and Climate Data Improvement,"To improve the quality, availability, and use ...","$32,500,000",Direct Federal Spending,"To remain available until September 30, 2026.",Direct Federal Spending,No,See program description.,Existing,,
1,Department of Agriculture,Rural Utilities Service,Powering Affordable Clean Energy (PACE),To provide partially forgivable loans to renew...,"$1,000,000,000","Loans, Loan Forgiveness","To remain available until September 30, 2031.",Eligible entities that generate electricity fo...,Yes,"Applicants can use PACE funds to finance wind,...",Existing,"Up to 50%, but may be waived at the discretion...",https://sam.gov/fal/779b429f23b04071acc7dcfd9e...
2,Department of Agriculture,Rural Business-Cooperative Service,Biofuel Infrastructure and Agriculture Product...,To provide grants through the Higher Blend Inf...,"$500,000,000",Grants,"To remain available until September 30, 2031.",Transportation fueling facilities including fu...,Yes,"(1) Fueling stations, convenience stores, hype...",Existing,25%,https://sam.gov/fal/23ed3d84a46f4e688c6f4223fc...
3,Department of Agriculture,Rural Utilities Service,Empowering Rural America (New ERA),To fund the construction of electric distribut...,"$9,700,000,000","Loans, modifications of loans, the cost of loa...","To remain available until September 30, 2031.","Rural electric cooperatives, including e xisti...",Yes,To make energy efficiency improvements to elig...,New,"Varies, depending on product",https://sam.gov/fal/779b429f23b04071acc7dcfd9e...
4,Department of Agriculture,Farm Service Agency,Assistance for Distressed Borrowers,To expedite assistance to distressed borrowers...,"$3,100,000,000",Direct Federal Spending,"To remain available until September 30, 2031.",Direct Federal Spending,Yes. Tribal governments as well as individual ...,Direct federal spending,New,,


In [95]:
dbColumns = {'category':0, 'category_label':1, 'agencyId':2, 'eligibilityId':3 ,'application':15 , 'pollution':17, 'technical':18}
for column, position in dbColumns.items(): 
    IRAclean.insert(position, column, np.nan)

IRAclean.head()

Unnamed: 0,category,category_label,agencyId,eligibilityId,agency,bureau,program,description,funding_amt,funding_type,Period of Availability,eligibility,Tribal Eligibility?,eligible_uses,New or Existing Program,application,Cost Share Requirement,pollution,technical,link
0,,,,,Council on Environmental Quality,,Environmental and Climate Data Improvement,"To improve the quality, availability, and use ...","$32,500,000",Direct Federal Spending,"To remain available until September 30, 2026.",Direct Federal Spending,No,See program description.,Existing,,,,,
1,,,,,Department of Agriculture,Rural Utilities Service,Powering Affordable Clean Energy (PACE),To provide partially forgivable loans to renew...,"$1,000,000,000","Loans, Loan Forgiveness","To remain available until September 30, 2031.",Eligible entities that generate electricity fo...,Yes,"Applicants can use PACE funds to finance wind,...",Existing,,"Up to 50%, but may be waived at the discretion...",,,https://sam.gov/fal/779b429f23b04071acc7dcfd9e...
2,,,,,Department of Agriculture,Rural Business-Cooperative Service,Biofuel Infrastructure and Agriculture Product...,To provide grants through the Higher Blend Inf...,"$500,000,000",Grants,"To remain available until September 30, 2031.",Transportation fueling facilities including fu...,Yes,"(1) Fueling stations, convenience stores, hype...",Existing,,25%,,,https://sam.gov/fal/23ed3d84a46f4e688c6f4223fc...
3,,,,,Department of Agriculture,Rural Utilities Service,Empowering Rural America (New ERA),To fund the construction of electric distribut...,"$9,700,000,000","Loans, modifications of loans, the cost of loa...","To remain available until September 30, 2031.","Rural electric cooperatives, including e xisti...",Yes,To make energy efficiency improvements to elig...,New,,"Varies, depending on product",,,https://sam.gov/fal/779b429f23b04071acc7dcfd9e...
4,,,,,Department of Agriculture,Farm Service Agency,Assistance for Distressed Borrowers,To expedite assistance to distressed borrowers...,"$3,100,000,000",Direct Federal Spending,"To remain available until September 30, 2031.",Direct Federal Spending,Yes. Tribal governments as well as individual ...,Direct federal spending,New,,,,,


In [102]:
IRAclean['agencyId'] = IRAclean['agency'].map(abbr) #map abbreviations to agency names
addEligibilityId(IRAclean,eligID)
IRAclean.head()


Unnamed: 0,category,category_label,agencyId,eligibilityId,agency,bureau,program,description,funding_amt,funding_type,Period of Availability,eligibility,Tribal Eligibility?,eligible_uses,New or Existing Program,application,Cost Share Requirement,pollution,technical,link
0,,,ceq,,Council on Environmental Quality,,Environmental and Climate Data Improvement,"To improve the quality, availability, and use ...","$32,500,000",Direct Federal Spending,"To remain available until September 30, 2026.",Direct Federal Spending,No,See program description.,Existing,,,,,
1,,,doa,".stateGov, .ngo, .indiv, .tribal",Department of Agriculture,Rural Utilities Service,Powering Affordable Clean Energy (PACE),To provide partially forgivable loans to renew...,"$1,000,000,000","Loans, Loan Forgiveness","To remain available until September 30, 2031.",Eligible entities that generate electricity fo...,Yes,"Applicants can use PACE funds to finance wind,...",Existing,,"Up to 50%, but may be waived at the discretion...",,,https://sam.gov/fal/779b429f23b04071acc7dcfd9e...
2,,,doa,,Department of Agriculture,Rural Business-Cooperative Service,Biofuel Infrastructure and Agriculture Product...,To provide grants through the Higher Blend Inf...,"$500,000,000",Grants,"To remain available until September 30, 2031.",Transportation fueling facilities including fu...,Yes,"(1) Fueling stations, convenience stores, hype...",Existing,,25%,,,https://sam.gov/fal/23ed3d84a46f4e688c6f4223fc...
3,,,doa,,Department of Agriculture,Rural Utilities Service,Empowering Rural America (New ERA),To fund the construction of electric distribut...,"$9,700,000,000","Loans, modifications of loans, the cost of loa...","To remain available until September 30, 2031.","Rural electric cooperatives, including e xisti...",Yes,To make energy efficiency improvements to elig...,New,,"Varies, depending on product",,,https://sam.gov/fal/779b429f23b04071acc7dcfd9e...
4,,,doa,,Department of Agriculture,Farm Service Agency,Assistance for Distressed Borrowers,To expedite assistance to distressed borrowers...,"$3,100,000,000",Direct Federal Spending,"To remain available until September 30, 2031.",Direct Federal Spending,Yes. Tribal governments as well as individual ...,Direct federal spending,New,,,,,


#### NLP Multiclass Text Classificaiton ####

In [89]:
#NLP Topic Classfication - automatically analyze text and then assign a set of predefined tags or categories based on its context
# Going to start with importing example classification data from 'A Just Climate' db and preprocessing/cleaning original url: https://docs.google.com/spreadsheets/d/1rUi4ZM8Zvo_hwnh8tSbH1QgurEWUr-u73gtXTINBs7I/edit#gid=781618991
SHEET_ID = '1rUi4ZM8Zvo_hwnh8tSbH1QgurEWUr-u73gtXTINBs7I'
SHEET_GID = '781618991'

url = f'https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv&gid={SHEET_GID}' #SHEET_NAME did not work
ajcRaw = pd.read_csv(url)

ajcRaw.head()

Unnamed: 0,category,category_label,agencyId,eligibilityId,program,agency,bureau,funding_amt,funding_amt_notes,funding_type,funding_bill,eligibility,description,eligible_uses,next_milestone,application,link,pollution,technical
0,workforce,Training and workforce development,arc,".statesCountiesCities, .ngo",Appalachian Area Development: Allocations to A...,Appalachian Regional Commission,Appalachian Regional Commission,"$100,000,000",,Grants and/or cooperative agreements,0.1,"States, local governments, and nonprofits both...",The program provides Area Base Development Pro...,Projects to support the Appalachian Regional C...,The notice of funding opportunity was released...,Yes,https://www.arc.gov/state_partner_role/state-p...,No,No
1,workforce,Training and workforce development,arc,".statesCountiesCities, .ngo",Appalachian Area Development: Community Capaci...,Appalachian Regional Commission,Appalachian Regional Commission,"$10,000,000",,Grants and/or cooperative agreements,0.01,"States, local governments, and nonprofits both...",The program builds capacity throughout Appalac...,Projects to support the Appalachian Regional C...,The notice of funding opportunity was released...,Yes,https://energycommunities.gov/funding-opportun...,No,Yes
2,workforce,Training and workforce development,arc,".statesCountiesCities, .ngo",Appalachian Area Development: Regional Multist...,Appalachian Regional Commission,Appalachian Regional Commission,"$80,000,000",,Grants and/or cooperative agreements,0.08,"States, local governments, and nonprofits both...",The program provides support for multistate re...,Projects to support the Appalachian Regional C...,The notice of funding opportunity was released...,Yes,https://energycommunities.gov/funding-opportun...,No,No
3,workforce,Training and workforce development,denali,".statesCountiesCities, .tribal, .ed",Denali Commission,Denali Commission,Denali Commission,"$67,750,000",,Grants,0.068,"States, counties, cities, townships, special d...",TBD,"To be determined, but may include job training...",The notice of funding opportunity was released...,Yes,https://www.denali.gov/grants/,No,Yes
4,transportation,Clean transportation,doe,,Vehicles Technologies Office: Clean Cities,U.S. Department of Energy,Office of Energy Efficiency and Renewable Energy,,,,Subject to yearly appropriations,Depends on specific funding opportunity,The program facilitates national coordination ...,Projects to implement alternative fuels and en...,Concept papers were due by 5:00 p.m. ET on Aug...,Yes,https://cleancities.energy.gov/funding-opportu...,No,No


In [91]:
'''
#Preprocessing & Cleaning 

ajcDf = ajcRaw[['category', 'description']] #only choose necessary columns category and description
ajcDf['category'] = ajcDf['category'].str.strip()
ajcDf = ajcDf[(ajcDf['description'].notnull()) & (ajcDf['description'] != 'TBD') & (ajcDf['category'].notna())] #only include rows that have a value in the description column and don't have TBD as a description

ajcDf = ajcDf[(ajcDf['category'] != 'transportation workforce')] #EXCLUDE FOR NOW BUT FIGURE OUT HOW TO SEPARATE LATER

le = preprocessing.LabelEncoder() #create label encoder instance 
#MAY BE NECESSARY TO USE ONE HOT ENCODER INSTEAD because as it stands these labels are ranked not categorical so 'the average of climate and workforce could be transportation' but try both to see!

ajcDf['label'] = le.fit_transform(ajcDf['category']) #use fit_transform to label and create new column with label --- could also use onehotencoder but since the result is binary(0&1) but eliminates ordinality 

catToLab = dict(zip(sorted(set(ajcDf['category'])), sorted(set(ajcDf['label'])))) #create a dictionary of category keys and corresponding label values

#Text Processing - make text readable to algorithm
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(ajcDf.description).toarray()
labels = ajcDf.label
features.shape

X_train, X_test, y_train, y_test = train_test_split(ajcDf['description'], ajcDf['category'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

#apply model to new data
X_train, X_test, y_train, y_test = train_test_split(ajcDf['description'], ajcDf['category'], test_size=0.33, random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
model = LogisticRegression(random_state=0).fit(X_train_tfidf, y_train)
# Convert a collection of text documents to a vector of term/token counts. 
cntVectData = count_vect.transform(BILclean['description'])

#RUN Prediction
BILclean['category'] = model.predict(cntVectData)


#update category_label column

webLabel = {'climate':'Climate Change', 'energy':'Clean Energy and Energy Efficiency','housing':'Affordable and sustainable housing','pollution':'Remediation and reduction of legacy pollution','transportation':'Clean Transportation','workforce':'Training and workforce development','water':'Critical clean water and wastewater infrastructure'} #create dictionary of agency abbreviates (how to handle subbranch of dod?)
BILclean['category_label'] = BILclean['category'].map(webLabel) #map abbreviations to agency names
BILclean

'''

"\n#Preprocessing & Cleaning \n\najcDf = ajcRaw[['category', 'description']] #only choose necessary columns category and description\najcDf['category'] = ajcDf['category'].str.strip()\najcDf = ajcDf[(ajcDf['description'].notnull()) & (ajcDf['description'] != 'TBD') & (ajcDf['category'].notna())] #only include rows that have a value in the description column and don't have TBD as a description\n\najcDf = ajcDf[(ajcDf['category'] != 'transportation workforce')] #EXCLUDE FOR NOW BUT FIGURE OUT HOW TO SEPARATE LATER\n\nle = preprocessing.LabelEncoder() #create label encoder instance \n#MAY BE NECESSARY TO USE ONE HOT ENCODER INSTEAD because as it stands these labels are ranked not categorical so 'the average of climate and workforce could be transportation' but try both to see!\n\najcDf['label'] = le.fit_transform(ajcDf['category']) #use fit_transform to label and create new column with label --- could also use onehotencoder but since the result is binary(0&1) but eliminates ordinality \n\

## To-Do & Updates ##
- Column positions
- One to many dictionary for eligibility id
- what to do about links that dont have a sam.gov link?
- Might be better to add IRA data before text classification?
- can instantiate dataframe order upon read-in as well using read_csv
- Ask if Recent Announcements in IRA data is useful?
- What to do with columns that don't overlap between the two? -maybe put into separate dataframe for now so it doesn't upset replicability
- Only include Justice 40 covered programs or just add a column for distinction?
- TODO: join two data sets, text classification, pollution & technical columns, justice 40 covered programs