# Analysis of submissions to the Request for Information (RFI) sent out by the Department of Commerce for best practices for disseminating AI-ready data
##### The RFI closed on July 16th

#### uncomment the cell below and install a package to read pdfs, and import the necessary packages

In [176]:
#!pip install pdftotext

In [180]:
import pdftotext
import os
import pandas as pd
import numpy as np
import re

#### Make a dataframe of the RFI submissions

In [181]:
# don't show full text of submission
pd.set_option('display.max_colwidth', 50)

RFIs=[]
directory = r'RFI_Submissions'
for filename in os.listdir(directory):
    if filename!= '.DS_Store':
        with open(os.path.join(directory, filename), 'rb') as f:
            filetext = ''
            pdf = pdftotext.PDF(f)
            for page in pdf:
                filetext+=page.lower()
        filetext= filetext.replace('\n', '')
        RFIs.append([filename,filetext])
RFIs = pd.DataFrame(RFIs).rename({0:'file', 1:'text'}, axis = 1)
RFIs

Unnamed: 0,file,text
0,019_Esri_RFI.pdf,"july 16, 2024esri and geospatial artificialint..."
1,030_IBM_RFI.pdf,international business machines (ibm) corporat...
2,103_FAS_RFI.pdf,"oliver wise,chief data officer, department of ..."
3,013_Data.World_RFI.pdf,"℅ capital factory701 brazos, ste 519austin, tx..."
4,009_MySidewalk_RFI.pdf,rfi reponsekeep it simplewe’ve learned our end...
5,014_Elsevier_RFI.pdf,ai and open government dataassets request for ...
6,100_Virginia_RFI.pdf,"roya pakzadfounder and director, taraazdata & ..."
7,038_Replica_RFI.pdf,"july 16, 2024dear mr. wise,as a long-time bene..."
8,008_Zois_American University.pdf,response to commerce’s rfi on improving data c...
9,020_XBRL_RFI.pdf,"july 16, 2024victoria houed, ouseau.s. departm..."


#### Make indexes match the file numbers for easier referencing

In [182]:
idxs = RFIs.file.map(lambda x: x[:3])
RFIs = RFIs.set_index(idxs)
RFIs

Unnamed: 0_level_0,file,text
file,Unnamed: 1_level_1,Unnamed: 2_level_1
19,019_Esri_RFI.pdf,"july 16, 2024esri and geospatial artificialint..."
30,030_IBM_RFI.pdf,international business machines (ibm) corporat...
103,103_FAS_RFI.pdf,"oliver wise,chief data officer, department of ..."
13,013_Data.World_RFI.pdf,"℅ capital factory701 brazos, ste 519austin, tx..."
9,009_MySidewalk_RFI.pdf,rfi reponsekeep it simplewe’ve learned our end...
14,014_Elsevier_RFI.pdf,ai and open government dataassets request for ...
100,100_Virginia_RFI.pdf,"roya pakzadfounder and director, taraazdata & ..."
38,038_Replica_RFI.pdf,"july 16, 2024dear mr. wise,as a long-time bene..."
8,008_Zois_American University.pdf,response to commerce’s rfi on improving data c...
20,020_XBRL_RFI.pdf,"july 16, 2024victoria houed, ouseau.s. departm..."


#### Find all submissions that mention recommendations collected from Whirlwind (RFI 011)
'recs' list is sorted by entries in the external 'Guidelines Appendix Tables' spreadsheet: High level vs techincal (Z to A), Category (A to Z), and Recommendation (A to Z)

In [175]:
#view full values
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)

#return index out of total text length for relative location in document
#make everything lower case to handle different capitalization choices
recs = [
    ## Technical
    # Data and Metadata
    ['apache atlas'],['croissant'],['csv'],['dcat '],['dcat-us'],['dublin core'],['fuzzy matching'],['hdf5'],
    ['iso 19115','iso19115'],['json'],[' owl'],['parquet'],['rdf'],['schema.org'],['sparql'],['w3c dcat'],
    ['xml'], 
    # Licensing
    ['cc-by'],['cc0'],['creative commons'],
    # Storage and Publishing
    ['api gateway'],['bulk download'],['ZIP','compressed data'], #0, work on to catch cases
    ['standardized url','standard url','consistent url','url consistency','url standard'],
    ['portal for ai-ready'], #work on this
    ['plugin', 'plug-in'], #work on this
    ['download'], ['graphql'], ['split', 'segment'],# work on this for split data downloads for large datasets
    ['openapi', 'open api'], ['synthetic data','preview data'],['restful'], # 8 for restful api vs 11 for restful
    ['robots.txt'],['sitemap'],['streaming api'],['breadcrumb'],['webhook'],
    # Validation and Quality
    ['apache data cleaner'],['blockchain'],['confidence interval','uncertainty measure','measure uncertainty'],
    ['ai fairness 360'],['quality metric','measure quality'],['runtime application self-protection',' rasp '],
    ['talend'],

    ## High level
    # Data and Metadata
    ['Unknown'],#work on
    ['data dictionary'],['readiness level'], #work on
    ['ontolog','controlled vocabular'],['open data format'],['quality metrics embedded in metadata'], #work on
    ['semantic web'],['standardized metadata','metadata standards'],['version control'],
    # Documentation
    ['historical'],#work on
    ['api document'],['provenance'],['bias'],#work on
    ['collection method'],['terms of use'],['tutorial','sample code'],['usage guideline'],
    # Feedback
    ['consortia'], #work on
    ['hackathon'],['community engage','forum'],['usage tracking'], #add more tracking
    # Licensing
    # Storage and Publishing
    ['many formats','multiple formats'],
    # Validation and Quality
    ['zero trust','never trust, always verify'],
    ['low-bandwidth','low bandwidth'],#work on low internet access
    ['edge case','stress test'],['devsecops'],['interoperab'],['linked data standard','linked data principle'],
    ['maig','manifesto for ai gov'],['language'], #work on
    ['discriminat'],['ords','open data rights statement'],['odrl','open digital rights language'],
    ['responsible ai  license'], #work on
    ['section 508'],['accessibility standard'],['wcag','web content accessibility guidelines'],
    ['benchmark data','benchmark'], #benchmark gives 11 more, investigate
]
#recs_files_df = pd.DataFrame(columns = ['recommendation', 'files']) #list, list, dictionary{file:index/length}
entries={}
for rec in recs:
    entries[str(rec)]=[]
    for synonym in rec:
        for file,text in zip(RFIs['file'],RFIs['text']):
            entry = [rec, file, re.findall(f'{synonym}', text)] #vs r?
            #print(entry)
            if len(entry[2])>0:
                entries[str(rec)]+= [file]
col1 = list(entries.keys())
col2 = list(entries.values())
frequent_recs_df = pd.DataFrame({'recommendation':col1, 'files':col2, 'files_count':map(lambda x: len(x),col2)})
frequent_recs_df

Unnamed: 0,recommendation,files,files_count
0,['apache atlas'],[011_WhirlwindTechnologies_RFI.pdf],1
1,['croissant'],"[102_MIT_RFI.pdf, 018_MLCommons_RFI.pdf, 104_Google_RFI.pdf]",3
2,['csv'],"[013_Data.World_RFI.pdf, 009_MySidewalk_RFI.pdf, 014_Elsevier_RFI.pdf, 008_Zois_American University.pdf, 020_XBRL_RFI.pdf, 040_Gretel_RFI.pdf, 017_SAS_RFI.pdf, 015_Element84_RFI.pdf, 033_PROTESORO_RFI.pdf, 023_Deloitte_RFI.pdf, 010_ScaleAI_RFI.pdf, 037_UnstructuredTech_RFI.pdf, 011_WhirlwindTechnologies_RFI.pdf, 034_IEEE_RFI.pdf, 036_Microsoft_RFI.pdf, 027_CSET_RFI.pdf, 028_Kitware_RFI.pdf]",17
3,['dcat '],"[008_Zois_American University.pdf, 034_IEEE_RFI.pdf, 021_Jess_UVirginia_RFI .pdf, 028_Kitware_RFI.pdf]",4
4,['dcat-us'],[011_WhirlwindTechnologies_RFI.pdf],1
5,['dublin core'],"[013_Data.World_RFI.pdf, 014_Elsevier_RFI.pdf, 011_WhirlwindTechnologies_RFI.pdf, 034_IEEE_RFI.pdf, 025_StardogUnion_RFI.pdf]",5
6,['fuzzy matching'],[011_WhirlwindTechnologies_RFI.pdf],1
7,['hdf5'],"[013_Data.World_RFI.pdf, 011_WhirlwindTechnologies_RFI.pdf, 028_Kitware_RFI.pdf]",3
8,"['iso 19115', 'iso19115']","[013_Data.World_RFI.pdf, 011_WhirlwindTechnologies_RFI.pdf]",2
9,['json'],"[013_Data.World_RFI.pdf, 014_Elsevier_RFI.pdf, 008_Zois_American University.pdf, 020_XBRL_RFI.pdf, 040_Gretel_RFI.pdf, 017_SAS_RFI.pdf, 015_Element84_RFI.pdf, 033_PROTESORO_RFI.pdf, 023_Deloitte_RFI.pdf, 010_ScaleAI_RFI.pdf, 037_UnstructuredTech_RFI.pdf, 011_WhirlwindTechnologies_RFI.pdf, 034_IEEE_RFI.pdf, 102_MIT_RFI.pdf, 036_Microsoft_RFI.pdf, 029_Leidos_RFI.pdf, 027_CSET_RFI.pdf, 028_Kitware_RFI.pdf]",18


#### Additional Analyses (not initially useful)

In [18]:
### tfidf analysis
# def tfidf_data(reviews_ser, review):
#     # Remove non-alphanumeric characters and split by whitespace
#     words = re.findall(r'\b\w+\b', review)
#     out = pd.DataFrame(pd.Series(words).value_counts(), columns=['count']) 
#     #out = pd.DataFrame(pd.Series(review.split()).value_counts())

#     out['indx'] = out.index
#     out['tf'] = out['count']/len(review.split())
#     out['idf'] = np.log(len(reviews_ser)/ (out['indx'].apply(lambda word: sum(reviews_ser.str.contains(fr'\b{re.escape(word)}\b', regex=True)))))
#     out['tfidf'] = out['tf']*out['idf']
#     return out.drop(columns = 'indx').sort_values(['tfidf', 'count'])

# for filename in os.listdir(directory):
#     if filename!= '.DS_Store' and filename!= 'combined.pdf':
#         with open(os.path.join(directory, filename), 'rb') as f:
#             rfi = ''
#             pdf = pdftotext.PDF(f)
#             for page in pdf:
#                 rfi+=page.lower()
#         print(filename)
#         display(tfidf_data(RFIs, rfi))
#     break


### count all word frequencies overall
# directory = r'RFI_Submissions'
# with open(os.path.join(directory, 'combined.pdf'), 'rb') as f:
#     combined = ''
#     pdf = pdftotext.PDF(f)
#     for page in pdf:
#         combined+=page.lower()
# words = re.findall(r'\b\w+\b', combined)
# all_counts = pd.DataFrame(pd.Series(words).value_counts(), columns=['count']) 
# all_counts