# Analysis of submissions to the Request for Information (RFI) sent out by the Department of Commerce for best practices for disseminating AI-ready data
##### The RFI closed on July 16th

#### Uncomment the cell below and install a package to read pdfs, and import the necessary packages

In [None]:
#!pip install PyMuPDF

In [1]:
import fitz
import os
import pandas as pd
import numpy as np
import re

#### Make a dataframe of the RFI submissions

In [2]:
# truncate submission text
pd.set_option('display.max_colwidth', 50)

RFIs = []
directory = r'C:\Users\mgaddi\OneDrive - U.S. Department of Commerce\Desktop\RFI Submissions' # the folder where the pdf submissions are stored locally
for filename in os.listdir(directory):
    if filename != '.DS_Store':
        with open(os.path.join(directory, filename), 'rb') as f:
            filetext = ''
            doc = fitz.open(f)
            # PyMuPDF reads pdfs one page at a time. Concatenate all the pages of a file together
            for page in doc:
                filetext += page.get_text().lower() # save in lowercase to handle different captitalization styles when searching later
                
        filetext = filetext.replace('\n', ' ')
        RFIs.append([filename,filetext])
RFIs = pd.DataFrame(RFIs).rename({0:'file', 1:'text'}, axis = 1)
RFIs

Unnamed: 0,file,text
0,005_OmniTrustAI_RFI.pdf,department of commerce comments for rfi submit...
1,006_Narendra_RFI.docx,"public submission as of: 7/9/24, 1:04 pm recei..."
2,008_Zois_American University.pdf,response to commerce’s rfi on improving data c...
3,009_MySidewalk_RFI.pdf,rfi reponse keep it simple we’ve learned our e...
4,010_ScaleAI_RFI.pdf,scale ai response to the departme...
5,011_WhirlwindTechnologies_RFI.pdf,...
6,012_BenchmarkLabs_RFI.pdf,honorable gina raimondo secretary u.s. departm...
7,013_Data.World_RFI.pdf,"℅capital factory 701 brazos, ste 519 austin, t..."
8,014_Elsevier_RFI.pdf,ai and open government data...
9,015_Element84_RFI.pdf,ai and open government data assets rfi respons...


In [3]:
# make indexes match the file numbers for easier referencing
idxs = RFIs.file.map(lambda x: x[:3])
RFIs = RFIs.set_index(idxs)
RFIs

Unnamed: 0_level_0,file,text
file,Unnamed: 1_level_1,Unnamed: 2_level_1
5,005_OmniTrustAI_RFI.pdf,department of commerce comments for rfi submit...
6,006_Narendra_RFI.docx,"public submission as of: 7/9/24, 1:04 pm recei..."
8,008_Zois_American University.pdf,response to commerce’s rfi on improving data c...
9,009_MySidewalk_RFI.pdf,rfi reponse keep it simple we’ve learned our e...
10,010_ScaleAI_RFI.pdf,scale ai response to the departme...
11,011_WhirlwindTechnologies_RFI.pdf,...
12,012_BenchmarkLabs_RFI.pdf,honorable gina raimondo secretary u.s. departm...
13,013_Data.World_RFI.pdf,"℅capital factory 701 brazos, ste 519 austin, t..."
14,014_Elsevier_RFI.pdf,ai and open government data...
15,015_Element84_RFI.pdf,ai and open government data assets rfi respons...


#### Find all submissions that mention common recommendations
The 'recs' list of recommendations is based on entries in the external 'Guidelines Appendix Tables' spreadsheet: https://docs.google.com/spreadsheets/d/1AP5s1GENEfR4U2v1_NmAQBcoDOIVADaQxxUD_zfmv94/edit?usp=sharing

In [4]:
#view full values
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)

# Maybe implement later: return index out of total text length for relative location in document
recs = [
    ### Technical
    ## Data and Metadata
    ['apache atlas'],['croissant'],['csv'],['dcat '],['dcat-us'],['dublin core'],['hdf5'],
    ['iso 19115','iso19115', '19115'],['json'],[' owl'],['parquet'],['rdf'],['schema.org'],['sparql'],
    ['xml'], ['zip '],
    ## Licensing
    ['apache license', 'apache 2.0'],#'apache' could catch more cases but would include 'apache parquet' which is different
    ['cc-by','cc by', 'ccby'],['cc0', 'cc 0', 'cc-0'],['creative commons'],
    ['mit license', ' mit '], #add other open source licenses?
    ['responsible ai  license'], #RAIL acroym would catch too many false positives
    ## Storage and Publishing
    ['api gateway'],['breadcrumb'],['graphql'],['odrl','open digital rights language'],['openapi'],['restful','rest api'],
    ['robots.txt'],['sitemap'],['streaming api'],['wcag','web content accessibility guidelines'],['webhook'],
    ## Validation and Quality
    ['apache data cleaner'],['ai fairness 360'],['maig','manifesto for ai gov'],['runtime application self-protection',' rasp '],
    ['talend'],

    ### High level  (not alphabetized/reorganized yet)
    ## Data and Metadata
    ['unknown'],#work on
    ['data dictionary'],['readiness level'], #work on
    ['ontolog','controlled vocabular'],['open data format'],['quality metrics embedded in metadata'], #work on
    ['semantic web'],['standardized metadata','metadata standards'],['version control', 'historical version','version history'],
    ## Documentation
    ['api document'],['provenance'],['bias'],#work on
    ['collection method'],['terms of use','usage guideline'],['tutorial','sample code'],
    ## Feedback
    ['consortia'], #work on
    ['hackathon'],['community engage','forum'],['usage tracking'], #add more tracking synonyms
    ## Licensing
    ## Storage and Publishing
    ['many formats','multiple formats'], ['ords','open data rights statement'],['language'], #work on
    ['bulk download'],['standardized url','standard url','consistent url','url consistency','url standard'],
    ['low-bandwidth','low bandwidth'],#work on low internet access
    ['synthetic data'],['data preview'],['portal for ai-ready'], #work on this
    ['plugin', 'plug-in'], #work on this
    ## Validation and Quality
    ['confidence interval','uncertainty measure','measure uncertainty'],['edge case','stress test'],['interoperab'],
    ['linked data standard','linked data principle'],['zero trust','never trust, always verify'],
    ['quality metric','measure quality'],
    
    ## Out of Guidelines Scope
    ['blockchain'],['discriminat'],['ords','open data rights statement'],['odrl','open digital rights language'],
    ['section 508'],['accessibility standard'],
    ['benchmark data','benchmark'], #benchmark gives 11 more, investigate
    ['devsecops'],
]

entries={}
for rec in recs:
    entries[str(rec)]=[]
    for synonym in rec:
        for file,text in zip(RFIs['file'],RFIs['text']):
            entry = [rec, file, re.findall(f'{synonym}', text)]
            if len(entry[2])>0: # if at least one instance of the synonym was found in the file, save it to the dictionary
                if file not in entries[str(rec)]:
                    entries[str(rec)]+= [file]

rec_column = list(entries.keys())
file_column = list(entries.values())
frequent_recs_df = pd.DataFrame({'recommendation':rec_column, 'files':file_column, 
                                 'files_count':map(lambda x: len(x), file_column)})
frequent_recs_df

Unnamed: 0,recommendation,files,files_count
0,['apache atlas'],[011_WhirlwindTechnologies_RFI.pdf],1
1,['croissant'],"[018_MLCommons_RFI.pdf, 102_MIT_RFI.pdf, 104_Google_RFI.pdf]",3
2,['csv'],"[008_Zois_American University.pdf, 009_MySidewalk_RFI.pdf, 010_ScaleAI_RFI.pdf, 011_WhirlwindTechnologies_RFI.pdf, 013_Data.World_RFI.pdf, 014_Elsevier_RFI.pdf, 015_Element84_RFI.pdf, 017_SAS_RFI.pdf, 020_XBRL_RFI.pdf, 023_Deloitte_RFI.docx, 027_CSET_RFI.pdf, 028_Kitware_RFI.pdf, 033_PROTESORO_RFI.pdf, 034_IEEE_RFI.pdf, 036_Microsoft_RFI.pdf, 037_UnstructuredTech_RFI.pdf, 040_Gretel_RFI.docx]",17
3,['dcat '],"[008_Zois_American University.pdf, 021_Jess_UVirginia_RFI .pdf, 028_Kitware_RFI.pdf, 034_IEEE_RFI.pdf]",4
4,['dcat-us'],[011_WhirlwindTechnologies_RFI.pdf],1
5,['dublin core'],"[011_WhirlwindTechnologies_RFI.pdf, 013_Data.World_RFI.pdf, 014_Elsevier_RFI.pdf, 025_StardogUnion_RFI.pdf, 034_IEEE_RFI.pdf]",5
6,['hdf5'],"[011_WhirlwindTechnologies_RFI.pdf, 013_Data.World_RFI.pdf, 028_Kitware_RFI.pdf]",3
7,"['iso 19115', 'iso19115', '19115']","[011_WhirlwindTechnologies_RFI.pdf, 013_Data.World_RFI.pdf]",2
8,['json'],"[008_Zois_American University.pdf, 010_ScaleAI_RFI.pdf, 011_WhirlwindTechnologies_RFI.pdf, 013_Data.World_RFI.pdf, 014_Elsevier_RFI.pdf, 015_Element84_RFI.pdf, 017_SAS_RFI.pdf, 020_XBRL_RFI.pdf, 023_Deloitte_RFI.docx, 027_CSET_RFI.pdf, 028_Kitware_RFI.pdf, 029_Leidos_RFI.pdf, 033_PROTESORO_RFI.pdf, 034_IEEE_RFI.pdf, 036_Microsoft_RFI.pdf, 037_UnstructuredTech_RFI.pdf, 040_Gretel_RFI.docx, 102_MIT_RFI.pdf]",18
9,[' owl'],"[011_WhirlwindTechnologies_RFI.pdf, 025_StardogUnion_RFI.pdf]",2


#### Find all licenses mentioned, and the text around them to provide context, in order to find new types that haven't been included yet 
##### This method can be used for any other words for more context and checking the intended meaning


In [5]:
licenses = []
for file,text in zip(RFIs['file'],RFIs['text']): 
    licenses.append([file, re.findall('.{100}license.{150}', text)])
licenses

[['005_OmniTrustAI_RFI.pdf', []],
 ['006_Narendra_RFI.docx', []],
 ['008_Zois_American University.pdf',
  ['d statistical validation methods. 4. data licensing: 1 • open licensing: commerce should adopt open licenses like creative commons (cc by 4.0) to facilitate broad reuse while ensuring attribution. • clear licensing terms: standardized, machine-readable lice']],
 ['009_MySidewalk_RFI.pdf', []],
 ['010_ScaleAI_RFI.pdf',
  ['ced and able to support development of models or applications, with  attribution. open data commons licenses offer clear terms and conditions for the use, sharing,  and modification of data. commerce, in collaboration with its industry partners, should cons',
   't any derived data must also be shared. it should be noted however, that any  disclaimers in such a license are not likely to fully address concerns about data quality, privacy, or  ethical use (all of which will be addressed at various points elsewhere in ',
   'nal any  accompanying conditions or rest

#### Additional Analyses (not initially useful)

In [None]:
### tfidf analysis
# def tfidf_data(reviews_ser, review):
#     # Remove non-alphanumeric characters and split by whitespace
#     words = re.findall(r'\b\w+\b', review)
#     out = pd.DataFrame(pd.Series(words).value_counts(), columns=['count']) 
#     #out = pd.DataFrame(pd.Series(review.split()).value_counts())

#     out['indx'] = out.index
#     out['tf'] = out['count']/len(review.split())
#     out['idf'] = np.log(len(reviews_ser)/ (out['indx'].apply(lambda word: sum(reviews_ser.str.contains(fr'\b{re.escape(word)}\b', regex=True)))))
#     out['tfidf'] = out['tf']*out['idf']
#     return out.drop(columns = 'indx').sort_values(['tfidf', 'count'])

# for filename in os.listdir(directory):
#     if filename!= '.DS_Store' and filename!= 'combined.pdf':
#         with open(os.path.join(directory, filename), 'rb') as f:
#             rfi = ''
#             pdf = pdftotext.PDF(f)
#             for page in pdf:
#                 rfi+=page.lower()
#         print(filename)
#         display(tfidf_data(RFIs, rfi))
#     break


### count all word frequencies overall
# directory = r'RFI_Submissions'
# with open(os.path.join(directory, 'combined.pdf'), 'rb') as f:
#     combined = ''
#     pdf = pdftotext.PDF(f)
#     for page in pdf:
#         combined+=page.lower()
# words = re.findall(r'\b\w+\b', combined)
# all_counts = pd.DataFrame(pd.Series(words).value_counts(), columns=['count']) 
# all_counts