## PDF Search of Keywords

**Set Up**

Install libraries as needed.

In [None]:
#!pip install pdfplumber
#!pip install nltk

Import libraries and download the NLTK `punkt` tokenizer data.

In [1]:
import pdfplumber
import os
import re
import csv
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mark.zais\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**PDF Text Search**

Define a function to extract text from PDF with page numbers

In [2]:
def extract_text_from_pdf_with_pages(pdf_path):
    """Collect the extractable text of each page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of ``(page_number, text)`` tuples with page numbers
        starting at 1. Pages for which ``extract_text()`` returns a falsy
        value (``None`` or an empty string) are left out.
    """
    with pdfplumber.open(pdf_path) as document:
        numbered_text = (
            (number, page.extract_text())
            for number, page in enumerate(document.pages, start=1)
        )
        # Materialise the list inside the ``with`` block, while the PDF
        # is still open.
        return [(number, text) for number, text in numbered_text if text]

Define a function to search for sentences with both keywords

In [3]:
from nltk.tokenize import sent_tokenize

def search_sentences_with_both_keywords(data, keywords):
    """Find the sentences that contain both keywords, ignoring case.

    Args:
        data: Iterable of ``(page_number, text)`` pairs.
        keywords: Sequence of exactly two keyword strings. Each keyword is
            matched literally (regex metacharacters are escaped) anywhere
            in a sentence, so it also matches inside longer words.

    Returns:
        A list of ``(page_number, sentence)`` tuples, with each sentence
        stripped of surrounding whitespace, in page and sentence order.

    Raises:
        ValueError: If ``keywords`` does not contain exactly two items.
    """
    if len(keywords) != 2:
        raise ValueError("This function is designed for exactly two keywords.")

    first_kw, second_kw = keywords
    matchers = (
        re.compile(re.escape(first_kw), re.IGNORECASE),
        re.compile(re.escape(second_kw), re.IGNORECASE),
    )

    # Text is split into sentences with NLTK's ``sent_tokenize``; a sentence
    # is kept only when every matcher finds its keyword in it.
    return [
        (page_num, sentence.strip())
        for page_num, text in data
        for sentence in sent_tokenize(text)
        if all(matcher.search(sentence) for matcher in matchers)
    ]

Define a function to write results to CSV

In [4]:
def write_results_to_csv(results, output_csv):
    """Write search results to a UTF-8 CSV file with a header row.

    Args:
        results: Iterable of rows. Each row is written as given, after the
            header ``Document, Page Number, Sentence``.
        output_csv: Path of the file to write; it is opened in ``'w'`` mode,
            so an existing file is overwritten.
    """
    header = ['Document', 'Page Number', 'Sentence']
    # newline='' lets the csv module control line endings itself.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(results)

Set folder path

In [5]:
# Folder containing the PDFs to search. A raw string is used so that no
# backslash in the Windows path (e.g. the one before "BD Data") is ever
# interpreted as the start of an escape sequence.
folder_path = r'C:\Users\mark.zais\OneDrive - Integration Innovation\BD Data Capture Opportunities\AIMSS\Bidders Library\bidders_library_pdfs'

Pick the two keywords to search for

In [6]:
# The two terms that must both appear in a sentence for it to be reported.
keywords = ["data", "governance"]

Name the CSV output file

In [7]:
# File name of the CSV that receives the search results.
output_csv = "search_results.csv"

Loop through PDFs and search for sentences with both keywords.
Export result to a CSV file.

In [8]:
# Search every PDF in ``folder_path`` and collect the matching sentences as
# (filename, page_number, sentence) rows, then export them to ``output_csv``.
all_results = []

# os.listdir() returns entries in arbitrary order; sorting makes the
# processing order (and therefore the CSV row order) deterministic.
for filename in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so that files such as
    # "REPORT.PDF" are not silently skipped.
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(folder_path, filename)
    print(f"Processing: {filename}")
    try:
        data = extract_text_from_pdf_with_pages(pdf_path)
        results = search_sentences_with_both_keywords(data, keywords)
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error processing {filename}: {e}\n")
    else:
        all_results.extend(
            (filename, page_num, sentence) for page_num, sentence in results
        )

write_results_to_csv(all_results, output_csv)
print(f"Results have been exported to {output_csv}")

Processing: 206_Continuous_Developmental_Integration_Working_Group_Charter__Certified_Current_Feb_7__2023_.pdf
Processing: Accountability_and_Management_of_MDA_Equipment_and_Other_Accountable_Property_Manual.pdf
Processing: BMDS_Ground_Test_Concept_of_Operations.pdf
Processing: Common_Test_Nomenclature.pdf
Processing: Cyberspace_Workforce_Management_Program.pdf
Processing: Data_Program_Strategy_Implementation.pdf
Processing: DD_254__HQ0857_23_R_0001__AIMSS_Signed.pdf
Processing: DT_101_BMDS_Cybersecurity_Test_StrategyDS.pdf
Processing: DT_102_BMDS_Cybersecurity_Test_CONOPS.pdf
Processing: DX_DeSimone_Signed_DGB_Charter_MDA_220923_MWCK.pdf
Processing: Engineering_Technical_Review_Process.pdf
Processing: Flight_Test_Failure_Response_Process.pdf
Processing: Guide_DataProducer_1.pdf
Processing: ICJ_Memo_028_ACAS_Exclusion_Exemption_and_Deviation_07Dec2022.pdf
Processing: ICJ_Memo_029_ESS_Baselines_19Jan2023.pdf
Processing: ICN_Emergency_Use_Passowrd_Policy__signed_.pdf
Processing: ICN_SOP_