In [3]:
import os
import pdfplumber
import shutil
from PyPDF2 import PdfReader

In [4]:
# Paths are resolved relative to the directory the kernel is running in.
current_directory = os.getcwd()
# Source folder holding the downloaded PDFs: <cwd>/../data/Downloaded_Documents
pdf_folder = os.path.join(
    current_directory,
    '..',
    'data',
    'Downloaded_Documents',
)

In [5]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages. Pages that yield no text contribute nothing.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can return None for a page without a text layer
        # (e.g. a scanned image); treat that as an empty string so a single
        # such page does not abort the whole document with a TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)

# Sanity check: show what the kernel's working directory contains.
print("Content of the current folder:", os.listdir(os.curdir))

Content of the current folder: ['Downloaded_Documents', '.DS_Store', 'OCR_Required', 'documents.csv', 'doris_data_preprocessing.ipynb', 'OCR_Not_Required', 'doris_data_preprocessing.py']


In [6]:
# Collect the names of all PDFs sitting directly in the download folder.
pdf_files = [entry for entry in os.listdir(pdf_folder) if entry.endswith('.pdf')]

# Extract the text of each PDF; pdf_contents[i] belongs to pdf_files[i].
pdf_contents = [read_pdf(os.path.join(pdf_folder, entry)) for entry in pdf_files]

# Report what was loaded, one file per line, followed by the total count.
print("Loaded PDF files:")
for pdf_file in pdf_files:
    print(f" - {pdf_file}")
print(f"\nTotal number of PDF files loaded: {len(pdf_files)}")

Loaded PDF files:
 - DORIS_Automated_speed_enforcement_program_report_nyc_government_publications.pdf
 - DORIS_Barnes_Dance_Study_nyc_government_publications.pdf
 - DORIS_Bike_Share_Usage_Data_Report_-_Q3_2017_nyc_government_publications.pdf
 - DORIS_Strategic_plan_2016_nyc_government_publications.pdf
 - DORIS_Brooklyn_Bridge_Promenade_Report_nyc_government_publications.pdf
 - DORIS_Bike_Share_Usage_Data_Report_-_Q2_2017_nyc_government_publications.pdf
 - DORIS_Accessible_Waterborne_Commuter_Services_nyc_government_publications.pdf
 - DORIS_Vision_Zero_Accomplishments_Report_nyc_government_publications.pdf
 - DORIS_Neighborhood_Slow_Zones_Report_nyc_government_publications.pdf
 - DORIS_Vision_Zero_Year_Four_Report_nyc_government_publications.pdf
 - DORIS_Coordinated_Street_Furniture_Franchise_Year_11_Update_nyc_government_publications.pdf
 - DORIS_Performance_Indicators_Report_nyc_government_publications.pdf
 - DORIS_Accessible_Water_Borne_Commuter_Services_Report_nyc_government_public

In [7]:
def is_ocr_required(pdf_path):
    """Report whether a PDF has no extractable text on any of its pages.

    Args:
        pdf_path: Path of the PDF file to inspect; opened in binary mode.

    Returns:
        False as soon as one page yields non-whitespace text, True when no
        page does (which includes a document with zero pages).
    """
    with open(pdf_path, 'rb') as pdf_stream:
        # Pages are examined in order; any() stops at the first page with text.
        has_text = any(
            len(page.extract_text().strip()) > 0
            for page in PdfReader(pdf_stream).pages
        )
    return not has_text

In [9]:
# Input folder plus the two destination folders, all siblings under <cwd>/../data.
pdf_folder, ocr_required_folder, ocr_not_required_folder = (
    os.path.join(current_directory, '..', 'data', leaf)
    for leaf in ('Downloaded_Documents', 'OCR_Required', 'OCR_Not_Required')
)

In [10]:
# Create both destination folders; exist_ok makes re-running this cell harmless.
for folder in (ocr_required_folder, ocr_not_required_folder):
    os.makedirs(folder, exist_ok=True)

In [11]:
# Re-list the download folder, then file each PDF under the matching OCR folder.
pdf_files = [entry for entry in os.listdir(pdf_folder) if entry.endswith('.pdf')]

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)

    # Pick the status message and destination from the text-layer check.
    if is_ocr_required(pdf_path):
        status_line = f"OCR is required for {pdf_file}"
        target_folder = ocr_required_folder
    else:
        status_line = f"OCR is not required for {pdf_file}"
        target_folder = ocr_not_required_folder

    print(status_line)
    # The file is moved (not copied), so it leaves the download folder.
    shutil.move(pdf_path, os.path.join(target_folder, pdf_file))

OCR is not required for DORIS_Automated_speed_enforcement_program_report_nyc_government_publications.pdf
OCR is not required for DORIS_Barnes_Dance_Study_nyc_government_publications.pdf
OCR is not required for DORIS_Bike_Share_Usage_Data_Report_-_Q3_2017_nyc_government_publications.pdf
OCR is not required for DORIS_Strategic_plan_2016_nyc_government_publications.pdf
OCR is not required for DORIS_Brooklyn_Bridge_Promenade_Report_nyc_government_publications.pdf
OCR is not required for DORIS_Bike_Share_Usage_Data_Report_-_Q2_2017_nyc_government_publications.pdf
OCR is required for DORIS_Accessible_Waterborne_Commuter_Services_nyc_government_publications.pdf
OCR is not required for DORIS_Vision_Zero_Accomplishments_Report_nyc_government_publications.pdf
OCR is not required for DORIS_Neighborhood_Slow_Zones_Report_nyc_government_publications.pdf
OCR is not required for DORIS_Vision_Zero_Year_Four_Report_nyc_government_publications.pdf
OCR is required for DORIS_Coordinated_Street_Furniture_F