In [5]:
import os
from tabulate import tabulate

# Define a function to count the converted text (.txt) files in a folder
def count_pdf_files(folder, extension='.txt'):
    """Count the entries of ``folder`` whose name ends with ``extension``.

    Despite its name, this function counts ``.txt`` files by default: the
    notebook calls it on sub-folders of ``data/nlp``, the directory where
    the text extracted from each PDF is written, so the result is the
    number of PDFs that already have a text version.

    Args:
        folder: Path of the directory to inspect. Only its direct entries
            are examined; sub-folders are not traversed.
        extension: Case-sensitive file-name suffix to match. Defaults to
            ``'.txt'``, which is what the original implementation matched.

    Returns:
        The number of entries in ``folder`` whose name ends with
        ``extension``.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder`` does not
            exist or cannot be read.
    """
    return sum(1 for entry in os.listdir(folder) if entry.endswith(extension))

# Root directory that contains the folders to be counted
root_directory = 'data/nlp'  # Updated with the new path

# Keep only the entries of the root directory that are themselves directories
directories = list(filter(
    lambda name: os.path.isdir(os.path.join(root_directory, name)),
    os.listdir(root_directory),
))

# Build one [folder name, count] row per directory, ordered from the highest
# count to the lowest (ties keep the order in which the folders were listed)
folder_pdf_counts = sorted(
    ([name, count_pdf_files(os.path.join(root_directory, name))] for name in directories),
    key=lambda row: row[1],
    reverse=True,
)

# Print the table
print(tabulate(folder_pdf_counts, headers=['Folder Name', 'PDF Count']))


Folder Name                            PDF Count
-----------------------------------  -----------
SICREDI CAMPOS GERAIS E GRANDE CURI           89
COOP SICREDI ARAXINGÚ                         31
SICREDI CEARÁ                                 28
COOP SICREDI NORTE                            26
COOP SICREDI NORTE SC                         26
COOP SICREDI CELEIRO OESTE                    25
COOP SICREDI UNIÃO METROPOLITANA              21
COOP SICREDI EVOLUÇÃO                         19
COOP SICREDI VALE SÃO FRANCISCO               18
COOP SICREDI SUDOESTE MT_PA                   17
CC SICREDI COOMAMP                            16
COOP SICREDI FORÇA DOS VENTOS                 16
COOP SICREDI ROTA DAS TERRAS RS_MG            15
COOP SICREDI ARACAJU                          13
COOP SICREDI CAMPO GRANDE                     13
COOP SICREDI INTEGRAÇÃO RS_MG                 13
SICREDI CREDUNI                               13
COOP SICREDI CENT PERNAMBUCANA                10
COOP SICREDI CULTURA

In [None]:
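# One-time setup for the OCR cell below (uncomment and run only if the packages are missing):
# %pip install pdf2image pytesseract PyPDF2
#
# pdf2image and pytesseract are wrappers: they also need the Poppler command-line
# tools and the Tesseract-OCR engine installed on the machine.
# Earlier installation attempts, kept for reference:
#pip install poppler-utils
#pip install tesseract
# !pip install tesseract-ocr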




In [7]:
import os
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from pdf2image.exceptions import PDFPageCountError

# Folder of the local Tesseract-OCR installation (hard-coded Windows path).
# It is the parent directory of the tessdata directory and also holds the
# tesseract executable.
tesseract_install_dir = r'C:\Program Files (x86)\Tesseract-OCR'

# Language passed to pytesseract for OCR
# NOTE(review): the folder and file names in this notebook look Portuguese;
# confirm that 'eng' is the intended language.
tesseract_lang = 'eng'

# Tell pytesseract where the Tesseract-OCR executable is
pytesseract.pytesseract.tesseract_cmd = tesseract_install_dir + r'\tesseract.exe'

# Point TESSDATA_PREFIX at the parent directory of the tessdata directory
os.environ['TESSDATA_PREFIX'] = tesseract_install_dir

# Function to extract text from a PDF file using OCR
def extract_text_ocr(file_path):
    """Extract the text of a PDF file through OCR.

    The PDF is converted with ``pdf2image.convert_from_path`` and every
    resulting page is passed to ``pytesseract.image_to_string`` using the
    module-level ``tesseract_lang`` language.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        The text of all pages concatenated in page order. Errors are
        reported with ``print`` and never raised: the text gathered before
        the failure (possibly an empty string) is returned.
    """
    page_texts = []
    try:
        for page_image in convert_from_path(file_path):
            page_texts.append(pytesseract.image_to_string(page_image, lang=tesseract_lang))
    except PDFPageCountError as error:
        # Raised by pdf2image when the page count of the PDF cannot be read
        print(f"Error getting the number of pages from PDF file '{file_path}': {error}")
    except Exception as error:
        # Best effort: any other failure is logged and the partial text is kept
        print(f"Error extracting text from PDF file '{file_path}': {error}")
    return "".join(page_texts)

# Function to extract text from a PDF file
def extract_text_pdf(file_path):
    """Extract the text of a PDF file with ``PdfReader``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order. An empty string
        is returned for encrypted files. Errors (including a file that
        cannot be opened) are reported with ``print`` and never raised: the
        text gathered before the failure is returned.
    """
    text = ""
    try:
        with open(file_path, 'rb') as pdf_stream:
            reader = PdfReader(pdf_stream)
            if not reader.is_encrypted:
                # Rewind the stream before walking the pages
                pdf_stream.seek(0)
                for page in reader.pages:
                    page_text = page.extract_text()
                    text += page_text
            else:
                print(f"PDF file '{file_path}' is encrypted. Unable to extract text.")
    except Exception as error:
        # Best effort: the failure is logged and the partial text is kept
        print(f"Error reading PDF file '{file_path}': {error}")
    return text

# Input and output directories
input_directory = 'data/raw data'
output_directory = 'data/nlp'

# A text layer containing fewer spaces than this is treated as missing or
# unusable, and the PDF is sent through OCR instead.
MIN_SPACES_FOR_TEXT_LAYER = 40

# Traverse subfolders and PDF files, writing one .txt file per PDF into a
# folder tree under output_directory that mirrors the input tree.
for root_folder, subfolders, files in os.walk(input_directory):
    # Map the current input folder onto the output tree. relpath only strips
    # the leading input_directory, whereas str.replace would also rewrite any
    # later occurrence of that text inside the path.
    relative_folder = os.path.relpath(root_folder, input_directory)
    output_folder = os.path.normpath(os.path.join(output_directory, relative_folder))

    for file in files:
        # Case-insensitive match so that files named '*.PDF' are not ignored
        if not file.lower().endswith('.pdf'):
            continue

        full_path = os.path.join(root_folder, file)
        output_file_name = os.path.splitext(file)[0] + '.txt'
        output_path = os.path.join(output_folder, output_file_name)

        # Conversion is expensive, so files converted by a previous run are skipped
        if os.path.exists(output_path):
            print(f"Text file '{output_file_name}' already exists. Skipping to the next file.")
            continue

        print(f"Processing file: {full_path}")

        try:
            extracted_text = extract_text_pdf(full_path)
            if extracted_text.count(' ') < MIN_SPACES_FOR_TEXT_LAYER:
                ocr_text = extract_text_ocr(full_path)
                # Keep the (short) text layer when OCR produced nothing, instead
                # of overwriting it with an empty string
                if ocr_text.strip():
                    extracted_text = ocr_text

            # Both extractors report errors by returning an empty string. Writing
            # an empty file here would make the "already exists" check above skip
            # this PDF on every later run, hiding the failure permanently.
            if not extracted_text.strip():
                print(f"No text could be extracted from '{full_path}'. No output file written.")
                continue

            os.makedirs(output_folder, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as file_output:
                file_output.write(extracted_text)

            print(f"Completed: {full_path}")

        except Exception as e:
            # Best effort: one failing file must not stop the whole batch
            print(f"Error during processing of '{full_path}': {e}")


Text file 'ago-age_de_04042023_-_sicredi_coomamp.txt' already exists. Skipping to the next file.
Text file 'ata_age_digital_2020-10906245163955933242.txt' already exists. Skipping to the next file.
Text file 'ata_ago-age_2020_-_sicredi_coomamp-14269571789062083743.txt' already exists. Skipping to the next file.
Text file 'ata_agoage_2022_-_sicredi_coomamp.txt' already exists. Skipping to the next file.
Text file 'ata_ago_age_2018-5498222428478009411.txt' already exists. Skipping to the next file.
Text file 'ata_de_homologacao-2409316498742798561.txt' already exists. Skipping to the next file.
Text file 'chapas_inscritas-7992939441822030231.txt' already exists. Skipping to the next file.
Text file 'edital-de-convocacao-ago-age-2020-9922671639141617290.txt' already exists. Skipping to the next file.
Text file 'edital_age__presencial_maio2023.txt' already exists. Skipping to the next file.
Text file 'edital_ago-age_-_2022-5192814098258424985.txt' already exists. Skipping to the next file.