In [1]:
import os
import pinecone
import os.path
from dotenv import load_dotenv

# Load environment variables from a ".env" file.
# NOTE: "__file__" is a quoted string literal here, so os.path.dirname()
# returns "" and the path resolves relative to the current working directory.
env_filepath = os.path.join(os.path.dirname("__file__"), ".env")
load_dotenv(dotenv_path=env_filepath)

# The API key is read from the environment (populated by load_dotenv above);
# the Pinecone environment name is fixed in this notebook.
env_name = "gcp-starter"
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

pinecone.init(environment=env_name, api_key=pinecone_api_key)

In [3]:
# Check pinecone-client compatibility
import pinecone.info
import console

version_info = pinecone.info.version()

# Only the leading "major.minor" components of each version string are
# compared; anything after the second dot is ignored.
server_version = ".".join(version_info.server.split(".", 2)[:2])
print(f"Pinecone server version: {server_version}")

client_version = ".".join(version_info.client.split(".", 2)[:2])
print(f"Pinecone client version: {client_version}")

# Stop the notebook early when client and server disagree on major.minor.
assert server_version == client_version, (
    "Pinecone server and client versions do not match. PLease  update the Pinecone client."
)

Pinecone server version: 2.0
Pinecone client version: 2.0


In [4]:
# Connect to Pinecone index
try:
    # Prefer the conventional underscore-separated variable name and fall back
    # to the hyphenated spelling this cell originally read, so an existing
    # .env file that uses either spelling keeps working.
    pinecone_index_name = os.getenv("PINECONE_INDEX_NAME") or os.getenv("PINECONE-INDEX_NAME")
    if not pinecone_index_name:
        # Fail here with a clear message instead of handing None to pinecone.Index.
        raise ValueError(
            "No Pinecone index name configured: set PINECONE_INDEX_NAME in the environment or .env file."
        )
    pinecone_index = pinecone.Index(pinecone_index_name)
except Exception as e:
    # Python 3 exceptions have no `.message` attribute; accessing it would
    # raise AttributeError inside this handler. Print the exception itself.
    print(e)

In [5]:
# PDF Parsing
# Methodology taken from:
# https://towardsdatascience.com/extracting-text-from-pdf-files-with-python-a-comprehensive-guide-9fc4003d517
# Author: George Stavrakis

import PyPDF2
import pdfplumber
import pytesseract
import pdf2image
from datetime import datetime
from PIL import Image
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure

In [6]:

# Directory holding the PDF documents to parse.
# NOTE: "__file__" is a quoted string literal here, not the __file__ variable,
# so os.path.dirname() returns "" and this evaluates to "pdfs", i.e. a path
# relative to the current working directory.
pdfs_dir_path = os.path.join(os.path.dirname("__file__"), "pdfs")

# Step 1: Preliminary analysis of PDF documents to determine each PDF type:
# programmatically generated; scanned images; or scanned documents with OCR.
# Use PDFMiner to take a PDF document object of multiple pages and analyse each page.

def crop_image_from_pdf(element, page):
    """
    Crop a PDF page to the bounding box of a layout element and save the
    cropped page as a new single-page PDF.

    :param element: Layout element exposing ``x0``, ``y0``, ``x1`` and ``y1``
        coordinates.
    :param page: Page object whose ``mediabox`` is overwritten in place with
        the element's box before being added to a ``PyPDF2.PdfWriter``.
    :return: The filepath of the PDF that was written. The filename is a
        timestamp with no extension.
    """
    # Get the coordinates of the element's bounding box
    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
    # Crop the page by shrinking its media box to the element's box
    page.mediabox.lower_left = (image_left, image_bottom)
    page.mediabox.upper_right = (image_right, image_top)
    # Put the cropped page into a new PDF
    writer = PyPDF2.PdfWriter()
    writer.add_page(page)

    # Timestamped name (down to microseconds) so successive crops do not collide.
    output_filename = datetime.now().strftime("%d_%m_%y_%H_%M_%S_%f")
    # os.path.dirname() accepts a single argument; the filename has to be
    # attached with os.path.join(). "__file__" is a string literal, so the
    # directory part is "" and the file lands in the current working directory.
    output_filepath = os.path.join(os.path.dirname("__file__"), output_filename)

    # Write to the same path that is returned to the caller.
    with open(output_filepath, "wb") as image_to_pdf:
        writer.write(image_to_pdf)

    return output_filepath

def convert_pdf_to_image(filepath):
    """
    Render the first page of a PDF and save it as a PNG file.

    The PNG is written to the same path as the PDF with the extension
    replaced by ".png".

    :param filepath: The filepath of the PDF to render.
    """
    # Only the first rendered page is kept.
    first_page = pdf2image.convert_from_path(filepath)[0]
    path_root, _extension = os.path.splitext(filepath)
    first_page.save(path_root + ".png", "PNG")

def extract_text_from_image(filepath):
    """
    Extracts text from PNG images using Google Tesseract
    Optical Character Recognition (OCR).

    :param filepath: The filepath of the image. Only PNG images are supported.
    :return: The text recognised in the image, or ``None`` when ``filepath``
        does not end in ".png" (case-insensitive).
    """
    # str.endswith() needs the suffix as its argument; calling it with no
    # arguments and comparing the result to ".png" raises TypeError.
    if not filepath.lower().endswith(".png"):
        return None

    # The context manager closes the underlying file once OCR has finished.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image)
        

def LTFigure_to_text(element, page):
    """
    Extract the text contained in a figure element by cropping it out of its
    page, rendering the crop to a PNG and running OCR on that image.

    :param element: The figure layout element to extract text from.
    :param page: The page object the figure belongs to; passed through to
        ``crop_image_from_pdf``.
    :return: Whatever ``extract_text_from_image`` returns for the rendered PNG.
    """
    # To apply OCR software to images, images must be cropped from other pages.
    cropped_pdf_filepath = crop_image_from_pdf(element=element, page=page)
    convert_pdf_to_image(cropped_pdf_filepath)
    # convert_pdf_to_image saves the PNG next to the cropped PDF, with the
    # extension swapped for ".png".
    image_filepath = os.path.splitext(cropped_pdf_filepath)[0] + ".png"
    # Return the OCR result instead of discarding it.
    return extract_text_from_image(image_filepath)
    

def LTTextContainer_to_text(element):
    """
    Return the text held by a text-container layout element.

    :param element: An element exposing a ``get_text()`` method.
    :return: The value returned by ``element.get_text()``.
    """
    # Previously the text was bound to an unused local and never returned.
    return element.get_text()

def LTRect_to_text(element):
    """
    Placeholder for rectangle elements: no extraction is implemented yet.

    :param element: The rectangle layout element (currently unused).
    :return: Always ``None``.
    """
    return None

def convert_pdf_to_text(filepath):
    """
    Extract the text of a PDF by walking every layout element on every page.

    This replaces two earlier definitions of the same name: the second one
    silently shadowed the first and referenced ``PdfFileReader``, a name that
    is never imported in this notebook.

    Text containers are read directly, figures are sent through the
    crop/render/OCR helper, and rectangles go to the (currently stubbed)
    rectangle helper. Any other element type produces a warning and is skipped.

    :param filepath: The filepath of the PDF document.
    :return: The extracted text fragments concatenated in the order they were
        encountered. Helpers that return nothing contribute nothing.
    """
    # Local import: `warnings` is not imported elsewhere in the notebook.
    import warnings

    text_parts = []
    with open(filepath, 'rb') as file:
        # PyPDF2 page objects are what the cropping helper works on
        # (it sets `page.mediabox` and adds the page to a PdfWriter).
        pdf = PyPDF2.PdfReader(file)

        for page_number, page_layout in enumerate(extract_pages(filepath)):
            # Assumes pdfminer and PyPDF2 enumerate pages in the same order,
            # so one index addresses the same page in both - TODO confirm.
            pdf_page = pdf.pages[page_number]

            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    text = LTTextContainer_to_text(element)
                elif isinstance(element, LTFigure):
                    text = LTFigure_to_text(element, pdf_page)
                elif isinstance(element, LTRect):
                    text = LTRect_to_text(element)
                else:
                    # `console.warning` is not a Python API; use the standard
                    # warnings machinery instead.
                    warnings.warn("Encountered element of unknown type - unable to parse. Processing next element...")
                    continue

                # Skip None and empty strings returned by the helpers.
                if text:
                    text_parts.append(text)

    return "".join(text_parts)
        

def parse_pdfs(pdfs_dir_path):
    """
    Locate the PDF documents stored in a directory.

    :param pdfs_dir_path: The directory to search (not recursive).
    :raises FileNotFoundError: If ``pdfs_dir_path`` does not exist.
    :return: The filepaths of all entries whose name ends in ".pdf"
        (case-insensitive), sorted by filename. Previously each path was
        built inside the loop and then discarded.
    """
    if not os.path.exists(pdfs_dir_path):
        raise FileNotFoundError(f"The directory {pdfs_dir_path} cannot be found.")

    # Sort so the result does not depend on the order os.listdir() happens to use.
    return [
        os.path.join(pdfs_dir_path, filename)
        for filename in sorted(os.listdir(pdfs_dir_path))
        if filename.lower().endswith('.pdf')
    ]

In [7]:
# Scratch check: swapping a path's extension for ".png" via os.path.splitext.
testpath = r"/Users/niallmcmanus/Documents/TrafficLight"
f"{os.path.splitext(testpath)[0]}.png"

'/Users/niallmcmanus/Documents/TrafficLight.png'

In [26]:
# Scratch check of the timestamp format used for cropped-PDF filenames.
print(f"{datetime.now():%d_%m_%y_%H_%M_%S_%f}")

17_01_24_20_28_09_101123
