In [None]:
!pip install pytesseract==0.3.10
!pip install pdf2image==1.16.3
!pip install pandas==2.0.3
!pip install langdetect==1.0.9
!pip install Pillow==10.0.0
!pip install openai

import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import re
import os
from langdetect import detect
from PIL import Image, ImageEnhance, ImageFilter
import openai
import json

# Configure the OpenAI client from the OPENAI_API_KEY environment variable.
# The value is None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Helper functions
def clean_text(text):
    """Clean OCR text for downstream processing.

    The following normalizations are applied, in order:

    1. Curly single quotes and backticks are replaced by a plain ``'``.
    2. Bracket characters ``{ } ( ) [ ]`` are removed.
    3. Every run of whitespace (including newlines) is collapsed to one space.

    Args:
        text: Raw OCR output as a string.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to at
        most a single space but is not stripped.
    """
    text = text.replace("’", "'").replace("‘", "'").replace("`", "'")  # Standardize quotes
    text = re.sub(r"[{}()\[\]]", "", text)  # Remove brackets
    # Collapse whitespace last: removing a bracket that was surrounded by
    # spaces would otherwise leave a run of spaces in the result.
    text = re.sub(r"\s+", " ", text)
    return text

def preprocess_image(image):
    """Prepare a page image for OCR (intended to help with handwritten text).

    The image goes through three steps: conversion to grayscale, a contrast
    boost, and a sharpening filter.

    Args:
        image: A PIL image (as produced by ``convert_from_path``).

    Returns:
        The grayscale, contrast-enhanced, sharpened image.
    """
    # 'L' is PIL's single-channel grayscale mode.
    grayscale = image.convert('L')
    # Enhancement factor 2.0 increases the contrast of the grayscale image.
    high_contrast = ImageEnhance.Contrast(grayscale).enhance(2.0)
    return high_contrast.filter(ImageFilter.SHARPEN)

def _parse_llm_reply(reply):
    """Turn the raw LLM reply into a dict that always has the expected keys."""
    empty = {
        "Company Name": None,
        "Company Identifier": None,
        "Document Purpose": None,
        "Key Terms": [],
    }

    candidate = (reply or "").strip()
    # Models frequently wrap the JSON in ``` fences or add a sentence around
    # it; keep only the outermost {...} span before parsing.
    start, end = candidate.find("{"), candidate.rfind("}")
    if start != -1 and end > start:
        candidate = candidate[start:end + 1]

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        print("Error parsing JSON from LLM response.")
        return empty

    # The caller unpacks the result with ``**``, so anything but a JSON
    # object (e.g. a list or a bare string) is treated as a parse failure.
    if not isinstance(parsed, dict):
        print("Error parsing JSON from LLM response.")
        return empty

    # Fill in any field the model omitted so every row has the same columns.
    return {**empty, **parsed}


def extract_information(text):
    """Extract structured fields from OCR text using an LLM.

    The text is cleaned with ``clean_text`` and sent to the
    ``gpt-3.5-turbo`` chat model with a prompt asking for a JSON object.

    Args:
        text: Raw OCR text of a document page.

    Returns:
        A dict with at least the keys ``"Company Name"``,
        ``"Company Identifier"``, ``"Document Purpose"`` and ``"Key Terms"``.
        When the reply is empty or cannot be parsed as a JSON object, the
        first three are ``None`` and ``"Key Terms"`` is an empty list.

    Raises:
        Whatever the OpenAI client raises on request failure (network,
        authentication, rate limiting); these errors are not caught here.
    """
    # Clean text if necessary
    text = clean_text(text)

    # Define the prompt for the LLM
    prompt = f"""
You are an assistant that extracts structured data from text documents.

Extract the following information from the text below:

- Company Name
- Company Identifier
- Document Purpose
- Key Terms (list any key terms found such as Nomination, Dismissal, Transfer, Modification, Sale, Renewal, etc.)

Provide the extracted information in the following JSON format:

{{
    "Company Name": "...",
    "Company Identifier": "...",
    "Document Purpose": "...",
    "Key Terms": ["...", "..."]
}}

Text:
{text}
"""
    request = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 500,
        "temperature": 0,  # deterministic output for extraction
    }

    # The notebook installs `openai` unpinned. openai>=1.0 removed
    # `openai.ChatCompletion` and returns objects that are read through
    # attributes; older releases only offer the legacy interface.
    if hasattr(openai, "chat"):
        response = openai.chat.completions.create(**request)
        assistant_reply = response.choices[0].message.content
    else:
        response = openai.ChatCompletion.create(**request)
        assistant_reply = response['choices'][0]['message']['content']

    return _parse_llm_reply(assistant_reply)

# List all PDF files in the input folder
pdf_folder = "/content"  # Adjust the folder based on where your files are
# Sorted so the processing order (and the resulting row order) is reproducible;
# the extension check is case-insensitive so "SCAN.PDF" is not silently skipped.
uploaded_files = [
    os.path.join(pdf_folder, f)
    for f in sorted(os.listdir(pdf_folder))
    if f.lower().endswith('.pdf')
]

# Process each PDF file
all_data = []  # To store extracted information
processed_files = set()  # To avoid duplicated file entries

for pdf_file in uploaded_files:
    file_name = os.path.basename(pdf_file)
    try:
        print(f"Processing file: {pdf_file}")
        if file_name in processed_files:
            continue

        # Convert only the first page of the PDF
        images = convert_from_path(pdf_file, first_page=1, last_page=1)
        print(f"Number of images extracted from {pdf_file}: {len(images)}")
        for image in images:
            # Preprocess the image (especially useful for handwritten text)
            preprocessed_image = preprocess_image(image)

            # Perform OCR with multi-language support. Tesseract language
            # codes: English, French, Dutch ("nld", not "ned"), German.
            text = pytesseract.image_to_string(preprocessed_image, lang='eng+fra+nld+deu')
            print(f"OCR Text for {pdf_file}:\n{text}\n{'='*80}")

            extracted_info = extract_information(text)
            all_data.append({
                "file_name": file_name,
                "page_number": 1,  # Only process the first page
                **extracted_info,
            })
        processed_files.add(file_name)  # Mark this file as processed
    except Exception as e:
        # Top-level boundary: report the failure and continue with the next
        # file so one bad PDF does not abort the whole batch.
        print(f"Error processing {pdf_file}: {e}")

# Convert results to DataFrame
df = pd.DataFrame(all_data)

# Preview of the top rows. NOTE: in a notebook only the last expression of a
# cell is rendered, so this line produces no visible output here.
df.head()

# Display the entire DataFrame (no row limit)
pd.set_option('display.max_rows', None)
df  # Last expression in the cell: rendered as the cell output


