In [1]:
import easyocr
from PIL import Image
import cv2
import numpy as np
from matplotlib import pyplot as plt
import fitz


In [2]:
pdffile = "extract/24000005.pdf"
doc = fitz.open(pdffile)
print(doc.page_count)
zoom = 6
mat = fitz.Matrix(zoom, zoom)
count = 0
# Count variable is to get the number of pages in the pdf
for p in doc:
    count += 1
for i in range(count):
    val = f"image_{i+1}.png"
    page = doc.load_page(i)
    pix = page.get_pixmap(matrix=mat)
    pix.save(val)

doc.close()

1


### Normalization

In [18]:
img = cv2.imread("image_1.png", cv2.IMREAD_GRAYSCALE)

#thresh, im_bw = cv2.threshold(img, 210, 300, cv2.THRESH_BINARY)


## Min-Max Normalization
# If you need it in [0, 255] range for an 8-bit image
normalized_img_255 = cv2.normalize(img, None, alpha=1, beta=255, norm_type=cv2.NORM_MINMAX)

## Z-Score Normalization (Standardization)
# Calculate the mean and standard deviation
mean, std_dev = cv2.meanStdDev(img)
z_score_normalized_img = (img - mean[0][0]) / std_dev[0][0]
z_score_normalized_img_255 = cv2.normalize(z_score_normalized_img, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)

## Apply Histogram Equalization
equalized_img = cv2.equalizeHist(img)

## CLAHE (Contrast Limited Adaptive Histogram Equalization)
clahe = cv2.createCLAHE(clipLimit=1.0, tileGridSize=(100, 10))
clahe_img = clahe.apply(img)

cv2.imwrite("processed_image.png", clahe_img)

True

### Image Scaling

In [19]:
img = cv2.imread("image_1.png", cv2.IMREAD_GRAYSCALE)

# Define target and assumed initial PPI
target_ppi = 300
initial_ppi = 150  # Adjust this if you know the initial PPI (e.g., 72, 96, or 150 PPI)

# Calculate scaling factor to achieve 300 PPI or higher
scaling_factor = target_ppi / initial_ppi

# Calculate new dimensions based on the scaling factor
new_width = int(img.shape[1] * scaling_factor)
new_height = int(img.shape[0] * scaling_factor)
new_size = (new_width, new_height)

# Resize the image with high-quality interpolation (Cubic interpolation)
upscaled_img = cv2.resize(clahe_img, new_size, interpolation=cv2.INTER_LANCZOS4)

# Save the upscaled image
cv2.imwrite("processed_image.png", upscaled_img)

True

### Noise Removal

In [21]:
# Load the image
img = cv2.imread("image_1.png", cv2.IMREAD_GRAYSCALE)

# Step 1: Apply Median Blur to remove salt-and-pepper noise
denoised_img = cv2.medianBlur(upscaled_img, 3)  # Use a small kernel size to avoid blurring text

# Step 2: Apply Gaussian Blur for additional smoothing
denoised_img_g = cv2.GaussianBlur(denoised_img, (5, 5), 4)

# Optional Step: Apply Bilateral Filter if edges need more preservation
denoised_img_s = cv2.bilateralFilter(denoised_img_g, 10, 75, 75)

cv2.imwrite("processed_image.png", denoised_img_s)

True

### Thinning and Skeletonization

In [90]:
img = cv2.imread("image_1.png", cv2.IMREAD_GRAYSCALE)

kernel = np.ones((2, 2), np.uint8)

erosion = cv2.erode(img, kernel, iterations = 1)
dilation = cv2.dilate(img, kernel, iterations = 1) # opposite of erosion
opening = cv2.morphologyEx(erosion, cv2.MORPH_OPEN, kernel)
closing = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)


cv2.imwrite("processed_image.png", erosion)

True

### Thresholding and Binarization

In [3]:
img = cv2.imread("image_1.png", cv2.IMREAD_GRAYSCALE)

denoised_img = cv2.medianBlur(img, 3)  # Kernel size can be adjusted
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced_img = clahe.apply(denoised_img)
binary_img = cv2.adaptiveThreshold(
    enhanced_img, 
    255, 
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 
    19, 
    1
)

kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
dilated_img = cv2.dilate(binary_img, kernel, iterations=1)
thresh, im_bw = cv2.threshold(dilated_img, 240, 210, cv2.THRESH_BINARY)
cv2.imwrite("processed_image.png", im_bw)  # Or use `deskewed_img` if dilation is not needed

True

In [None]:
# Load image using cv2
image = cv2.imread("image_1.png")

# Example preprocessing: convert to grayscale and apply GaussianBlur
gray_img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Normalization
clahe = cv2.createCLAHE(clipLimit=1.0, tileGridSize=(100, 10))
clahe_img = clahe.apply(gray_img)

# Removing Noise
denoised_img = cv2.GaussianBlur(clahe_img, (5, 5), 0)

kernel = np.ones((2, 2), np.uint8)
erosion = cv2.erode(denoised_img, kernel, iterations = 1)

thresh, im_bw = cv2.threshold(erosion, 220, 300, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

cv2.imwrite("processed_image.png", im_bw)

True

### Get bounding boxes

In [40]:
# Initialize the EasyOCR reader with the desired language
reader = easyocr.Reader(['fr'])  # Specify language

# Load the image
image_file = "processed_image.png"
img = cv2.imread(image_file)

# Detect text using EasyOCR
results = reader.readtext(img)

# Initialize a list to store the coordinates of horizontal bounding boxes
horizontal_boxes = []

# Collect bounding boxes that are horizontal
for (bbox, text, confidence) in results:
    if confidence > 0.3:  # Confidence threshold
        # bbox contains the coordinates of the box around the text
        (top_left, top_right, bottom_right, bottom_left) = bbox
        top_left = tuple(map(int, top_left))
        bottom_right = tuple(map(int, bottom_right))

        # Calculate width and height
        width = bottom_right[0] - top_left[0]
        height = bottom_right[1] - top_left[1]
        
        # Only process horizontal bounding boxes
        if width > height:
            horizontal_boxes.append((top_left[0], top_left[1], bottom_right[0], bottom_right[1]))  # Store as (x1, y1, x2, y2)

# Sort boxes by their y-coordinate (top of the bounding box) for line-based grouping
horizontal_boxes.sort(key=lambda box: box[1])

# Merge bounding boxes on the same line
merged_boxes = []
current_line_boxes = []

# Define a threshold for merging boxes on the same line
y_threshold = 10  # Adjust based on spacing between lines in the image
x_gap_threshold = 20  # Adjust based on spacing between words

for box in horizontal_boxes:
    if not current_line_boxes:
        current_line_boxes.append(box)
        continue

    _, y1, _, y2 = box
    _, prev_y1, _, prev_y2 = current_line_boxes[-1]

    # Check if the box is on the same line as the previous one
    if abs(y1 - prev_y1) <= y_threshold or abs(y2 - prev_y2) <= y_threshold:
        current_line_boxes.append(box)
    else:
        # Merge all boxes in the current line group
        min_x1 = min(b[0] for b in current_line_boxes)
        min_y1 = min(b[1] for b in current_line_boxes)
        max_x2 = max(b[2] for b in current_line_boxes)
        max_y2 = max(b[3] for b in current_line_boxes)
        merged_boxes.append((min_x1, min_y1, max_x2 - min_x1, max_y2 - min_y1))  # Store as (x, y, w, h)
        current_line_boxes = [box]

# Merge any remaining boxes in the last line
if current_line_boxes:
    min_x1 = min(b[0] for b in current_line_boxes)
    min_y1 = min(b[1] for b in current_line_boxes)
    max_x2 = max(b[2] for b in current_line_boxes)
    max_y2 = max(b[3] for b in current_line_boxes)
    merged_boxes.append((min_x1, min_y1, max_x2 - min_x1, max_y2 - min_y1))

# Save the initial image with merged bounding boxes
for (x, y, w, h) in merged_boxes:
    img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)  # Green box

output_image_path = "output_with_bounding_boxes_easyocr.png"
cv2.imwrite(output_image_path, img)
print("Image with merged bounding boxes saved as", output_image_path)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Image with merged bounding boxes saved as output_with_bounding_boxes_easyocr.png


In [41]:
# Initialize a list to store the extracted text from each bounding box
document = []

# Perform OCR on each bounding box region
for i, (x, y, w, h) in enumerate(merged_boxes):
    # Crop each region of interest
    roi = img[y:y + h, x:x + w]

    # Perform OCR on the cropped region
    result = reader.readtext(roi, detail=1)

    # Extract text from OCR results and append to document
    box_text = " ".join([text for (_, text, confidence) in result if confidence > 0.3])
    document.append(box_text)

    # Print results for each bounding box (optional for debugging)
    print(f"Results for bounding box {i + 1}:")
    for (bbox, text, confidence) in result:
        print(f"Detected Text: {text} | Confidence: {confidence}")


Results for bounding box 1:
Detected Text: Cople à publler aux annexes au Rionideur belge | Confidence: 0.3260280522122447
Results for bounding box 2:
Results for bounding box 3:
Detected Text: bcloo | Confidence: 0.7762932974180531
Results for bounding box 4:
Detected Text: 24000005* | Confidence: 0.9953947650202262
Detected Text: au | Confidence: 0.999609236786169
Detected Text: du tribuîffâl dë Y'entreprise | Confidence: 0.24719620795899053
Detected Text: greffe | Confidence: 0.9958295211606153
Results for bounding box 5:
Detected Text: 361986926 | Confidence: 0.36588501912038923
Results for bounding box 6:
Detected Text: (on entler) : | Confidence: 0.673627987758791
Detected Text: GRANDR | Confidence: 0.9742132245710441
Results for bounding box 7:
Detected Text: (en abrégs) | Confidence: 0.5598454815176684
Results for bounding box 8:
Detected Text: Forme légale : | Confidence: 0.7556094759473633
Detected Text: SRL | Confidence: 0.8677277938076173
Results for bounding box 9:
Detecte

In [42]:
document

['Cople à publler aux annexes au Rionideur belge',
 '',
 'bcloo',
 '24000005* au greffe',
 '361986926',
 '(on entler) : GRANDR',
 '(en abrégs)',
 'Forme légale : SRL',
 'Adresse complole du siege : 209 Boulevard Lambermont; Bolte 12, 1030 Schaerbook',
 'MODIFICATION SIEGE SOCIAL',
 'GRANDR 0 ayant 8on slBge sodal à 1030 Schaorbeek; Boulevard',
 'Lambenont 2096012 qu0:',
 '',
 '',
 'du',
 '',
 'Deuxiàme résoluton Approbatlon de la modlcation présentée',
 '@t confne l changement',
 'Coordlnatlon 0t publilcatlon',
 'dit cavant',
 'Lassemblée générale extraordlnalre donne lous pouvoira à Yadminstratour pour faire le nécéssalrequant',
 'POUR BXTRAT ANALYTIQUE',
 '',
 'Gerant TiesOr MADODA TUYINAMA',
 'extraordlnalre du 13 décembre 2023.',
 'Nom 0t quallle du notaire Instrumentant ou d la porgonnoou doe pereonnos',
 '']

In [None]:
from langchain.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from typing import List
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI



model = ChatOpenAI(model="gpt-4o-mini")

# Define the output schema
class Document(BaseModel):
    company_name: str = Field(description="Company Name")
    company_identifier: str = Field(description="Company Identifier Number")
    document_purpose: str = Field(description="Document Purpose")
    key_terms: str = Field(description="Key terms related with the document")

class Info(BaseModel):
    infomration: List[Document]

parser = PydanticOutputParser(pydantic_object=Info)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an AI assistant specialized in extracting key business details from documents. "
            "Your goal is to identify and translate relevant information into English if necessary, then format the output "
            "according to the specified JSON schema. Ensure that each extracted field is accurate, complete, and follows the schema precisely.\n\n"
            "Please extract the following fields:\n\n"
            "1. **Company Name**: The full name of the company as it appears in the document.\n"
            "2. **Company Identifier**: A unique identifier for the company, such as a registration or business number. Only extract the number.\n"
            "3. **Document Purpose**: The purpose or intent of the document (e.g., 'Appointment of Directors', 'Annual Report'). "
            "Translate this to English if it’s in another language.\n"
            "4. **Key Terms about the Document Purpose**: Extract detailed information relevant to the document’s purpose, such as roles, positions, and effective dates. "
            "For instance, if the document covers the appointment of directors, include terms like the position title and effective date. Translate these terms to English if needed.\n\n"
            "{format_instructions}"
        ),
        (
            "human",
            "{query}"
        ),
    ]
).partial(format_instructions=parser.get_format_instructions())

chain = prompt | model | parser
result = chain.invoke({"query": document})

  from .autonotebook import tqdm as notebook_tqdm


In [44]:
print(result)

infomration=[Document(company_name='GRANDR', company_identifier='361986926', document_purpose='Modification of Registered Office', key_terms='Legal form: SRL; Complete address of the registered office: 209 Boulevard Lambermont; Box 12, 1030 Schaerbeek; Extraordinary general assembly gives all powers to the administrator to carry out the necessary arrangements.')]


In [281]:
from dotenv import load_dotenv
from langchain_huggingface  import HuggingFaceEndpoint
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.schema import AIMessage, HumanMessage, SystemMessage

load_dotenv()

# Initialize the LLM
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    model_kwargs={"return_dict": True}
)

model = ChatOpenAI(model="gpt-4o-mini")

prompt = PromptTemplate(
    input_variables=["text"], 
    template="""
    Please take the following text and organize it into well-structured sections with clear and relevant headings \
    and paragraphs in French. Make sure that:

    - Extract what is the Qbjet_de Lacte.
    
    - Each section has an appropriate heading that reflects its content.
    - All original content is preserved without omissions or modifications.
    - The text is grouped into logically organized paragraphs under each heading.

    Text to format:
    {text}
    """
)

chain = prompt | model | StrOutputParser()

result = chain.invoke({"text": document})

In [282]:
print(result)

# Document Officiel

## Objet de l'Acte
La société a pour objet, tant en Belgique qu'à l'étranger, pour son compte propre ou pour le compte de tiers, seul ou en association, les activités suivantes :

- **Constructions**
  - Génie civil : construction de routes, de voies ferrées, de ponts et de tunnels.
  - Construction de réseaux et de lignes.
  - Construction de réseaux pour fluides.
  - Construction de réseaux électriques et de télécommunications.

- **Travaux de construction spécialisés**
  - Démolition ; préparation des sites ; forages d'essai et sondages.
  - Travaux d'installation.
  - Travaux d'isolation.
  - Mise en œuvre dans des bâtiments ou d'autres projets de construction de matériaux d'isolation thermique, matériaux d'isolation acoustique et anti-vibratile.
  - Travaux d'isolation de canalisations de chauffage ou de réfrigération, de chambres froides ou d'entrepôts frigorifiques.
  - Travaux de finition.
  - Travaux de plâtrerie : Application dans des bâtiments ou d'autre

In [245]:
# Initialize the EasyOCR reader with the desired language
reader = easyocr.Reader(['fr'])  # Specify language

# Load the image
image_file = "processed_image.png"
img = cv2.imread(image_file)

# Detect text using EasyOCR
results = reader.readtext(img)

# Initialize a list to store the coordinates of horizontal bounding boxes
bounding_boxes = []

# Draw bounding boxes and store coordinates for each detected horizontal text region
for (bbox, text, confidence) in results:
    if confidence > 0.4:  # Confidence threshold
        # bbox contains the coordinates of the box around the text
        (top_left, top_right, bottom_right, bottom_left) = bbox
        top_left = tuple(map(int, top_left))
        bottom_right = tuple(map(int, bottom_right))

        # Check if the bounding box is horizontal by comparing width and height
        width = bottom_right[0] - top_left[0]
        height = bottom_right[1] - top_left[1]
        
        if width > height:  # Only process if the width is greater than height
            # Draw the rectangle around the horizontal text
            img = cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)  # Green box
            
            # Append the bounding box to the list
            bounding_boxes.append((top_left[0], top_left[1], width, height))

# Save the initial image with bounding boxes
output_image_path = "output_with_bounding_boxes_easyocr.png"
cv2.imwrite(output_image_path, img)
print("Image with bounding boxes saved as", output_image_path)

# Print bounding boxes for verification
print("Horizontal Bounding Boxes:", bounding_boxes)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Image with bounding boxes saved as output_with_bounding_boxes_easyocr.png
Horizontal Bounding Boxes: [(3069, 197, 222, 42), (1367, 271, 1631, 109), (1580, 359, 856, 110), (2004, 428, 676, 213), (3071, 487, 158, 150), (334, 589, 197, 52), (397, 652, 71, 48), (325, 699, 222, 61), (2418, 679, 434, 116), (364, 758, 144, 65), (2001, 829, 438, 110), (2707, 858, 516, 133), (931, 887, 604, 117), (1991, 917, 566, 110), (910, 1107, 394, 79), (1370, 1103, 386, 75), (1167, 1179, 147, 65), (1076, 1265, 238, 61), (1373, 1243, 769, 85), (1053, 1348, 269, 69), (953, 1453, 370, 79), (1375, 1450, 604, 69), (655, 1564, 211, 61), (859, 1553, 465, 92), (1378, 1555, 1401, 75), (1162, 1724, 1577, 75), (1161, 1791, 1383, 84), (737, 1893, 1776, 76), (2540, 1895, 806, 79), (694, 1956, 776, 75), (736, 2157, 2614, 84), (649, 2218, 2701, 85), (650, 2285, 2713, 81), (646, 2350, 1291, 75), (736, 2547, 2114, 87), (733, 2755, 2262, 79), (3016, 2760, 332, 65), (645, 2815, 2697, 88), (650, 2881, 532, 75), (733, 3018, 11