In [5]:
!pip install easyocr




In [7]:
import os
import numpy as np
import easyocr
from PIL import Image


In [8]:
# Export TESSDATA_PREFIX into this process's environment.
# NOTE(review): the OCR cells visible in this notebook go through EasyOCR and
# none of them reads this variable — confirm it is still needed.
os.environ.update(TESSDATA_PREFIX='/usr/share/tesseract-ocr/4.00/tessdata')


In [9]:
# Function to convert images to JPEG format
def convert_to_jpg(image_path):
    """Save a JPEG copy of ``image_path`` next to the original file.

    The copy has the same base name with a ``.jpg`` extension. Images whose
    pixel mode JPEG cannot store (for example RGBA, LA or palette images,
    which are common for PNG and WebP) are converted to RGB first; without
    that step the save fails with "cannot write mode ... as JPEG".

    Args:
        image_path: Path of the source image.

    Returns:
        The path of the written ``.jpg`` file, or ``None`` if the image could
        not be opened or saved (the error is printed, not raised).
    """
    # Pixel modes the JPEG writer accepts as-is; anything else must be converted.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
    try:
        # The context manager releases the source file as soon as we are done.
        with Image.open(image_path) as image:
            image_jpg_path = os.path.splitext(image_path)[0] + '.jpg'

            # Converting to RGB discards any alpha channel / palette.
            if image.mode in jpeg_modes:
                writable = image
            else:
                writable = image.convert('RGB')

            writable.save(image_jpg_path, 'JPEG')

        return image_jpg_path
    except Exception as e:
        print(f"Error converting image to JPEG: {e}")
        return None

In [10]:
# Function to perform OCR on the scanned document images
# One EasyOCR reader per language, reused across calls so the models are not
# reloaded for every image.
_READER_CACHE = {}


def _get_reader(language):
    """Return the cached ``easyocr.Reader`` for ``language``, creating it on first use."""
    reader = _READER_CACHE.get(language)
    if reader is None:
        reader = easyocr.Reader([language])
        # Only cache after successful construction so a failure can be retried.
        _READER_CACHE[language] = reader
    return reader


def perform_ocr(image_np, language='ar'):
    """Run EasyOCR on an image and return the recognised text as one string.

    Args:
        image_np: Image to read, passed straight to ``Reader.readtext``.
        language: EasyOCR language code used to build the reader.

    Returns:
        The recognised text fragments joined with single spaces (an empty
        string when nothing is detected), or ``None`` if OCR raised an error
        (the error is printed, not raised).
    """
    try:
        result = _get_reader(language).readtext(image_np)

        # Each result entry carries the recognised text at index 1.
        return ' '.join(entry[1] for entry in result)
    except Exception as e:
        print(f"Error performing OCR: {e}")
        return None

In [11]:






# Text clean-up hook applied to the OCR output
def preprocess_text(text):
    """Return the OCR text ready for feature extraction.

    No cleaning is implemented yet: the input is handed back unchanged.
    """
    cleaned = text  # placeholder until real preprocessing steps are added
    return cleaned

# Feature-extraction stub
def extract_features(text):
    """Return the feature dictionary for a document.

    Placeholder implementation: ``text`` is not inspected, and the same
    hard-coded sample values are returned (in a freshly built dict) on
    every call.
    """
    features = {}
    features["Type de Document"] = "Contrat"
    features["Dates"] = "12 février 2022"
    features["Parties Impliquées"] = ["A", "B"]
    features["Termes Clés"] = ["Contrat", "Allégations"]
    features["Éléments d'Action"] = "Dates d'audience"
    return features






In [12]:
# Function to process a single image
def process_image(image_path):
    """Run the full pipeline on one image and print what was found.

    Steps: convert the file to JPEG, load it as a numpy array, run OCR, then
    preprocess the text and extract features. The OCR text and the feature
    dictionary are printed. Nothing is printed when the conversion fails or
    OCR yields no text. Any exception is reported with a printed message
    instead of being raised.
    """
    try:
        jpg_path = convert_to_jpg(image_path)
        if jpg_path is None:
            return

        # Load the converted file and hand its pixels to the OCR step.
        pixels = np.array(Image.open(jpg_path))
        extracted_text = perform_ocr(pixels)
        if not extracted_text:
            return

        print("Extracted Text from", jpg_path)
        print(extracted_text)

        extracted_features = extract_features(preprocess_text(extracted_text))

        print("Extracted Features from", jpg_path)
        print(extracted_features)
    except Exception as e:
        print(f"Error processing image: {e}")

In [13]:
# Function to process all images in the specified folder
def process_images_in_folder(images_folder, extensions=(".png", ".webp")):
    """Run ``process_image`` on every matching file directly inside a folder.

    File names are matched against ``extensions`` case-insensitively, so
    ``scan.PNG`` is picked up as well as ``scan.png``. Files are visited in
    sorted name order so repeated runs produce output in the same order.

    Args:
        images_folder: Directory to scan (not recursive).
        extensions: File-name suffixes to accept; defaults to PNG and WebP.

    Errors while listing the folder are reported with a printed message
    instead of being raised.
    """
    try:
        wanted = tuple(ext.lower() for ext in extensions)

        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for filename in sorted(os.listdir(images_folder)):
            if filename.lower().endswith(wanted):
                process_image(os.path.join(images_folder, filename))
    except Exception as e:
        print(f"Error processing images in folder: {e}")


In [14]:
# Main function
def main(images_folder='/content/drive/MyDrive/projet'):
    """Process every scanned document image found in ``images_folder``.

    Args:
        images_folder: Directory containing the scanned document images.
            Defaults to the project folder used by this notebook, so calling
            ``main()`` with no arguments behaves as before.

    Any exception is reported with a printed message instead of being raised.
    """
    try:
        # Process all images in the folder
        process_images_in_folder(images_folder)
    except Exception as e:
        print(f"Error in main function: {e}")


if __name__ == "__main__":
    main()


Extracted Text from /content/drive/MyDrive/projet/image 2.jpg
دد النبابة ع 2008/200. الجمه ورية التونسية بطاق وزارة العدل وحقوق الإنسان اعكمة الابتدانية بالمهديسة تضى القانون الإبتدائية بالمهدي وكيل الجمهورية لسدى المحكم عملا بالفصول 59و60و115من قانون المرافعات الجنائية : متوظفى أوأعوان القوة العامة بأن (2) ي ودعسوا نأمر ونأذن قسومة بن عمار بن حسين محمد الصالح ن الجناى الإمتثال المولود في: 1957/2/16 المهدية القاطن: اولاد مولاهم الزعيرات مخالفة قرار المراقبة الادارية المتهم ق ج ان ارتكاب ما ذكر ينطق عليه القانون طبق الفصل التشويش بما تقتضيه هاته البطاقة كما نطلب من سائر رؤساء القوة العامة أن يحققوا إجراء ناحية بالمهدية في2008/5 كي للمحاكم السج  سانر للقانود السواسى 150من حبث العمل السوا راسى حرر
Extracted Features from /content/drive/MyDrive/projet/image 2.jpg
{'Type de Document': 'Contrat', 'Dates': '12 février 2022', 'Parties Impliquées': ['A', 'B'], 'Termes Clés': ['Contrat', 'Allégations'], "Éléments d'Action": "Dates d'audience"}
Extracted Text from /content/drive/MyDrive/projet/IM