Custom OCR Project - Image Preprocessing
------------------------------------------
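This notebook preprocesses every image in a raw-image folder: each image is converted to
grayscale, smoothed with a Gaussian blur, and binarized with Otsu thresholding to enhance
text clarity for OCR. The results are saved to a separate folder under the original file names.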

In [2]:
# Import necessary libraries
import cv2  # OpenCV for image processing
import os  # OS module for file handling
import numpy as np  # NumPy for numerical operations
import matplotlib.pyplot as plt  # Matplotlib for image visualization

# Step 1: Define Paths

In [4]:
# Folder containing the raw input images.
# NOTE: machine-specific absolute path; adjust it to the local setup before running.
IMAGE_FOLDER = r"C:\Users\hp\project 10\raw_images"
# Folder that receives the preprocessed (thresholded) images.
PREPROCESSED_FOLDER = r"C:\Users\hp\project 10\preprocessed_images"

# Create the output folder if it is missing. exist_ok=True keeps the cell
# idempotent on re-runs and avoids the check-then-create race of a separate
# os.path.exists() test; it still raises if the path exists but is not a folder.
os.makedirs(PREPROCESSED_FOLDER, exist_ok=True)

# Step 2: Function to Preprocess an Image

In [6]:
def preprocess_image(image_path, save_path):
    '''
    Load an image, binarize it for OCR, save the result, and return it.

    Pipeline: grayscale conversion -> 5x5 Gaussian blur -> Otsu thresholding.
    The processed image is written into ``save_path`` under the same file
    name as the input image.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    save_path : str
        Folder in which the processed image is written.

    Returns
    -------
    The thresholded image returned by ``cv2.threshold``.

    Raises
    ------
    ValueError
        If ``cv2.imread`` cannot load ``image_path`` (missing file, or a file
        that is not a decodable image).
    OSError
        If ``cv2.imwrite`` reports that the processed image was not written.
    '''
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising; fail
    # here with a clear message instead of an opaque error inside cvtColor.
    if image is None:
        raise ValueError(
            f"Could not read image (missing file or unsupported format): {image_path}"
        )

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)  # Smooth noise before thresholding
    # With THRESH_OTSU the threshold value passed in (0) is ignored; the first
    # return value is the threshold Otsu's method selected, which is not needed.
    _, thresh = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Save the processed image under the input's file name.
    processed_image_path = os.path.join(save_path, os.path.basename(image_path))
    # cv2.imwrite returns False when nothing was written; surface that instead
    # of silently reporting success.
    if not cv2.imwrite(processed_image_path, thresh):
        raise OSError(f"Could not write processed image: {processed_image_path}")

    return thresh

# Step 3: Process All Images in the Folder

In [8]:
# File extensions treated as images. cv2.imwrite picks the output format from
# the file extension, so entries without an image extension could not be
# saved by preprocess_image anyway.
IMAGE_EXTENSIONS = (
    ".png", ".jpg", ".jpeg", ".jpe", ".jp2", ".bmp", ".dib",
    ".tif", ".tiff", ".webp", ".pbm", ".pgm", ".ppm", ".pnm",
)

# Collect only regular files with an image extension. os.listdir also returns
# sub-folders and stray files (e.g. Thumbs.db) that cv2.imread cannot load and
# that would abort the whole batch. Sorting makes the processing order
# deterministic, since os.listdir returns entries in arbitrary order.
image_files = sorted(
    name
    for name in os.listdir(IMAGE_FOLDER)
    if os.path.isfile(os.path.join(IMAGE_FOLDER, name))
    and name.lower().endswith(IMAGE_EXTENSIONS)
)
for img_file in image_files:
    img_path = os.path.join(IMAGE_FOLDER, img_file)
    preprocess_image(img_path, PREPROCESSED_FOLDER)

print("Data preprocessing complete. Preprocessed images saved.")

Data preprocessing complete. Preprocessed images saved.
