### import the libraries

In [105]:
import cv2
import numpy as np

### Create the image processing pipeline

In [106]:
def segment_the_line(thresh):
    '''
    Find the vertical extent of the text in an image (rows only).

    A row counts as text when at least one of its values floor-divides
    by 255 to 0 (for 0-255 pixel data: any pixel that is not pure white).

    thresh : array-like image indexed by row first
             (assumed to be a 0-255 numpy image - confirm with callers)

    returns: (start, stop) row indices suitable for thresh[start:stop].
             start is the first text row minus 3 (clamped at 0), stop is
             the last text row plus 4 (clamped at the number of rows).

    raises : ValueError when no row contains text. The previous version
             returned out-of-range sentinel indices in that case, which
             produced an empty crop further down the pipeline.
    '''
    # For 0-255 data this maps pure white (255) to 1 and every darker value to 0
    normalized = thresh // 255.0

    text_rows = [row_idx for row_idx, row in enumerate(normalized) if row.min() == 0]
    if not text_rows:
        raise ValueError('segment_the_line: no text rows found in the image')

    first_row, last_row = text_rows[0], text_rows[-1]

    # Keep a small margin around the text while staying inside the image
    return max(first_row - 3, 0), min(last_row + 4, len(thresh))

In [107]:
def segmentTheLine(thresh):
    '''
    Crop a black and white image down to the region that contains text.

    The rows are trimmed first using segment_the_line. The image is then
    turned 90 degrees clockwise, so the original columns become rows and
    the same row-based search trims them too. Finally the image is turned
    back to its original orientation.

    thresh : image array accepted by cv2.rotate

    returns: the cropped image, in the same orientation as the input
    '''
    top, bottom = segment_the_line(thresh)
    thresh = thresh[top:bottom]

    # After a clockwise quarter turn, row k of the image is column k of the original
    thresh = cv2.rotate(thresh, cv2.ROTATE_90_CLOCKWISE)
    left, right = segment_the_line(thresh)
    thresh = thresh[left:right]

    # A single counter-clockwise quarter turn undoes the clockwise one;
    # it gives the same image as three further clockwise turns.
    return cv2.rotate(thresh, cv2.ROTATE_90_COUNTERCLOCKWISE)

In [108]:
# Read image
image_path = './OCR_Text_Dataset/OCR_Text/4.jpg'
image = cv2.imread(image_path)

# cv2.imread returns None instead of raising when the file is missing or unreadable
if image is None:
    raise FileNotFoundError('could not read image: ' + image_path)

# Resize to a fixed working size (width 10000, height 1000) so the kernels below
# always act on the same resolution
image = cv2.resize(image, (10000, 1000))

# Convert the image to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply thresholding: pixels brighter than 85 become 255, the rest become 0.
# The result is already strictly 0/255, so no second binarisation pass is needed here.
_, thresh = cv2.threshold(gray, 85, 255, cv2.THRESH_BINARY)

# Apply image enhancement: blur, then sharpen with a 3x3 kernel whose weights sum to 1
smoothed_image = cv2.GaussianBlur(thresh, (7, 7), 0)
sharpen_kernel = np.array([[-1,-1,-1], [-1, 9,-1],[-1,-1,-1]])
sharpened = cv2.filter2D(smoothed_image, -1, sharpen_kernel)

# Perform morphological operations to clean up the image.
# Erosion keeps the local minimum, so dark strokes grow on the white background.
kernel = np.ones((9, 3), np.uint8)
img_erosion = cv2.erode(sharpened, kernel, iterations=2)

# Shrink to the final size, crop to the text, then binarise again because
# resizing can introduce intermediate grey values
result = cv2.resize(img_erosion, (1000, 100))
result = segmentTheLine(result)
_, result = cv2.threshold(result, 0, 255, cv2.THRESH_BINARY)

# Show the result in a native OpenCV window; waitKey(0) blocks until a key is pressed
cv2.imshow('result image', result)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [109]:
# Sanity check: list the distinct pixel values present in `result` after the final threshold
np.unique(result)

array([  0, 255], dtype=uint8)

### Preprocess the images and save them into separate folders for model training

In [110]:
import os
import shutil
from tqdm import tqdm

directory = './OCR_Text_Dataset/OCR_Text/'
images_out_dir = './data/images/'
# Destination directory for the label files that belong to successfully processed images
destination_directory = './data/labels'

# Scan the dataset folder once and derive every listing from the same snapshot
dir_entries = list(os.scandir(directory))
data_files = [f.name for f in dir_entries]
images_files = [f.name for f in dir_entries if f.name.endswith('.jpg') and f.is_file()]
text_files = [f.name for f in dir_entries if f.name.endswith('.txt') and f.is_file()]

# Create the output folders up front: cv2.imwrite only returns False when the folder
# is missing, and shutil.copy would write a plain file called "labels" instead
os.makedirs(images_out_dir, exist_ok=True)
os.makedirs(destination_directory, exist_ok=True)

error_files = []      # label files whose image failed during preprocessing or saving
not_found_file = []   # label files whose image is missing or could not be decoded


def _preprocess_line_image(image):
    '''
    Turn a BGR image of a text line into a cropped black and white image.

    image  : BGR image as returned by cv2.imread

    returns: image containing only the values 0 and 255, cropped to the text
             by segmentTheLine

    raises : whatever OpenCV or segmentTheLine raises when the image cannot
             be processed (for example when no text is found)
    '''
    # Fixed working size (width 10000, height 1000) so the kernels below always
    # act on the same resolution
    image = cv2.resize(image, (10000, 1000))

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Pixels brighter than 85 become 255, the rest become 0
    _, thresh = cv2.threshold(gray, 85, 255, cv2.THRESH_BINARY)

    # Image enhancement: blur, then sharpen with a 3x3 kernel whose weights sum to 1
    smoothed_image = cv2.GaussianBlur(thresh, (7, 7), 0)
    sharpen_kernel = np.array([[-1,-1,-1], [-1, 9,-1],[-1,-1,-1]])
    sharpened = cv2.filter2D(smoothed_image, -1, sharpen_kernel)

    # Erosion keeps the local minimum, so dark strokes grow on the white background
    erode_kernel = np.ones((9, 3), np.uint8)
    img_erosion = cv2.erode(sharpened, erode_kernel, iterations=2)

    # Shrink to the final size, crop to the text, then binarise again because
    # resizing can introduce intermediate grey values
    result = cv2.resize(img_erosion, (1000, 100))
    result = segmentTheLine(result)
    _, result = cv2.threshold(result, 0, 255, cv2.THRESH_BINARY)
    return result


for file in tqdm(text_files):
    # splitext keeps the full stem even when the name contains additional dots
    file_name = os.path.splitext(file)[0]

    # cv2.imread signals a missing or unreadable file by returning None, not by raising
    image = cv2.imread(directory + file_name + '.jpg')
    if image is None:
        not_found_file.append(file)
        continue

    try:
        result = _preprocess_line_image(image)
        # imwrite reports failure through its return value only
        if not cv2.imwrite(images_out_dir + file_name + '.jpg', result):
            raise IOError('could not write image for ' + file)
    except Exception:
        # Exception (not a bare except) so that Ctrl-C can still stop the long run
        error_files.append(file)
        continue

    # Copy the label file only when its image was processed and saved
    shutil.copy(directory + file, destination_directory)
    

    

100%|████████████████████████████████████████████████████████████████████████████| 19975/19975 [22:44<00:00, 14.63it/s]


#### These are the text files that do not have a matching image

In [115]:
# Label files recorded by the preprocessing loop because loading their image failed
not_found_file

[]

#### These are the label files whose images failed preprocessing (e.g. images that do not have any text in them)

In [116]:
# Number of label files whose image raised an error during preprocessing
len(error_files)

347

In [117]:
# Names of the label files whose image raised an error during preprocessing
error_files

['10160.txt',
 '10183.txt',
 '10248.txt',
 '10300.txt',
 '10303.txt',
 '10335.txt',
 '10481.txt',
 '1071.txt',
 '10766.txt',
 '10780.txt',
 '10871.txt',
 '10897.txt',
 '11044.txt',
 '11066.txt',
 '11098.txt',
 '11116.txt',
 '11129.txt',
 '1122.txt',
 '11318.txt',
 '11321.txt',
 '11383.txt',
 '11414.txt',
 '11497.txt',
 '11540.txt',
 '1159.txt',
 '1163.txt',
 '11631.txt',
 '11672.txt',
 '11677.txt',
 '11744.txt',
 '11828.txt',
 '11877.txt',
 '11878.txt',
 '12017.txt',
 '12051.txt',
 '12152.txt',
 '12259.txt',
 '1232.txt',
 '12338.txt',
 '12390.txt',
 '12435.txt',
 '12487.txt',
 '12537.txt',
 '12798.txt',
 '12812.txt',
 '1285.txt',
 '12913.txt',
 '13092.txt',
 '13150.txt',
 '13159.txt',
 '1319.txt',
 '13210.txt',
 '13241.txt',
 '13302.txt',
 '13314.txt',
 '13325.txt',
 '13343.txt',
 '13344.txt',
 '13392.txt',
 '13416.txt',
 '13443.txt',
 '13465.txt',
 '13488.txt',
 '13593.txt',
 '13606.txt',
 '1368.txt',
 '13715.txt',
 '139.txt',
 '1396.txt',
 '14040.txt',
 '14109.txt',
 '1421.txt',
 '14

In [118]:
# Re-scan the output folders and compare how many images and labels were written
directory = './data/'

images_files = []
for entry in os.scandir(directory + 'images'):
    if entry.name.endswith('.jpg') and entry.is_file():
        images_files.append(entry.name)

text_files = []
for entry in os.scandir(directory + 'labels'):
    if entry.name.endswith('.txt') and entry.is_file():
        text_files.append(entry.name)

# (number of images, number of labels) - the two counts are expected to match
(len(images_files), len(text_files))

(19628, 19628)