In [None]:
import os
import cv2
from pdf2image import convert_from_path
import numpy as np
import shutil
import gc
from myFunctions import *
import random


# Convert PDF to JPG

In [None]:
# Render the first page of every PDF found in `folder_in`, shrink it, convert it
# to grayscale and save it as a JPG in `folder_out`.
folder_in = './PDF/'
folder_out = './BBox_Label_Tool/Images/001/'
dpi = 400        # rendering resolution handed to pdf2image
imgScale = 0.2   # the rendered page is resized to 20% of its width and height

# cv2.imwrite does not create missing folders, it only reports a failure.
os.makedirs(folder_out, exist_ok=True)

for input_pdf in os.listdir(folder_in):
    name, extension = os.path.splitext(input_pdf)
    if extension.lower() != '.pdf':
        # Hidden files, sub-folders, ... would make pdf2image fail.
        continue

    # Only the first page is used, so only that page is rendered.
    pages = convert_from_path(os.path.join(folder_in, input_pdf), dpi,
                              first_page=1, last_page=1)
    if not pages:
        print("No page could be rendered from " + input_pdf)
        continue

    image = np.array(pages[0])
    newX, newY = image.shape[1]*imgScale, image.shape[0]*imgScale
    resized = cv2.resize(image, (int(newX), int(newY)))
    # pdf2image returns PIL images; the array is treated as RGB here.
    small = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)

    if not cv2.imwrite(os.path.join(folder_out, name + '.jpg'), small):
        print("Could not write the image for " + input_pdf)

    # Release the large page buffers before rendering the next PDF. Doing it
    # inside the loop also avoids a NameError when the folder has no PDF.
    del pages, image, resized, small

gc.collect()

# Bounding Boxes Labelling

If the images have not been labelled yet, draw the bounding boxes now: navigate to the `BBox_Label_Tool` folder and run `main_jpg.py`.

# Move to the "Data" folder

In [None]:
# Copy the labelled images and their box files from the labelling tool into
# the training data folders.
folder_images = './BBox_Label_Tool/Images/001/'
folder_labels = './BBox_Label_Tool/Labels/001/'
folder_images_out = "./data/train/images/"
folder_labels_out = "./data/train/boxes/"

# This document is deliberately left out of the training data.
# NOTE(review): the reason is not visible in this notebook - confirm it is still wanted.
excluded_id = "bible_1"

for folder_src, folder_dst in ((folder_images, folder_images_out),
                               (folder_labels, folder_labels_out)):
    # shutil.copyfile fails if the destination folder does not exist.
    os.makedirs(folder_dst, exist_ok=True)
    for file in os.listdir(folder_src):
        source_path = os.path.join(folder_src, file)
        if not os.path.isfile(source_path):
            # copyfile cannot copy directories.
            continue
        # Compare the name without its extension, whatever the extension length.
        if os.path.splitext(file)[0] != excluded_id:
            shutil.copyfile(source_path, os.path.join(folder_dst, file))


# Load everything

In [None]:
# Folders holding the training images, their bounding-box files and their
# transcription files (read by the loading cell below).
image_folder, label_folder, transcription_folder = (
    './data/train/images/',
    './data/train/boxes/',
    './data/train/transcriptions/',
)

In [None]:
# Build `data`: one [image, boxes, words, ID] entry per label file, with every
# image padded to the largest width/height found in the dataset.

# --- 1. Derive the image / label / transcription paths from the label files.
file_list = []
label_list = []
transcription_list = []
ID_list = []
# Sorted so that the order of `data` does not depend on the file system.
for file in sorted(os.listdir(label_folder)):
    ID, extension = os.path.splitext(file)
    if extension.lower() != '.txt':
        # Anything that is not a label file would produce a bogus ID.
        continue
    ID_list.append(ID)
    file_list.append(image_folder + ID + ".jpg")
    label_list.append(label_folder + ID + ".txt")
    transcription_list.append(transcription_folder + ID + ".txt")

# --- 2. Read the boxes. Each data line is "x1 y1 x2 y2 [label]" and is
# converted to [x, y, width, height].
rect_list = []
for fname in label_list:
    rectangles = []
    if os.path.isfile(fname):
        with open(fname) as f:
            content = f.readlines()

        # The first line is skipped, as in the label files written later in this
        # notebook, where it holds the number of boxes.
        for line in content[1:]:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. at the end of the file) carry no box.
                continue
            # Only the four coordinates are used; a trailing label is ignored.
            x, y, x2, y2 = (int(value) for value in fields[:4])
            rectangles.append([x, y, x2 - x, y2 - y])

    # Always append, so that rect_list stays aligned with the other lists.
    rect_list.append(rectangles)

# --- 3. Read the transcriptions: one word per line. A missing transcription
# file is created empty so that it can be filled in later.
os.makedirs(transcription_folder, exist_ok=True)
word_list = []
for fname in transcription_list:
    words = []
    if os.path.isfile(fname):
        with open(fname) as f:
            words = [line.replace('\n', '') for line in f]
    else:
        open(fname, "a").close()

    word_list.append(words)

# --- 4. Load the images as grayscale and track the largest dimensions.
max_h = 0
max_w = 0

data = []
for path, rects, words, ID in zip(file_list, rect_list, word_list, ID_list):
    img = cv2.imread(path, 0)
    if img is None:
        # cv2.imread does not raise on failure, it returns None.
        raise FileNotFoundError("The image " + path + " could not be read.")
    data.append([img, rects, words, ID])
    height, width = img.shape
    if height > max_h:
        max_h = height
    if width > max_w:
        max_w = width

# --- 5. Pad every image (and its boxes) to the common size.
new_data = []
for image, rect, word, ID in data:
    image, new_rect = pad_image(image, rect, max_w, max_h)
    if len(word) == 0:
        print("The file " + str(ID) + " does not have any words transcription.")
    new_data.append([image, new_rect, word, ID])

data = new_data

# Free the intermediate lists; only `data`, `max_w` and `max_h` are kept.
del file_list
del label_list
del transcription_list
del ID_list
del rect_list
del word_list
del new_data
gc.collect()

# Check if number of words == number of boxes

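At this point, you need to write the transcription files: one `.txt` file per image, in the folder `./data/train/transcriptions/`, with one word per line, in the same order as the bounding boxes of that image. The loading cell above creates an empty file for every image that does not have one yet.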

In [None]:
# For every page whose transcription has exactly one word per box, crop each
# box out of the page and pair it with its word. Pages where the two counts
# differ are reported and displayed with their boxes instead.
data_words = []
for entry in data:
    image, rects, words = entry[0], entry[1], entry[2]

    if len(words) == len(rects):
        for (x, y, w, h), word in zip(rects, words):
            data_words.append([image[y:(y+h), x:(x+w)], word])
        continue

    print("Not the same number of rectangles and words!")
    print("Words in txt file: " + str(len(words)))
    print("Boxes in txt file : " + str(len(rects)))
    draw_rects(image, rects)
            

# Prepare the data (and squeeze the images)

In [None]:
# Choose the target shape (desired_w, desired_h) used when padding the
# augmented images. With `squeeze` enabled every page is first shrunk.
squeeze = False

if not squeeze:
    # Keep the pages untouched: the target shape is the largest height and
    # width found in the dataset.
    max_w = 0
    max_h = 0
    for image, rect, words, ID in data:
        h, w = image.shape
        max_h = max(max_h, h)
        max_w = max(max_w, w)
    desired_w = max_w
    desired_h = max_h
else:
    new_data = []
    for image, rect, words, ID in data:
        new_image, new_rect = squeeze_image(image, rect, 0.8, 0.8)
        h, w = new_image.shape
        # NOTE(review): the target shape is the squeezed shape of the last page,
        # taken before the resize below - confirm this is intended.
        desired_h, desired_w = h, w

        # Largest multiples of 32 that fit in the squeezed page, and the
        # offsets that centre a window of that size.
        new_h = 32 * (h // 32)
        new_w = 32 * (w // 32)
        y_min = (h - new_h) // 2
        x_min = (w - new_w) // 2

        # presumably crops that centred window - see resize_image in myFunctions
        new_image2, new_rect2 = resize_image(new_image, new_rect, x_min, y_min,
                                             new_w, new_h, with_border_rect = True)
        new_data.append([new_image2, new_rect2, words, ID])
    data = new_data
    del new_data
    gc.collect()

print("The images will have the shape " + str((desired_w,desired_h)))

# Create the augmented dataset (It may compute for a long time)

In [None]:
# Build `augmented_dataset`: for each page, every combination of blur pass and
# brightness shift is squeezed horizontally, then vertically, and padded back
# to (desired_w, desired_h). Entries are [image, boxes, words, ID].
print("The size of the initial dataset is " + str(len(data)))

# Only the first pages are augmented.
# NOTE(review): this limit looks like a leftover from a quick run - raise it
# (or use len(data)) to augment the whole dataset.
n_images_to_augment = 5

# Squeeze factors applied to one dimension at a time: 0.8, 0.88 and 0.96.
squeeze_coefs = [percent / 100 for percent in range(80, 100, 8)]

augmented_dataset = []

for image, rects, words, ID in data[:n_images_to_augment]:
    for blur_coef in range(2):
        # NOTE(review): the only non-zero value is 1, and a 1x1 box blur should
        # leave the image unchanged, so both passes give the same pixels.
        if blur_coef == 0:
            blur = image
        else:
            blur = cv2.blur(image, (blur_coef, blur_coef))

        # (A rotation step based on rotate_image used to live here; it was
        # disabled in the original notebook.)

        for bright_coef in range(-50, 50, 15):
            # Vectorised brightness shift, clamped to [0, 255]. The sum is done
            # in a wider integer type: adding a negative Python int to uint8
            # pixels one by one is very slow and fails under NumPy 2 promotion.
            bright_image = np.clip(blur.astype(np.int32) + bright_coef,
                                   0, 255).astype(blur.dtype)

            # Horizontal squeezes first, then vertical ones.
            for width_squeeze_coef in squeeze_coefs:
                squeeze_img, squeeze_rects = squeeze_image(bright_image, rects, width_squeeze_coef, 1)
                padded_image, padded_rects = pad_image(squeeze_img, squeeze_rects, desired_w, desired_h)
                augmented_dataset.append([padded_image, padded_rects, words, ID])
            for height_squeeze_coef in squeeze_coefs:
                squeeze_img, squeeze_rects = squeeze_image(bright_image, rects, 1, height_squeeze_coef)
                padded_image, padded_rects = pad_image(squeeze_img, squeeze_rects, desired_w, desired_h)
                augmented_dataset.append([padded_image, padded_rects, words, ID])

# The original pages are not needed once the augmented copies exist.
del data
gc.collect()
print("The size of the augmented dataset is " + str(len(augmented_dataset)))

# Show some random images to check if everything went well

In [None]:
# Display 10 randomly chosen augmented images with their boxes.
if len(augmented_dataset) == 0:
    print("The augmented dataset is empty, there is nothing to show.")
else:
    for i in range(10):
        # randrange excludes its upper bound. random.randint(0, n) can return n
        # itself, which is one past the last valid index.
        idx = random.randrange(len(augmented_dataset))
        draw_rects(augmented_dataset[idx][0], augmented_dataset[idx][1])

# Save into a folder

In [None]:
# Write every augmented image as "<ID>_<index>.png" together with a
# "<ID>_<index>.txt" file listing its boxes.
folder = './data/train/augmented_images/'

# Create the folder (and any missing parent) if it does not exist yet.
os.makedirs(folder, exist_ok=True)

# Start from an empty folder: remove the files left by a previous run.
for the_file in os.listdir(folder):
    file_path = os.path.join(folder, the_file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except OSError as e:
        # Best effort: report the file that could not be removed and go on.
        print(e)

for i, sample in enumerate(augmented_dataset):
    image = sample[0]
    rects = sample[1]
    ID = sample[3]
    base_path = folder + ID + "_" + str(i)

    # 'w' so that a file that survived the clean-up above is overwritten
    # rather than appended to.
    with open(base_path + '.txt', 'w') as label_file:
        # First line: number of boxes. Then one line per box with its four
        # values as held in memory, followed by the "word" label.
        label_file.write(str(len(rects)) + '\n')
        for rect in rects:
            label_file.write(str(rect[0]) + ' ' + str(rect[1]) + ' ' + str(rect[2]) +
                             ' ' + str(rect[3]) + ' word\n')
    cv2.imwrite(base_path + '.png', image)

# Extract the individual words

In [None]:
# Crop every box of every augmented image and pair the crop with its word.
# Images whose number of words differs from their number of boxes are reported
# and displayed instead of being cropped.
data_words = []
for sample in augmented_dataset:
    image, rects, words = sample[0], sample[1], sample[2]

    if len(words) == len(rects):
        for (x, y, w, h), word in zip(rects, words):
            data_words.append([image[y:(y+h), x:(x+w)], word])
        continue

    print("Not the same number of rectangles and words!")
    print(len(words))
    print(len(rects))
    draw_rects(image, rects)

print("There are " + str(len(data_words)) + " extracted words.")

# The full images are no longer needed once the words are cropped.
del augmented_dataset
gc.collect()

# Pad the words

In [None]:
# Pad every word image to one common size: at least 256 x 75 pixels
# (width x height), larger if some word image exceeds that.
max_width = 256
max_height = 75
for image, label in data_words:
    h, w = image.shape
    max_height = max(max_height, h)
    max_width = max(max_width, w)
    if len(label) == 0:
        # Show the word images that have an empty transcription.
        draw_rects(image, [])

print("The words' images will have a height of " +str(max_height)+ 
      " pixels and a width of " +str(max_width)+ " pixels.") 

data_words_padded = []
for image, label in data_words:
    # Pad to the height computed above. The page-level `max_h` from the earlier
    # cells must not be used here.
    new_image, _ = pad_image(image, [], max_width, max_height)
    data_words_padded.append([new_image, label])

del data_words
gc.collect()

# Save them to a folder

In [None]:
def uniqueid():
    """Yield consecutive integers, starting from a random 32-bit value."""
    seed = random.getrandbits(32)
    while True:
        yield seed
        seed += 1


# Write every padded word image as "<id>.png" and list "<id> <word>" pairs in
# labels.txt, all in the same folder.
folder = './data/train/words_images/'

# Create the folder (and any missing parent) if it does not exist yet.
os.makedirs(folder, exist_ok=True)

# Start from an empty folder: remove the files left by a previous run.
for the_file in os.listdir(folder):
    file_path = os.path.join(folder, the_file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except OSError as e:
        # Best effort: report the file that could not be removed and go on.
        print(e)

unique_sequence = uniqueid()

# 'w' so that a labels file that survived the clean-up above is overwritten
# rather than appended to.
with open(folder + "labels" + '.txt', 'w') as labels_file:
    for image, label in data_words_padded:
        # Not named `id`, which would shadow the builtin for the whole session.
        word_id = next(unique_sequence)
        cv2.imwrite(folder + str(word_id) + '.png', image)
        labels_file.write(str(word_id) + ' ' + str(label) + "\n")