In [6]:
import os
import ntpath
import numpy as np
import cv2
from matplotlib import pyplot as plt
from pdf2image import convert_from_path

#https://stackoverflow.com/questions/28816046/
#displaying-different-images-with-actual-size-in-matplotlib-subplot
def display(im_path, dpi=80):
    """Show the image stored at ``im_path`` at its native pixel size.

    The figure size in inches is the image size in pixels divided by
    ``dpi``, and the same ``dpi`` is handed to the figure, so one image
    pixel maps to one figure pixel regardless of the backend's default
    resolution.

    Parameters
    ----------
    im_path : str
        Path of the image file; it is read with ``plt.imread``.
    dpi : int, optional
        Resolution used both to size and to render the figure (default 80).

    NOTE(review): inside a notebook this name shadows IPython's own
    ``display`` helper -- confirm that nothing else relies on it.
    """
    im_data = plt.imread(im_path)
    height, width = im_data.shape[:2]

    # Figure size (inches) that fits the image exactly at the chosen dpi.
    figsize = width / float(dpi), height / float(dpi)

    # The dpi must be given to the figure as well; otherwise the figure is
    # rendered at the default resolution and the image gets rescaled.
    fig = plt.figure(figsize=figsize, dpi=dpi)

    # A single axes covering the whole figure, without spines or ticks.
    ax = fig.add_axes([0, 0, 1, 1])
    ax.axis('off')

    # cmap only matters for single-channel images.
    ax.imshow(im_data, cmap='gray')

    plt.show()
    

    
################ GRAY SCALE CONVERSION ###########    
def grayscale(image):
    """Convert ``image`` from BGR to grayscale with ``cv2.cvtColor``."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray



################ Noise Removal ###########
def noise_removal(image, kernel_size=1, blur_size=3):
    """Apply dilate, erode, morphological close and a median blur, in that order.

    Parameters
    ----------
    image : numpy.ndarray
        Image to clean; it is passed straight to the OpenCV morphology calls.
    kernel_size : int, optional
        Side length of the square all-ones structuring element shared by
        the three morphological steps. The default of 1 reproduces the
        original behaviour.
        NOTE(review): a 1x1 element looks like it leaves the image
        unchanged in those steps, so at the default only the median blur
        has an effect -- confirm whether a larger kernel was intended.
    blur_size : int, optional
        Aperture size handed to ``cv2.medianBlur`` (default 3).

    Returns
    -------
    numpy.ndarray
        The filtered image.
    """
    # One kernel serves all three morphological operations.
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    image = cv2.dilate(image, kernel, iterations=1)
    image = cv2.erode(image, kernel, iterations=1)
    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel)
    return cv2.medianBlur(image, blur_size)


########### Dilation and Erosion ##########
#######
#######
#######
########### Making the font THIN ########
def thin_font(image):
    """Invert ``image``, erode it once with a 2x2 kernel, and invert it back.

    Eroding the inverted image shrinks the regions that are dark in the
    input, which is what makes dark text strokes thinner.
    """
    structuring_element = np.ones((2, 2), np.uint8)
    inverted = cv2.bitwise_not(image)
    eroded = cv2.erode(inverted, structuring_element, iterations=1)
    return cv2.bitwise_not(eroded)



########### Making the font THICK ########
def thick_font(image):
    """Invert ``image``, dilate it once with a 2x2 kernel, and invert it back.

    Dilating the inverted image grows the regions that are dark in the
    input, which is what makes dark text strokes thicker.
    """
    structuring_element = np.ones((2, 2), np.uint8)
    inverted = cv2.bitwise_not(image)
    dilated = cv2.dilate(inverted, structuring_element, iterations=1)
    return cv2.bitwise_not(dilated)




########### DETECTING SKEW ########
def getSkewAngle(cvImage, debug_path="temp/boxes.jpg") -> float:
    """Estimate the skew angle of the text in a BGR image.

    The image is converted to grayscale, blurred, and thresholded with
    Otsu's method (inverted), then dilated with a wide 30x5 kernel so that
    characters merge into line/paragraph blobs. The angle of the minimum
    area rectangle around the largest blob is normalised and returned
    negated.

    Parameters
    ----------
    cvImage : numpy.ndarray
        Colour image; it is converted with ``cv2.COLOR_BGR2GRAY``. The
        input is copied first and never modified.
    debug_path : str or None, optional
        Where a copy of the image with every blob's bounding box drawn in
        green is written. Pass ``None`` to skip the debug image. The
        default keeps the original behaviour; the result of
        ``cv2.imwrite`` is not checked.

    Returns
    -------
    float
        The negated, normalised rectangle angle, or ``0.0`` when no
        contour is found (e.g. a blank page).
    """
    # Prep image: copy, convert to gray scale, blur, and threshold.
    newImage = cvImage.copy()
    gray = cv2.cvtColor(newImage, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Larger kernel on the X axis merges characters into single lines;
    # smaller kernel on the Y axis keeps separate text blocks apart.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=2)

    # Indexing with [-2] selects the contour list whether findContours
    # returns (contours, hierarchy) or (image, contours, hierarchy).
    contours = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    print (len(contours))
    if len(contours) == 0:
        # Nothing to measure; report "no skew" instead of raising IndexError.
        return 0.0

    if debug_path:
        for c in contours:
            x, y, w, h = cv2.boundingRect(c)
            cv2.rectangle(newImage, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.imwrite(debug_path, newImage)

    # Largest blob by area, surrounded by its minimum-area box.
    largestContour = max(contours, key=cv2.contourArea)
    minAreaRect = cv2.minAreaRect(largestContour)

    # Normalise the rectangle angle to roughly [-45, 45].
    # The first branch covers the [-90, 0) range of older OpenCV releases;
    # the second covers the (0, 90] range reported by newer ones
    # (NOTE(review): confirm against the installed OpenCV version).
    angle = minAreaRect[-1]
    if angle < -45:
        angle = 90 + angle
    elif angle > 45:
        angle = angle - 90
    return -1.0 * angle




############## Rotate the image around its center ###############
def rotateImage(cvImage, angle: float):
    """Return a copy of ``cvImage`` rotated by ``angle`` about its center.

    The output keeps the input's width and height. Pixels are interpolated
    bicubically and the border is filled by replicating edge pixels.
    ``angle`` is passed unchanged to ``cv2.getRotationMatrix2D``.
    """
    rotated = cvImage.copy()
    height, width = rotated.shape[:2]
    pivot = (width // 2, height // 2)
    rotation = cv2.getRotationMatrix2D(pivot, angle, 1.0)
    rotated = cv2.warpAffine(
        rotated,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
    return rotated



############### Deskew image ###############
def deskew(cvImage):
    """Rotate ``cvImage`` by the negative of the angle from ``getSkewAngle``."""
    skew = getSkewAngle(cvImage)
    correction = -1.0 * skew
    return rotateImage(cvImage, correction)



############### REMOVING BORDERS ###############
def remove_borders(image):
    """Crop ``image`` to the bounding box of its largest external contour.

    Parameters
    ----------
    image : numpy.ndarray
        Image handed directly to ``cv2.findContours``.
        NOTE(review): presumably a single-channel binary image -- confirm
        against the callers.

    Returns
    -------
    numpy.ndarray
        The cropped region, or ``image`` unchanged when no contour is
        found (the original raised IndexError in that case).
    """
    # Indexing with [-2] selects the contour list whether findContours
    # returns (contours, hierarchy) or (image, contours, hierarchy).
    contours = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        return image

    # Iterating in reverse makes max() pick the last of equally large
    # contours, matching the previous "sort ascending, take last" choice.
    largest = max(reversed(list(contours)), key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest)
    return image[y:y + h, x:x + w]

In [12]:

# Convert every PDF under `s` into per-page JPEGs.
#   outputLoc_1/<pdf filename>/  raw page renders
#   outputLoc/<pdf filename>/    grayscale + denoised + Otsu-binarised pages
# A PDF is skipped when its folder already exists under outputLoc_1.
# NOTE(review): a run that fails midway leaves that folder behind, so the
# PDF is skipped on the next run -- delete the folder to reprocess it.
s= '/n01/data/nlp_aeac/mostofa_env/INPUT/PDF_NEW/'
outputLoc = '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/'
outputLoc_1 = '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/'

# Loop-invariant settings, created once instead of once per page.
dpi = 320
# NOTE(review): a 1x1 kernel looks like it makes the final dilation a
# no-op -- confirm whether a larger kernel was intended.
kernel_dilation = np.ones((1, 1), np.uint8)

for path, subdirs, files in os.walk(s):
    for filename in files:
        # Only PDFs can be rasterised; ignore any other file in the tree.
        if not filename.lower().endswith('.pdf'):
            continue

        # splitext copes with any extension length, unlike filename[:-4].
        img_name = os.path.splitext(filename)[0]
        # Join with `path` (not `s`) so PDFs inside sub-folders resolve.
        inputLoc = os.path.join(path, filename)
        ocrLoc = os.path.join(outputLoc, filename)
        ocrLoc_1 = os.path.join(outputLoc_1, filename)
        print(inputLoc)
        print(ocrLoc_1)
        print(ocrLoc)
        if not os.path.exists(ocrLoc_1):
            # exist_ok: the processed folder may survive from an earlier
            # run even though the raw folder was removed.
            os.makedirs(ocrLoc_1, exist_ok=True)
            os.makedirs(ocrLoc, exist_ok=True)
            pages = convert_from_path(inputLoc, dpi=dpi)
            print("New File Transfer........")
            for i, page in enumerate(pages):
                page_name = '{}_{}.jpg'.format(img_name, i)
                new_im = os.path.join(ocrLoc_1, page_name)
                page.save(new_im, 'JPEG')
                print(new_im)

                img = cv2.imread(new_im)
                if img is None:
                    # imread signals failure by returning None, not raising.
                    print("Could not read {}".format(new_im))
                    continue

                gray_image = grayscale(img)
                no_noise = noise_removal(gray_image)
                ret, cv_otsu_binary = cv2.threshold(no_noise, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
                dilation = cv2.dilate(cv_otsu_binary, kernel_dilation, iterations=1)
                cv2.imwrite(os.path.join(ocrLoc, page_name), dilation)
        print(inputLoc)
        print(filename)
        print(ocrLoc)

/n01/data/nlp_aeac/mostofa_env/INPUT/PDF_NEW/IB_Claim # 21453296565.pdf
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/IB_Claim # 21453296565.pdf
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf
New File Transfer........
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_0.jpg
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_1.jpg
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_2.jpg
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_3.jpg
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_4.jpg
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_5.jpg
/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_TEMP/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_6.jpg
/n01/data/nlp_ae

In [16]:
import img2pdf

In [18]:
import img2pdf
import glob
from fpdf import FPDF
from PIL import Image
import os
import ntpath

# Rebuild one PDF per image folder found under InPutLoc. Each folder name
# is reused verbatim as the output file name inside OutPutLoc.
OutPutLoc= '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGE_PROCESSED_PDF_NEW/'
InPutLoc = '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/'


def _page_sort_key(image_path):
    """Order pages by the integer after the last '_' in the file name.

    Files without such a numeric suffix sort after the numbered ones, by
    modification time (the ordering used before for every file).
    """
    stem = os.path.splitext(os.path.basename(image_path))[0]
    suffix = stem.rsplit("_", 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), 0.0)
    return (1, 0, os.path.getmtime(image_path))


os.makedirs(OutPutLoc, exist_ok=True)

for path, subdirs, files in os.walk(InPutLoc):
    for dirs in subdirs:
        print(dirs)
        # Join with `path` so folders below the first level resolve too.
        img_folder = os.path.join(path, str(dirs))
        # Escape the folder part so characters such as '[' in a folder name
        # are not interpreted as glob wildcards.
        img_folder_list = os.path.join(glob.escape(img_folder), "*.jpg")
        # Page index, not mtime, decides the order: mtime is lost when
        # files are copied and can tie for pages written in quick succession.
        imagelist = sorted(glob.glob(img_folder_list), key=_page_sort_key)
        print(imagelist)
        if not imagelist:
            # Nothing to convert; skip rather than failing on empty input.
            continue
        out_pdf_name = os.path.join(OutPutLoc, str(dirs))
        # Convert before opening the file so a conversion error does not
        # leave an empty or truncated PDF behind.
        pdf_bytes = img2pdf.convert(imagelist)
        with open(out_pdf_name, "wb") as f:
            f.write(pdf_bytes)

IB_Claim # 21453296565.pdf
['/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_0.jpg', '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_1.jpg', '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_2.jpg', '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_3.jpg', '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_4.jpg', '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_5.jpg', '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_6.jpg', '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_7.jpg', '/n01/data/nlp_aeac/mostofa_env/INPUT/IMAGES_NEW/IB_Claim # 21453296565.pdf/IB_Claim # 21453296565_8.jpg', '/n01/dat