In [32]:
import platform
from tempfile import TemporaryDirectory
from pathlib import Path
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import pdfplumber
import time
import os
import json

In [47]:
def pdfplumber_jud_pdf(filename):
    '''
    Report whether a PDF has text that can be extracted without OCR.

    Parameters
    ----------
    filename : path of the PDF to inspect.

    Returns
    -------
    bool
        True as soon as any page yields at least one word from
        ``extract_words()``; False when no page does (e.g. the PDF holds
        only scanned images and needs OCR).
    '''
    # Open through the context manager so the PDF handle is released even on
    # the early exit taken when the first page with words is found.
    with pdfplumber.open(filename) as doc:
        return any(page.extract_words() for page in doc.pages)


def write_json(target_path, target_file, data):
    '''
    Serialize ``data`` as JSON into ``target_path/target_file``.

    Parameters
    ----------
    target_path : directory to write into; created (with parents) if it is
        missing. An empty string means the current working directory.
    target_file : name of the JSON file inside ``target_path``.
    data : any object ``json.dump`` can serialize.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened.
    '''
    # exist_ok=True replaces the exists()-then-makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    # The guard skips os.makedirs(''), which would raise FileNotFoundError.
    if target_path:
        os.makedirs(target_path, exist_ok=True)
    with open(os.path.join(target_path, target_file), 'w', encoding='utf-8') as f:
        json.dump(data, f)


def read_pdf_plum(file_path, target_path):
    '''
    Extract the text layer of a PDF and write one JSON file per page.

    Each page produces ``<pdf file name>_<page number>.json`` inside
    ``target_path`` holding the keys ``Article`` (the PDF path as given),
    ``Page`` (1-based page number as a string) and ``Text``.

    Parameters
    ----------
    file_path : path of the PDF to read.
    target_path : directory the per-page JSON files are written to.

    Returns
    -------
    list of str
        The extracted text of every page, in page order.
    '''
    # Name outputs after the PDF's base name only: joining target_path with a
    # full (absolute) file_path would make os.path.join discard target_path
    # and drop the JSON next to the source PDF instead.
    article_name = os.path.basename(file_path)
    texts = []
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Fall back to '' in case extract_text() yields None for a page
            # without text (a page-level miss must not abort the document).
            text = page.extract_text() or ''
            # Re-join words that were hyphenated across a line break.
            text = text.replace("-\n", "")
            page_str = str(page_num)
            dictionary = {
                'Article': file_path,
                'Page': page_str,
                'Text': text
            }
            target_file = article_name + '_' + page_str + '.json'
            write_json(target_path, target_file, dictionary)
            texts.append(text)
    return texts


def OCR_singlefile(file, target_path, dpi=500):
    '''
    Extract text from a scanned PDF using OCR and write one JSON per page.

    Every page is rendered to a JPEG in a temporary directory, run through
    Tesseract, and stored as ``<pdf file name>_<page number>.json`` inside
    ``target_path`` with the keys ``Article`` (the PDF path as given),
    ``Page`` (1-based page number as a string) and ``Text``.

    Parameters
    ----------
    file : path of the PDF to OCR.
    target_path : directory the per-page JSON files are written to.
    dpi : rendering resolution handed to ``convert_from_path``.

    Returns
    -------
    list of str
        The recognized text of every page, in page order.
    '''
    # Name outputs after the PDF rather than the temporary image: the image
    # path is absolute (so os.path.join would ignore target_path) and lives
    # in a directory that no longer exists once this function returns.
    article_name = os.path.basename(file)
    texts = []

    # Rendering and OCR both happen inside the `with` block, because the
    # page images are deleted as soon as the temporary directory is closed.
    with TemporaryDirectory() as tempdir:
        pdf_pages = convert_from_path(file, dpi)
        for page_num, page in enumerate(pdf_pages, start=1):
            # Part 1: convert the PDF page to an image file.
            # os.path.join keeps the path valid on every platform.
            image_file = os.path.join(tempdir, f"page_{page_num:03}.jpg")
            page.save(image_file, "JPEG")

            # Part 2: recognize text from the image using OCR.
            # The context manager closes the image file handle afterwards.
            with Image.open(image_file) as image:
                text = str(pytesseract.image_to_string(image))
            # Re-join words that were hyphenated across a line break.
            text = text.replace("-\n", "")

            page_str = str(page_num)
            dictionary = {
                'Article': file,
                'Page': page_str,
                'Text': text
            }
            target_file = article_name + '_' + page_str + '.json'
            write_json(target_path, target_file, dictionary)
            texts.append(text)

    return texts



def text_extraction(in_path, target_path):
    '''
    Extract text from every PDF directly inside ``in_path``.

    If a file has no extractable text (it contains scanned pictures), it is
    processed with OCR. Otherwise its text is extracted directly.

    Parameters
    ----------
    in_path : directory that is scanned (non-recursively) for PDF files.
    target_path : directory handed to the extractors for their JSON output.

    Returns
    -------
    dict
        Maps each processed PDF file name to whatever the extractor used for
        that file returned.
    '''
    file_texts = {}
    # Sorted so files are processed in a reproducible order between runs.
    for file in sorted(os.listdir(in_path)):
        # Case-insensitive so files named e.g. "REPORT.PDF" are not skipped.
        if not file.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(in_path, file)
        if pdfplumber_jud_pdf(file_path):
            texts = read_pdf_plum(file_path, target_path)
        else:
            # Pass the full path: the bare file name only resolves when the
            # working directory happens to be in_path.
            texts = OCR_singlefile(file_path, target_path)
        file_texts[file] = texts
    return file_texts

In [48]:
# Run the extraction over the whole input folder and time it.
start = time.time()
# NOTE(review): absolute, machine-specific paths; the notebook will not run
# elsewhere until these point at a local copy of the data.
in_path = '/Users/macbook/Desktop/DSI Capstone/Input PDFs'
# The output folder sits inside the input folder; text_extraction only picks
# up entries ending in ".pdf", so the "Jsons" directory itself is skipped.
out_path = '/Users/macbook/Desktop/DSI Capstone/Input PDFs/Jsons'
text_extraction(in_path, out_path)
end = time.time()
# Elapsed wall-clock time of the run, in seconds.
print(end - start)

20.721532821655273
