In [39]:
import platform
from tempfile import TemporaryDirectory
from pathlib import Path
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import pdfplumber
import time
import os
import json

In [40]:
def pdfplumber_jud_pdf(filename):
    '''
    Decide whether the text of a pdf can be extracted directly.

    filename: path of the pdf file to inspect.

    Returns False as soon as one page yields no words at all (e.g. a
    scanned picture) or contains a "word" longer than 50 characters (a
    sign that the text could not be split into words). Returns True when
    every page passes both checks.
    '''
    # Open through a context manager so the file is closed on every exit
    # path, including the early returns below.
    with pdfplumber.open(filename) as doc:
        for page in doc.pages:
            extraction = page.extract_words()
            # If the text is not readable, return False
            if not extraction:
                return False
            # Or if the text can not be split into words, return False
            longest_word = max(len(word['text']) for word in extraction)
            if longest_word > 50:
                return False
    return True


def write_json(target_path, target_file, data):
    '''
    Output extracted results as a json file.

    target_path: directory to write into; it is created (including any
        missing parent directories) when it does not exist yet.
    target_file: name of the json file inside target_path.
    data: json-serialisable object to dump.

    Raises OSError when the directory or the file cannot be created.
    '''
    # exist_ok=True replaces the separate "exists" check, which could race
    # with another process creating the same directory in between.
    os.makedirs(target_path, exist_ok=True)
    file_path = os.path.join(target_path, target_file)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f)

def read_pdf_plum(file, file_path, target_path):
    '''
    Extract text directly if it is detectable.

    file: file name of the pdf; stored under 'Article' and used (minus its
        last four characters) as prefix of the output file names.
    file_path: path pdfplumber opens to read the pdf.
    target_path: directory the json files are written to.

    Writes one json file per page, named "<prefix>_<page number>.json",
    with page numbers starting at 1. Returns None.
    '''
    prefix = file[:-4]  # drop the 4-character extension such as ".pdf"
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # extract_text() can return None for a page without characters
            # (older pdfplumber releases do); fall back to an empty string
            # instead of failing on None.replace.
            text = (page.extract_text() or "").replace("\n", " ")
            page_str = str(page_num)
            dictionary = {
                'Article': file,
                'Page': page_str,
                'Text': text
            }
            target_file = prefix + '_' + page_str + '.json'
            write_json(target_path, target_file, dictionary)


def OCR_singlefile(file, file_path, target_path):
    '''
    Extract text from scanned pdfs using OCR.

    file: file name of the pdf; stored under 'Article' and used (minus its
        last four characters) as prefix of the output file names.
    file_path: path of the pdf that is converted to images.
    target_path: directory the json files are written to.

    Writes one json file per page, named "<prefix>_<page number>.json",
    with page numbers starting at 1. Returns None.
    '''
    prefix = file[:-4]  # drop the 4-character extension such as ".pdf"

    # Part 1: converting the PDF to images (second argument is the dpi)
    pdf_pages = convert_from_path(file_path, 500)

    # The images must be read while the temporary directory still exists,
    # so saving and OCR both happen inside the "with" block.
    with TemporaryDirectory() as tempdir:
        for page_num, page in enumerate(pdf_pages, start=1):
            # os.path.join keeps the image inside tempdir on every OS; a
            # hard-coded backslash is just part of the file name on POSIX
            # and would place the image next to tempdir, never cleaned up.
            image_file = os.path.join(tempdir, f"page_{page_num:03}.jpg")
            page.save(image_file, "JPEG")

            # Part 2: recognizing text from the image using OCR
            with Image.open(image_file) as image:
                text = str(pytesseract.image_to_string(image))
            text = text.replace("\n", " ")
            page_str = str(page_num)
            dictionary = {
                'Article': file,
                'Page': page_str,
                'Text': text
            }
            target_file = prefix + '_' + page_str + '.json'
            write_json(target_path, target_file, dictionary)
    



def text_extraction(in_path, target_path):
    '''
    Extract text from pdf files.
    If the file contains scanned pictures, extract using OCR.
    Otherwise, extract text directly.
    Output a json file for every page

    in_path: directory that is listed for files ending in ".pdf".
    target_path: directory the json files are written to.

    Prints "<file name>: <elapsed seconds>" after each processed pdf.
    '''
    for file in os.listdir(in_path):
        if not file.endswith(".pdf"):
            continue
        # perf_counter is monotonic, so the measured duration is not
        # affected by system clock adjustments.
        start = time.perf_counter()
        file_path = os.path.join(in_path, file)
        # Both extractors write their results to disk and return nothing,
        # so their return values are not kept.
        if pdfplumber_jud_pdf(file_path):
            read_pdf_plum(file, file_path, target_path)
        else:
            OCR_singlefile(file, file_path, target_path)
        elapsed = time.perf_counter() - start
        print(file + ': ' + str(elapsed))

In [41]:
# Folder that is scanned for pdf files.
in_path = '/Users/macbook/Desktop/DSI Capstone/Input PDFs'
# Folder that receives the json files (one per page).
out_path = '/Users/macbook/Desktop/DSI Capstone/Input PDFs/Jsons'
# Process every pdf found in in_path; the time per file is printed.
text_extraction(in_path=in_path, target_path=out_path)

2306.04819.pdf: 240.80949997901917
2301.13174.pdf: 78.89621686935425
Pictures.pdf: 11.98810601234436
