In [77]:
import os 
import sys
from PIL import Image
from pathlib import Path
import re

from pdf2image import convert_from_path 
import pytesseract
import spacy
from spacy_langdetect import LanguageDetector

# If you don't have tesseract executable in your PATH, include the following:
#pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

# Print the language codes reported by pytesseract.get_languages(), so it is
# easy to check that the language requested by the OCR step below ("fra") is listed.
print(pytesseract.get_languages(config=''))

['eng', 'fra', 'osd']


In [80]:
# --- Input -------------------------------------------------------------
# Folder holding the scraped PDF files.
scraping_folder = Path("../scraping/output")

# PDF (inside scraping_folder) that this notebook processes.
pdf_file = "140-140-2020-014320.pdf"

# --- Output ------------------------------------------------------------
# Folder that receives the page images and the extracted text files.
output_folder = Path("./output")

In [83]:
# OCR part
#
# Adapted from the geeksforgeeks example:
# https://www.geeksforgeeks.org/python-reading-contents-of-pdf-using-ocr-optical-character-recognition/


def _clean_ocr_text(raw_text):
    """Tidy up the text recognised on a single page.

    Args:
        raw_text: the string returned by the OCR engine for one page image.

    Returns:
        The text with end-of-line hyphenation removed, form feeds turned into
        new lines, and single line breaks inside a paragraph replaced by a
        space. Runs of two or more new lines are left untouched.
    """
    # Delete word break-ups at the end of a line.
    cleaned = raw_text.replace('-\n', '')

    # Replace the form feed (new page) character with a normal new line.
    cleaned = re.sub(r'\f', r'\n', cleaned)

    # Continue the same or next sentence on the same line. Lookarounds are used
    # instead of capturing groups so the neighbouring characters are not
    # consumed by the match; otherwise a one-character line would keep the
    # line break that follows it ("a\nb\nc" -> "a b\nc" instead of "a b c").
    cleaned = re.sub(r'(?<=[^\n])\n(?=[^\n])', ' ', cleaned)

    return cleaned


# ---------------------------------------------------------------------------
# Part #1: convert the PDF to one PNG image per page
# ---------------------------------------------------------------------------

# File name without its extension; used for the image folder and the text file.
pdf_stem = Path(pdf_file).stem

# Make the output folder for the images (no error if it already exists).
image_dir = output_folder / 'images' / pdf_stem
image_dir.mkdir(parents=True, exist_ok=True)

# Render all the pages of the PDF as images.
pages = convert_from_path(scraping_folder / pdf_file, dpi=425)

# Save every page as page_<n>.png, numbering from 1.
for page_number, page in enumerate(pages, start=1):
    page.save(image_dir / f"page_{page_number}.png", 'PNG')

# ---------------------------------------------------------------------------
# Part #2: recognise the text in the page images using OCR
# ---------------------------------------------------------------------------

# Total number of pages.
filelimit = len(pages)

# Text file that collects the output of all pages.
outfile = f"{pdf_stem}.txt"

# The file is opened once and written page by page, so write mode is enough to
# gather all pages in one file. Unlike append mode it also means re-running
# this cell replaces the previous result instead of adding a duplicate copy.
# The `with` blocks close the text file and each image even if OCR fails.
with open(output_folder / outfile, "w") as f:
    for i in range(1, filelimit + 1):
        filename = f"page_{i}.png"

        # Recognise the text in the page image using pytesseract.
        with Image.open(image_dir / filename) as page_image:
            text = pytesseract.image_to_string(page_image, lang="fra")

        # Write the cleaned-up page text to the output file.
        text = _clean_ocr_text(text)
        f.write(text)

In [84]:
def language_detection(text_file):
    '''Filter an OCR text file down to its French lines.

    Reads ``output_folder / text_file`` line by line and keeps a line when its
    first character is numeric or when the language detector labels it "fr".
    The kept lines are written to ``<name>_FR.txt`` next to the input file,
    where ``<name>`` is the input file name without its extension.

    Args:
        text_file: name of the text file, relative to ``output_folder``.

    Returns:
        None. The result is written to disk.
    '''
    source_path = output_folder / text_file
    target_path = source_path.with_name(f'{source_path.stem}_FR.txt')

    nlp = spacy.load('fr_core_news_sm')
    # NOTE(review): passing a component instance to add_pipe looks like the
    # spaCy 2.x calling convention - confirm against the installed version.
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    rows_in_fr = []
    # `with` guarantees the input file is closed, also when detection raises.
    with open(source_path, 'r') as file:
        for row in file:
            # The numeric test runs first so the pipeline is skipped for rows
            # that are kept anyway. row[:1] is safe on an empty string, and
            # the former `row == []` test (a str never equals a list) is gone.
            if row[:1].isnumeric() or nlp(row)._.language["language"] == "fr":
                rows_in_fr.append(row)

    # Each kept row still ends with its own line break, so joining with '\n'
    # leaves one blank line between kept rows. This layout is preserved on
    # purpose so existing output files keep the same format.
    output = '\n'.join(rows_in_fr)

    with open(target_path, "w") as f:
        f.write(output)

    return

In [85]:
# Run the language filter on the text file produced by the OCR step. The name
# is derived from `pdf_file` so this call keeps pointing at the right file
# when the target PDF in the configuration cell changes.
language_detection(f"{Path(pdf_file).stem}.txt")