# Parsing of Sustainability Reports in PDF format

## Setup

In [None]:
import os
import json
import re

# Directory that holds the source PDF reports
REPORT_DIRECTORY = './00_data/reports'

# File names of the reports to parse.
# os.listdir returns every directory entry in arbitrary order, so restrict the
# list to PDF files (skipping hidden/system files and sub-directories that the
# parsers cannot read) and sort it to get a deterministic processing order.
REPORTS = sorted(
    name for name in os.listdir(REPORT_DIRECTORY)
    if name.lower().endswith('.pdf')
)

# Directory the parsing results (json files) are written to
PARSING_RESULTS_DIRECTORY = '00_data/parsing_results'

# Directory the rendered page images are written to (used by the OCR approach)
IMAGES = '00_data/parsing_results/pdf_images'

## Parsing with pdfminer.six

In [None]:
from pdfminer.high_level import extract_text

def cleanse(text):
    """Normalise the spacing of a text block.

    Whitespace surrounding the text is stripped and every run of
    consecutive space characters is collapsed into a single space.
    Other whitespace inside the text (e.g. newlines) is left untouched.
    """
    # TODO:
    # Further cleansing that could be added, e.g.
    # - unifying different encodings
    # - removing special characters
    # - concatenating hyphened words, sentences and paragraphs

    trimmed = text.strip()
    collapsed = re.sub(' +', ' ', trimmed)
    return collapsed
    
def parse_pdf_pdfminer(file):
    """Extract the text of a PDF with pdfminer and split it into sentences.

    Args:
        file: The PDF to read; passed unchanged to pdfminer's ``extract_text``.

    Returns:
        A dict mapping a running paragraph index to a dict that maps a
        running sentence index to the sentence text. Every sentence ends with
        a dot. Paragraphs without any text are left out, so the paragraph
        indices are consecutive.
    """
    # Read pdf using pdfminer
    text = extract_text(file)

    paragraphs = {}

    # Paragraphs are delimited by double newlines
    for block in text.split('\n\n'):

        # Cleanse text
        block = cleanse(block)

        # Split text by dots to extract sentences.
        # str.split never returns an empty list, so checking its length cannot
        # detect empty blocks; instead drop the empty fragments themselves
        # (empty blocks and the remainder after a trailing dot). Stripping
        # also removes the space that follows the dot of the previous sentence.
        # NOTE: splitting on every dot also cuts decimals and abbreviations.
        fragments = (fragment.strip() for fragment in block.split('.'))
        sentences = [fragment + '.' for fragment in fragments if fragment]

        # Only use sentences if they do not come from an empty block
        if not sentences:
            continue

        # Add the sentences with a paragraph index and a sentence index
        paragraphs[len(paragraphs)] = dict(enumerate(sentences))

    return paragraphs



In [None]:

# Parse every report with pdfminer and save the resulting dictionary to json

# Make sure the output directory exists before writing into it
os.makedirs(PARSING_RESULTS_DIRECTORY, exist_ok=True)

for file in REPORTS:
    report_path = '{}/{}'.format(REPORT_DIRECTORY, file)
    print(report_path)
    parsed_text = parse_pdf_pdfminer(report_path)

    # Drop the file extension whatever its length (slicing off the last four
    # characters is only correct for names ending in a 3-letter extension)
    report_name = os.path.splitext(file)[0]
    output_path = '{}/{}_{}.json'.format(PARSING_RESULTS_DIRECTORY, 'pdfminer', report_name)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed_text, f, indent=4)

## Parsing with OCR

In [None]:
import fitz
import easyocr

def cleanse(text):
    """Normalise the spacing of a text block.

    Whitespace surrounding the text is stripped and every run of
    consecutive space characters is collapsed into a single space.
    This definition replaces the earlier ``cleanse`` once this cell has run,
    so it is kept behaviourally identical to it.
    """
    # TODO:
    # A lot of cleansing could be done, e.g.
    # - unifying different encodings
    # - removing special characters
    # - concatenating hyphened words, sentences and paragraphs

    # Remove leading/trailing whitespace and collapse repeated spaces
    return re.sub(' +', ' ', text.strip())

# Load model for english language
reader = easyocr.Reader(['en'])

# This approach has to save each pdf page as an image before it can be processed using OCR
# To save the pages as images, we use pyMuPDF. For the OCR part we use easyocr

# Set the dpi resolution of the images
dpi = 600

# Set the zoom of the saved image
# (pyMuPDF renders at 72 dpi by default, so dpi / 72 is the scale factor)
zoom = dpi / 72
magnify = fitz.Matrix(zoom, zoom)

# Make sure both output directories exist before writing into them
os.makedirs(IMAGES, exist_ok=True)
os.makedirs(PARSING_RESULTS_DIRECTORY, exist_ok=True)

# Parse each report
for file in REPORTS:
    report_path = '{}/{}'.format(REPORT_DIRECTORY, file)
    print(report_path)

    # Save report pages as images and remember their paths in page order.
    # Re-discovering the images via os.listdir would return them in arbitrary
    # order and could also match images of other reports (or of earlier runs)
    # whose file name merely contains this report's name.
    page_images = []
    doc = fitz.open(report_path)
    try:
        for count, page in enumerate(doc):
            pix = page.get_pixmap(matrix=magnify)  # render page to an image
            image_path = '{}/{}_{:02d}.png'.format(IMAGES, file, count)
            pix.save(image_path)
            page_images.append(image_path)
    finally:
        # Release the file handle even if rendering a page fails
        doc.close()

    paragraphs = {}

    # Use OCR on each page
    for image_path in page_images:

        # easyocr returns a list of paragraphs
        text = reader.readtext(image_path, paragraph=True)

        # Cleanse each paragraph, split it into sentences and add it to dictionary
        for paragraph in text:

            # The second element of each result holds the recognised text
            cleansed = cleanse(paragraph[1])

            # Split text by dots to extract sentences; drop empty fragments
            # (e.g. the remainder after a trailing dot) so that no sentence
            # consisting of a lone '.' is stored
            fragments = (fragment.strip() for fragment in cleansed.split('.'))
            sentences = [fragment + '.' for fragment in fragments if fragment]
            if not sentences:
                continue

            # Add sentences to dictionary with a paragraph index and a sentence index
            paragraphs[len(paragraphs)] = dict(enumerate(sentences))

    # Save dictionary to json
    # (splitext drops the extension whatever its length)
    report_name = os.path.splitext(file)[0]
    output_path = '{}/{}_{}.json'.format(PARSING_RESULTS_DIRECTORY, 'easyocr', report_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(paragraphs, f, indent=4)
    