In [1]:
import numpy as np
import cv2
from PIL import Image
import sys
import os
import pytesseract
import pdfplumber
from pdf2image import convert_from_path
import nltk
nltk.download('stopwords')
from unidecode import unidecode
from difflib import SequenceMatcher


class DocumentExtractor():
    """Extract 'prazo' / 'vigencia' clauses from PDF documents.

    The extractor is built from a path that is either a single file or a
    directory. Each file is read with pdfplumber (falling back to Tesseract
    OCR when a page yields no text), tokenized into lowercase ASCII words,
    and scanned for clauses whose first words mention 'prazo' or 'vigencia'.
    """

    # Word that opens a clause, and the minimum SequenceMatcher ratio for a
    # token to be accepted as that word (tolerates OCR misspellings).
    CLAUSE_WORD = 'clausula'
    CLAUSE_THRESHOLD = 0.7
    # A clause is kept when one of these tokens appears among its first
    # CLAUSE_WINDOW tokens.
    CLAUSE_KEYWORDS = ('prazo', 'vigencia')
    CLAUSE_WINDOW = 10

    def __init__(self, path):
        """Store the input path.

        Args:
            path: Path to a single document or to a directory of documents.
        """
        self.dir_path = path
        # Default the "current file" to the given path so read_file() and
        # get_text_from_pdf() can be called directly; read_all_documents()
        # overwrites it for every file it processes.
        self.path = path

    def similar(self, a, b):
        """Return the difflib.SequenceMatcher similarity ratio of a and b."""
        return SequenceMatcher(None, a, b).ratio()

    def _is_clause_start(self, word):
        """Return True when word is similar enough to CLAUSE_WORD."""
        return self.similar(self.CLAUSE_WORD, word) > self.CLAUSE_THRESHOLD

    def get_text_from_pdf(self):
        """OCR every page of self.path and return one string per page.

        Pages are rendered at 300 DPI, upscaled, converted to grayscale,
        dilated/eroded, filtered and adaptively thresholded before being
        passed to Tesseract with lang='por'.
        """
        pages = convert_from_path(self.path, 300, fmt='jpeg')
        kernel = np.ones((2, 2), np.uint8)  # loop-invariant, built once
        extracted_text = []
        for page_image in pages:
            img = np.array(page_image)
            img = cv2.resize(img, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
            if img.ndim == 3:
                # Arrays built from PIL images are RGB-ordered, so the RGB
                # conversion code is the one with correct channel weights.
                img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            img = cv2.dilate(img, kernel, iterations=1)
            img = cv2.erode(img, kernel, iterations=1)
            img = cv2.adaptiveThreshold(
                cv2.bilateralFilter(img, 9, 75, 75),
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY,
                31,
                2,
            )
            text = pytesseract.image_to_string(Image.fromarray(img), lang='por')
            extracted_text.append(text)
        return extracted_text

    def read_file(self):
        """Return the text of self.path with pages joined by ' (NEWPAGE) '.

        If any page has no extractable text, the whole file is re-read
        through OCR via get_text_from_pdf().
        """
        # The context manager guarantees the PDF handle is closed.
        with pdfplumber.open(self.path) as pdf:
            text = [page.extract_text() for page in pdf.pages]
        # A page without a text layer shows up as None; newer pdfplumber
        # releases appear to return '' instead (TODO confirm against the
        # installed version), so both are treated as "no text".
        if any(not page_text for page_text in text):
            print("Handling file as image...")
            text = self.get_text_from_pdf()
        return ' (NEWPAGE) '.join(text)

    def clean_text(self, text, remove_stop_words=True):
        """Split text into lowercase, ASCII-transliterated words.

        Args:
            text: Raw document text.
            remove_stop_words: When True, drop NLTK Portuguese stop words.

        Returns:
            List of non-empty words, lowercased and passed through unidecode.
        """
        words = [word for word in text.replace('\n', ' ').split(' ') if word]
        if remove_stop_words:
            # Only load the corpus when it is needed; a set gives O(1) lookups.
            stop_words = set(nltk.corpus.stopwords.words("portuguese"))
            words = [word for word in words if word.lower() not in stop_words]
        return [unidecode(word.lower()) for word in words]

    def get_prazo_clauses(self, text):
        """Collect clauses that mention a keyword near their start.

        Args:
            text: Sequence of cleaned words, as returned by clean_text().

        Returns:
            List of strings. Each one starts at a word resembling 'clausula',
            has 'prazo' or 'vigencia' among its first ten words, and runs up
            to (not including) the next word resembling 'clausula', or to the
            end of the text when there is none.
        """
        sentences = []
        total = len(text)
        for start, word in enumerate(text):
            if not self._is_clause_start(word):
                continue
            window = text[start:start + self.CLAUSE_WINDOW]
            if not any(keyword in window for keyword in self.CLAUSE_KEYWORDS):
                continue
            # The clause ends right before the next clause header, if any.
            end = total
            for position in range(start + 1, total):
                if self._is_clause_start(text[position]):
                    end = position
                    break
            sentences.append(' '.join(text[start:end]))
        return sentences

    def _extract_prazos_from(self, file_path, label):
        """Run read -> clean -> clause extraction on one file.

        Sets self.path to file_path and prints progress messages; label is
        the name shown in the 'Reading File' message.
        """
        print('Reading File {}'.format(label))
        self.path = file_path
        text = self.read_file()
        print('Cleaning Texts')
        text = self.clean_text(text, remove_stop_words=False)
        print('Getting Prazos')
        sentences = self.get_prazo_clauses(text)
        print('\n')
        return sentences

    def read_all_documents(self):
        """Process the configured path and return the clauses per file.

        Returns:
            List with one entry per processed file (directory entries are
            visited in sorted order); each entry is the list of clause
            strings found in that file.
        """
        if os.path.isdir(self.dir_path):
            return [
                self._extract_prazos_from(os.path.join(self.dir_path, name), name)
                for name in sorted(os.listdir(self.dir_path))
            ]
        return [self._extract_prazos_from(self.dir_path, self.dir_path)]

    def write_prazos(self, prazos, output_path='output.txt'):
        """Write the clauses of every file to a text report.

        Args:
            prazos: List of per-file clause lists, as returned by
                read_all_documents().
            output_path: Destination file; defaults to 'output.txt'.

        Returns:
            List of indices of the files for which no clause was found.
        """
        erros = []
        with open(output_path, 'w') as f:
            for i, sentences in enumerate(prazos):
                f.write('FILE: {}\n'.format(i))
                # Check this file's own list: the outer list is never empty
                # inside the loop, so testing it would never flag anything.
                if len(sentences) == 0:
                    erros.append(i)
                    f.write('NONE FOUND! \n')
                for _string in sentences:
                    f.write(str(_string) + '\n')
                f.write('\n')
        return erros

[nltk_data] Downloading package stopwords to /home/pedro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Path handed to DocumentExtractor; here it points at a single PDF.
mypath = './documents/contratante/09.pdf'
extractor = DocumentExtractor(mypath)
# read_all_documents() returns one list of clause strings per processed file.
file = extractor.read_all_documents()
print(file)
# Uncomment to also write the extracted clauses to output.txt:
# extractor.write_prazos(file)

Reading File ./documents/contratante/09.pdf
Handling file as image...
Cleaning Texts
Getting Prazos


[['ciainsula sexia: dos prazos de exboicao o prazo para prestacao dos servicos ora contratado sera de 60 (sessenta) dias a contar da data de recebimento da ordem de execucao de servicos, podendo ser prorrogado a criterio da administracao. f']]


In [8]:
from matplotlib.pyplot import imshow
from IPython.display import display # to display images

In [10]:
# pkg_resources is deprecated by setuptools; importlib.metadata is the
# standard-library way (Python 3.8+) to read an installed package's version.
import importlib.metadata
importlib.metadata.version('pdf2image')

'1.16.0'