In [294]:
%%javascript
// Override the output area's scroll check so that it always answers "no",
// whatever line count it is given.
IPython.OutputArea.prototype._should_scroll = (lines) => false;

<IPython.core.display.Javascript object>

In [295]:
import os
import docx

In [296]:
def getAllDocxFiles(input_dir='data'):
    """Return the names of the ``.docx`` files found directly in a directory.

    Args:
        input_dir: Directory to scan (not recursive). Defaults to ``'data'``.

    Returns:
        A sorted list of file names (not full paths) ending in ``.docx``.

    Raises:
        OSError: If ``input_dir`` does not exist or cannot be listed.
    """
    # Build a real list: on Python 3 ``filter`` is a one-shot lazy iterator
    # that cannot be indexed, measured with len() or iterated twice.
    # Sorting makes the result independent of the arbitrary os.listdir order.
    return sorted(name for name in os.listdir(input_dir) if name.endswith('.docx'))

In [297]:
def getDocumentRawParagraphs(doc):
    """Group a document's paragraphs into ``(heading, text)`` pairs.

    A non-empty paragraph starts a new section when its first run is bold or
    when it contains more than 10 ``-`` characters (a dash-ruled title).
    Every other non-empty paragraph is appended to the current section's
    text, one line per paragraph.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph must expose
            ``text`` and ``runs`` (runs expose ``bold``), as python-docx
            documents do.

    Returns:
        A list of ``(heading, text)`` tuples in document order. ``heading``
        has all ``-`` characters removed and is stripped; ``text`` is the
        section body with a trailing newline after each paragraph. Text that
        appears before the first heading is returned with an empty heading.
        An empty document yields an empty list.
    """
    heading = ''
    text = ''
    paragraphs = []
    for para in doc.paragraphs:
        para_text = para.text.strip()
        if not para_text:
            continue
        # A paragraph may report text while having no runs (presumably when
        # the text lives only inside a hyperlink); treat that as "not bold"
        # instead of failing with IndexError on runs[0].
        first_run_bold = bool(para.runs) and bool(para.runs[0].bold)
        if first_run_bold or para_text.count('-') > 10:
            # Flush the section collected so far before starting a new one.
            if heading or text:
                paragraphs.append((heading, text))
            # Dropping the rule dashes can leave padding around the title.
            heading = para_text.replace('-', '').strip()
            text = ''
        else:
            text += para_text + '\n'
    # Same guard as inside the loop: never emit an empty ('', '') section.
    if heading or text:
        paragraphs.append((heading, text))
    return paragraphs

In [298]:
def getKnownWords(words_path='data/wordsEn.txt'):
    """Load the known-word list, one word per line.

    Args:
        words_path: Path of the word-list file. Defaults to
            ``'data/wordsEn.txt'``.

    Returns:
        A list with one entry per line of the file, in file order, each
        stripped of surrounding whitespace.

    Raises:
        IOError / OSError: If the file cannot be opened.
    """
    with open(words_path, 'r') as f:
        # Materialise a list: on Python 3 ``map`` is a one-shot lazy iterator,
        # which would be exhausted after the first pass over the words.
        return [line.strip() for line in f]

In [299]:
class Sentence(object):
    """One line of text, lower-cased and tokenised.

    If the line contains ``:``, the part before the first ``:`` becomes the
    ``context`` and the rest becomes the ``value``; otherwise the whole line
    is the ``value``.

    Attributes:
        raw_text: The text exactly as passed in.
        chars_to_remove: Punctuation characters stripped from word edges.
        context: List of tokens before the first ``:``, or ``''`` when the
            line has no ``:``.
        value: List of tokens for the remainder of the line.
    """

    def __init__(self, raw_text):
        self.raw_text = raw_text
        self.chars_to_remove = ['~', '-', '`', '.', '!', '?', '@', '#', '$', '%',
                                '^', '&', ',', '(', ')', '_', '+', '*',
                                '=', '<', '>', ';', ':', '"', '[', ']', '/',
                                '\\', '|', '{', '}']
        self.context = ''
        self.value = ''
        self.clean()

    def replaceSpecialChars(self, word):
        """Split a single word into tokens on ``/`` or ``,``.

        A word longer than 7 characters whose ``/``-separated parts are not
        all integers is expanded into its parts joined by the token ``or``
        (``'pain/ache...'`` -> ``[..., 'or', ...]``). Everything else,
        including numeric forms such as ``10/12/2015``, is split on ``,``.

        Args:
            word: A whitespace-free word.

        Returns:
            A list of tokens.
        """
        splitted = word.split('/')
        try:
            # Convert eagerly: a bare map(int, ...) is lazy on Python 3 and
            # would never raise, leaving the branch below unreachable.
            for part in splitted:
                int(part)
        except ValueError:
            if len(word) > 7 and len(splitted) > 1:
                # ' or ' (with both spaces) keeps "or" a separate token;
                # empty parts from doubled slashes are dropped.
                alternatives = [part for part in splitted if part]
                return ' or '.join(alternatives).split(' ')
        return word.split(',')

    def clean_part(self, sentence_part):
        """Tokenise a piece of text into cleaned words.

        Words that are a single punctuation character are dropped. At most
        one punctuation character is stripped from each end of a word, and
        the result is passed through ``replaceSpecialChars``.

        Args:
            sentence_part: Text to tokenise on whitespace.

        Returns:
            A list of non-empty tokens.
        """
        words = sentence_part.split()
        new_words = []
        for word in words:
            if word in self.chars_to_remove:
                continue
            if word[-1] in self.chars_to_remove:
                word = word[:-1]
            if word[0] in self.chars_to_remove:
                word = word[1:]
            splitted = self.replaceSpecialChars(word)
            # Words such as "()" or "a,,b" would otherwise leave '' tokens.
            new_words += [token for token in splitted if token]
        return new_words

    def clean(self):
        """Populate ``context`` and ``value`` from ``raw_text``."""
        sentence = self.raw_text.strip().lower()
        base_parts = sentence.split(':')
        if len(base_parts) > 1:
            self.context = self.clean_part(base_parts[0])
            base_parts = base_parts[1:]
        # Any further ':' characters stay inside the value text.
        remaining_sentence = ':'.join(base_parts)
        self.value = self.clean_part(remaining_sentence)

    def __str__(self):
        """Return ``'<context> => <value>'``, or just the value if no context."""
        return ('%s => '%(' '.join(self.context)) if len(self.context) > 0 else '') + ' '.join(self.value)

In [300]:
class Paragraph(object):
    """A block of text under an optional heading, split into sentences.

    Each non-blank line of ``raw_text`` becomes one ``Sentence`` in
    ``sentences``.
    """

    def __init__(self, raw_text, heading=''):
        """Store the text and heading, then parse the text into sentences.

        Args:
            raw_text: Newline-separated body text.
            heading: Optional title shown above the sentences.
        """
        self.raw_text = raw_text
        self.heading = heading
        self.sentences = []
        self.clean()

    def clean(self):
        """Append one ``Sentence`` per non-blank line of ``raw_text``."""
        stripped_lines = (line.strip() for line in self.raw_text.split('\n'))
        self.sentences.extend(Sentence(line) for line in stripped_lines if line)

    def __str__(self):
        """Return the heading line (if any) followed by one sentence per line."""
        prefix = ''
        if self.heading:
            prefix = self.heading + '\n'
        body = '\n'.join(str(sentence) for sentence in self.sentences)
        return prefix + body

    def __repr__(self):
        """Use the same text as ``str()``."""
        return str(self)

In [301]:
# Load the notes document and split it into (heading, text) sections.
doc = docx.Document('data/Lei_753_notes_ano.docx')
raw_paragraphs = getDocumentRawParagraphs(doc)
# Paragraph's signature is (raw_text, heading): pass by keyword so the section
# body is what gets tokenised and the heading stays the heading (positional
# Paragraph(heading, content) swapped the two).
paragraphs = [Paragraph(raw_text=content, heading=heading) for heading, content in raw_paragraphs]