In [195]:
import re
import tika
from tika import parser

In [223]:
tika.initVM()

In [224]:
filename = '../data/<filename>.pdf'

In [225]:
parsed = parser.from_file(filename, xmlContent=True)

In [None]:
print(parsed['content'])

In [226]:
# Save the XHTML produced by Tika so it can be inspected in a browser.
# The encoding is given explicitly: without it open() falls back to the platform's
# locale encoding, which may be unable to represent characters extracted from the PDF.
with open('output.html', 'w', encoding='utf-8') as f:
    f.write(parsed['content'])

In [6]:
dir(parsed)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [10]:
import sqlite3

In [11]:
import os

# sqlite3 hands the path to SQLite unchanged and nothing expands '~' along the way,
# so resolve the home directory explicitly before connecting.
conn = sqlite3.connect(os.path.expanduser('~/src/DeepLearning/DrQA/legal.db'))

In [14]:
c = conn.cursor()
c.execute('select text from documents where id=?', ('2edb99cf-ec9d-42a9-99e8-5b1d7e4762fd',))

<sqlite3.Cursor at 0x1108936c0>

In [None]:
print(c.fetchone())

In [16]:
conn.close()

In [7]:
import spacy

In [21]:
text = '''In exercising the Company Delegations, you must comply with the rules set out below.   
 
Rule 1:  Your Area of Responsibility and Budget'''

In [9]:
nlp = spacy.load('en_core_web_sm')

In [10]:
doc = nlp(text)

In [13]:
for sent in doc.sents:
    print('>>')
    print(sent)
    print('<<')

>>
In exercising the Company Delegations, you must comply with the rules set out below.   
 

<<
>>
Rule 1:  
<<
>>
Your Area of Responsibility and Budget
<<


In [14]:
SENTENCE_BOUNDARY_MARKERS = ['.', '?', '!']


def set_custom_boundaries(doc):
    """
    Flag the token that follows each sentence-ending punctuation mark as a sentence start.

    The last token is never inspected because nothing comes after it.

    :param doc: Spacy Doc (modified in place)
    :return: the same doc
    """
    for token in doc[:-1]:
        if token.text not in SENTENCE_BOUNDARY_MARKERS:
            continue

        follower = doc[token.i + 1]
        follower.is_sent_start = True

    return doc


In [15]:
nlp.add_pipe(set_custom_boundaries, before='parser')

In [22]:
doc = nlp(text)

In [23]:
for sent in doc.sents:
    print('>>')
    print(sent)
    print('<<')

>>
In exercising the Company Delegations, you must comply with the rules set out below.
<<
>>
  
 

<<
>>
Rule 1:  
<<
>>
Your Area of Responsibility and Budget
<<


In [6]:
from spacy.lang.en import English
from spacy.pipeline import SentenceSegmenter

In [8]:
nlp = English()  # just the language with no model

In [25]:
sbd = nlp.create_pipe('sentencizer')

In [26]:
nlp.add_pipe(sbd)

In [27]:
doc = nlp(text)

In [28]:
for sent in doc.sents:
    print('>>')
    print(sent)
    print('<<')

>>
In exercising the Company Delegations, you must comply with the rules set out below.
<<
>>
  
 
Rule 1:  Your Area of Responsibility and Budget
<<


In [29]:
text = '''Rule 1:  Your Area of Responsibility and Budget'''

In [30]:
doc = nlp(text)

In [88]:
def print_sentences(doc):
    """
    Print every sentence of ``doc`` on its own line, wrapped in ``>>`` and ``<<``.

    :param doc: object exposing an iterable ``sents`` attribute (e.g. a Spacy Doc)
    """
    template = '>>{}<<'
    for sentence in doc.sents:
        print(template.format(sentence))


In [32]:
text = '''TABLE OF CONTENTS 
1. PURPOSE OF THESE COMPANY DELEGATIONS 4 
2. COMPANY DELEGATIONS FRAMEWORK 4 
3. COMPANY DELEGATION RULES 6 
4. COMPANY POWERS OF ATTORNEY 7 
5. AUTHORITY TO ACT 8 
6. SUB-DELEGATION OF SIGNING POWER 8 
7. GUIDELINES TO THE COMPANY DELEGATIONS 9 
8. NOTIFICATIONS TO THE BOARD 11 
9. NOTIFICATIONS TO THE CEO 11 
10. VARIATION AND REVOCATION 11 
11. EFFECTIVE DATE 11 
'''

In [65]:
LIST_NUM_SHAPES = ['d', 'd.d', 'd.dd', 'dd.d', 'dd.dd', 'd.d.d', 'd.d.dd', 'd.dd.d', 'd.dd.dd']

In [66]:
BULLET_MARKERS = [u'•', '*', 'o']

In [67]:
def is_bullet(token):
    """
    Check whether a token is one of the recognised bullet characters.

    :param token: Spacy Token
    :return: (bool) True when the token text is listed in BULLET_MARKERS
    """
    text = token.text
    return text in BULLET_MARKERS


In [68]:
def is_list_num(token):
    """
    Check whether a token's shape is one of the list/section number shapes.

    :param token: Spacy Token
    :return: (bool) True when ``token.shape_`` is listed in LIST_NUM_SHAPES
    """
    shape = token.shape_
    return shape in LIST_NUM_SHAPES


In [196]:
def is_roman_numeral(token):
    """
    Validate if a Spacy Token is a roman numeral

    Accepts the numerals I to XCIX in either case, optionally followed by a single
    '.' or ')' as used in list markers such as ``iv.`` or ``IX)``.

    See https://stackoverflow.com/questions/267399/how-do-you-match-only-valid-roman-numerals-with-a-regular-expression

    :param token: Spacy Token
    :return: (bool)
    """
    # fullmatch instead of match + '$': '$' also matches just before a trailing newline,
    # which let a bare '\n' token through as a zero-length numeral.
    match = re.fullmatch(r'(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})[.)]?', token.text, re.IGNORECASE)
    if match is None:
        return False

    # Every part of the pattern is optional, so insist on at least one numeral
    # character; this is what rejects '', '.' and ')'.
    return bool(match.group(1) or match.group(2))


In [197]:
def is_ordered_list_item(token, next_token):
    """
    Decide whether a token is the marker of an ordered list item.

    A list number or a roman numeral counts on its own; a single letter only counts
    when the following token is '.' or ')'.

    :param token: Spacy Token to classify
    :param next_token: the Spacy Token after ``token``, or None
    :return: (bool)
    """
    if is_list_num(token) or is_roman_numeral(token):
        return True

    marker_follows = next_token and next_token.text in ['.', ')']
    if marker_follows and token.is_alpha and len(token) == 1:
        return True

    return False


In [218]:
def detect_heading(text_or_tokens, nlp=None):
    """
    Heuristically decide whether a line of text looks like a heading.

    Tokens whose text contains a newline are ignored. Every other token must pass a
    check that depends on its position in the sequence:

    * index 0: title-case, upper-case, or an ordered list marker;
    * last index: title-case, upper-case, a digit, or one of '.', ':' and ')';
    * anything in between: title-case, upper-case, a stop word, punctuation, a digit,
      ',' or '-'.

    :param text_or_tokens: a sequence of Spacy Tokens (Doc, Span or list) when ``nlp``
        is None, otherwise the raw text to hand to ``nlp``
    :param nlp: optional callable that turns ``text_or_tokens`` into a token sequence
    :return: (bool) True when at least one token was checked and every checked token passed
    """
    doc = text_or_tokens if nlp is None else nlp(text_or_tokens)

    n = len(doc)
    checked_any = False
    for i, token in enumerate(doc):
        if '\n' in token.text:
            continue

        checked_any = True
        if i == 0:
            # The following token tells a list marker such as "a)" apart from a word.
            # It exists whenever i + 1 is a valid index, including on a two-token line.
            next_token = doc[i + 1] if (i + 1) < n else None
            if not (token.is_title or token.is_upper or is_ordered_list_item(token, next_token)):
                return False
        elif i == (n - 1):
            if not (token.is_title or token.is_upper or token.is_digit or token.text in ['.', ':', ')']):
                return False
        else:
            if not (token.is_title or token.is_upper or token.is_stop or token.is_punct or
                    token.is_digit or token.text in [',', '-']):
                return False

    # A line with nothing but line breaks (or nothing at all) is not a heading.
    return checked_any


In [205]:
def split_sentences(doc):
    """ Custom sentence segmentation

    Walks the tokens of ``doc`` once and yields consecutive, non-overlapping,
    non-empty slices of it. A slice ends:

    * before a bullet marker;
    * before an ordered list marker that is the first token of a line;
    * at the end of the first line when ``detect_heading`` accepts that line;
    * after '?' or '!';
    * after '.' when it is the last token or is followed by a title-case token or a
      token containing a newline, unless ``in_list_num`` is set (the '.' then most
      likely belongs to a list marker such as ``1.``).

    :param doc: Spacy Doc
    :return: generator of slices of ``doc``
    """
    n = len(doc)
    start = 0            # first token that has not been emitted yet
    line_start = 0       # first token of the current line
    in_list_num = False  # set by an ordered list marker, cleared by an ordinary token
    newline = False      # True when the previous token contained a line break
    line = []            # tokens of the current line, handed to detect_heading

    for j, word in enumerate(doc):
        i = word.i
        # The next token exists whenever j + 1 is a valid index.
        next_token = doc[j + 1] if (j + 1) < n else None
        if newline:
            # `word` opens a new line; the line break before it has not been emitted,
            # so doc[start:i] is never empty here.
            newline = False
            if is_bullet(word):
                yield doc[start:i]
                start = i
            elif is_ordered_list_item(word, next_token):
                in_list_num = True
                yield doc[start:i]
                start = i
            elif line_start == 0 and detect_heading(line):
                # Only the first line is tested for being a heading. Emit from `start`
                # rather than from the line start so that text already emitted on this
                # line is not yielded a second time.
                yield doc[start:i]
                start = i

            line = []
            line_start = i
        elif '\n' in word.text:
            newline = True
        elif is_bullet(word) and i > start:
            # `i > start` keeps a bullet that directly follows a split (or opens the
            # doc) from producing an empty slice.
            yield doc[start:i]
            start = i
        elif word.text in ['?', '!']:
            yield doc[start:i + 1]
            start = i + 1
        elif word.text == '.':
            # Whitespace tokens may carry padding around the line break, so look for a
            # newline anywhere in the following token.
            ends_here = (i + 1) == n or doc[i + 1].is_title or '\n' in doc[i + 1].text
            if ends_here and not in_list_num:
                yield doc[start:i + 1]
                start = i + 1
        elif is_ordered_list_item(word, next_token):
            in_list_num = True
        else:
            in_list_num = False

        line.append(word)

    # Whatever is left forms the last slice. Heading detection is not needed here:
    # it only applies to the first line, whose remainder starts at `start` anyway.
    if start < n:
        yield doc[start:n]


In [206]:
sbd = SentenceSegmenter(nlp.vocab, strategy=split_sentences)

In [207]:
nlp = English()  # just the language with no model
nlp.add_pipe(sbd)

In [219]:
text = '''Issue 16 14 December 2018'''

In [220]:
doc = nlp(text)

In [221]:
is_list_num(doc[4])

False

In [222]:
detect_heading(doc)

True

In [None]:
print_sentences(doc)

In [19]:
import sys
sys.path.append('../onesource')

In [27]:
import importlib

In [28]:
import tika_extract

In [29]:
extractor = tika_extract.TextExtractor()

In [30]:
structured_content, text_list = [], []

In [31]:
extractor._process_text(text, structured_content, text_list, nlp)

In [32]:
structured_content

[{'type': 'heading', 'text': 'TELSTRA RESTRICTED (December'}]

In [None]:
[(i, token) for i, token in enumerate(doc)]

In [276]:
detect_heading(doc[0:16])

True

In [277]:
text = ''' 
'''

In [278]:
doc = nlp(text)

In [None]:
[(i, token) for i, token in enumerate(doc)]

In [123]:
import re

In [124]:
def is_roman_numeral(token):
    """
    Validate if a Spacy Token is a roman numeral

    Accepts the numerals I to XCIX in either case, optionally followed by a single
    '.' or ')' as used in list markers such as ``iv.`` or ``IX)``.

    See https://stackoverflow.com/questions/267399/how-do-you-match-only-valid-roman-numerals-with-a-regular-expression

    :param token: Spacy Token
    :return: (bool)
    """
    # fullmatch instead of match + '$': '$' also matches just before a trailing newline,
    # which let a bare '\n' token through as a zero-length numeral.
    match = re.fullmatch(r'(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})[.)]?', token.text, re.IGNORECASE)
    if match is None:
        return False

    # Every part of the pattern is optional, so insist on at least one numeral
    # character; this is what rejects '', '.' and ')'.
    return bool(match.group(1) or match.group(2))


In [131]:
def is_ordered_list_item(token, next_token):
    """
    Decide whether a token is the marker of an ordered list item.

    A token shaped ``d.`` / ``dd.`` or a roman numeral counts on its own; a plain
    number or a single letter only counts when the following token is '.' or ')'.

    :param token: Spacy Token to classify
    :param next_token: the Spacy Token after ``token``, or None
    :return: (bool)
    """
    if token.shape_ in ['d.', 'dd.'] or is_roman_numeral(token):
        return True

    marker_follows = next_token and next_token.text in ['.', ')']
    if not marker_follows:
        return False

    return bool(token.is_digit or (token.is_alpha and len(token) == 1))


In [125]:
doc[15]

)

In [126]:
is_roman_numeral(doc[15])

False

In [130]:
' ' in ['', '.', ')']

False

In [134]:
is_ordered_list_item(doc[14], doc[15])

False

In [135]:
text = '1 2 3'
doc = nlp(text)
[(i, token) for i, token in enumerate(doc)]

[(0, 1), (1, 2), (2, 3)]

In [136]:
doc[0].is_title

False

In [146]:
text = '''
'''

In [147]:
doc = nlp(text)

In [None]:
print_sentences(doc)