In [14]:
import sys
import io
import functools
import timeit

from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

import re

from nltk import tokenize

import pandas as pd

In [15]:
def extract_text(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Each page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` whose ``TextConverter`` writes into an in-memory
    text buffer; the buffer's contents are the result.

    Args:
        pdf_path: Location of the PDF file. It is opened in binary mode.

    Returns:
        str: The text of all processed pages, concatenated in page order.
        When nothing was extracted the result is the empty string ``""``
        (the earlier version fell through and implicitly returned ``None``
        in that case, which broke callers that apply string operations).

    Raises:
        Any exception raised by ``open`` or by pdfminer while reading the
        file propagates to the caller; the converter and the buffer are
        closed before it does.
    """
    resource_manager = PDFResourceManager()

    # The context manager closes the buffer on every exit path.
    with io.StringIO() as text_buffer:
        converter = TextConverter(resource_manager, text_buffer)
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)

            with open(pdf_path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)

            # Read the buffer before the converter is closed, as before.
            return text_buffer.getvalue()
        finally:
            # Release the converter even when opening or parsing failed.
            converter.close()

    
@functools.lru_cache(maxsize=128)
def sentence_tokenizer(content):
    """Split ``content`` into sentences using ``tokenize.sent_tokenize``.

    The function is memoised with ``functools.lru_cache`` (up to 128 distinct
    inputs), so ``content`` must be hashable and a repeated call with the same
    text returns the very same cached object. Treat the result as read-only:
    mutating it changes what later calls with that text receive.

    Args:
        content: The text to split.

    Returns:
        Whatever ``tokenize.sent_tokenize`` returns for ``content``
        (a list of sentence strings in the notebook's usage).
    """
    return tokenize.sent_tokenize(content)

In [8]:
# Source document for this run.
# NOTE(review): this is an absolute, user-specific path, so the cell only
# works on a machine where this exact file exists — consider a configurable
# data directory.
path = '/Users/n1r7/Desktop/fincrime/pdf_parse/test/Wolfsberg_tax.pdf'

# Pull the document's text with extract_text and display it as the cell output.
pdf = extract_text(path)
pdf

'   © The Wolfsberg Group 2019 1  Wolfsberg Guidance on Customer Tax Evasion           Wolfsberg Guidance on Customer Tax Evasion 1. Introduction  The Wolfsberg Group1 is pleased to publish guidance on how Financial Institutions (FIs) can mitigate and manage the risks associated with money laundering in the form of customer tax evasion (‘Guidance’). For the purposes of this guidance, tax evasion relates to tax related criminal offences prescribed by laws of a jurisdiction and considered to be a predicate offence to money laundering. Tax evasion generally includes the deliberate concealment or misrepresentation of beneficial ownership of assets, income and gains, or otherwise fraudulent conduct, designed to divert money from the public revenue.   Other forms of tax related risks, such as tax planning, avoidance and non-compliance falling short of criminal liability, will not be covered in this paper. Tax planning involves organising one’s affairs in the most tax efficient manner within 

In [9]:
# Remove every form feed (\x0c) that is followed by a space and three digits
# (presumably page-break / page-number residue from the extraction — confirm).
# The pattern is a raw string so that \d reaches the regex engine as written
# instead of relying on Python passing through an invalid string escape;
# `re` itself interprets \x0c as the form-feed character.
pdf = re.sub(r'\x0c \d{3}', '', pdf)
# Drop the private-use code point U+F0D8 (presumably a symbol-font bullet
# glyph — confirm against the source PDF).
pdf = pdf.replace('\uf0d8', '')
pdf

'   © The Wolfsberg Group 2019 1  Wolfsberg Guidance on Customer Tax Evasion           Wolfsberg Guidance on Customer Tax Evasion 1. Introduction  The Wolfsberg Group1 is pleased to publish guidance on how Financial Institutions (FIs) can mitigate and manage the risks associated with money laundering in the form of customer tax evasion (‘Guidance’). For the purposes of this guidance, tax evasion relates to tax related criminal offences prescribed by laws of a jurisdiction and considered to be a predicate offence to money laundering. Tax evasion generally includes the deliberate concealment or misrepresentation of beneficial ownership of assets, income and gains, or otherwise fraudulent conduct, designed to divert money from the public revenue.   Other forms of tax related risks, such as tax planning, avoidance and non-compliance falling short of criminal liability, will not be covered in this paper. Tax planning involves organising one’s affairs in the most tax efficient manner within 

In [10]:
# Split the text held in `pdf` into sentences and display them.
# sentence_tokenizer is wrapped in functools.lru_cache, so re-running this
# cell with an unchanged `pdf` returns the cached result.
sentences = sentence_tokenizer(pdf)
sentences

['   © The Wolfsberg Group 2019 1  Wolfsberg Guidance on Customer Tax Evasion           Wolfsberg Guidance on Customer Tax Evasion 1.',
 'Introduction  The Wolfsberg Group1 is pleased to publish guidance on how Financial Institutions (FIs) can mitigate and manage the risks associated with money laundering in the form of customer tax evasion (‘Guidance’).',
 'For the purposes of this guidance, tax evasion relates to tax related criminal offences prescribed by laws of a jurisdiction and considered to be a predicate offence to money laundering.',
 'Tax evasion generally includes the deliberate concealment or misrepresentation of beneficial ownership of assets, income and gains, or otherwise fraudulent conduct, designed to divert money from the public revenue.',
 'Other forms of tax related risks, such as tax planning, avoidance and non-compliance falling short of criminal liability, will not be covered in this paper.',
 'Tax planning involves organising one’s affairs in the most tax effic

In [11]:
# Column-width display setting used for the sentence table; the rendered
# table in this notebook shows the sentences untruncated with this value.
# NOTE(review): confirm that 0 still means "no truncation" on the installed
# pandas version.
pd.set_option('display.max_colwidth', 0)

# One row per sentence, in a single default-named column.
df = pd.DataFrame(sentences)

In [12]:
# Preview the first 20 rows of the sentence table.
df.head(20)

Unnamed: 0,0
0,© The Wolfsberg Group 2019 1 Wolfsberg Guidance on Customer Tax Evasion Wolfsberg Guidance on Customer Tax Evasion 1.
1,Introduction The Wolfsberg Group1 is pleased to publish guidance on how Financial Institutions (FIs) can mitigate and manage the risks associated with money laundering in the form of customer tax evasion (‘Guidance’).
2,"For the purposes of this guidance, tax evasion relates to tax related criminal offences prescribed by laws of a jurisdiction and considered to be a predicate offence to money laundering."
3,"Tax evasion generally includes the deliberate concealment or misrepresentation of beneficial ownership of assets, income and gains, or otherwise fraudulent conduct, designed to divert money from the public revenue."
4,"Other forms of tax related risks, such as tax planning, avoidance and non-compliance falling short of criminal liability, will not be covered in this paper."
5,"Tax planning involves organising one’s affairs in the most tax efficient manner within the intent of the law and, typically, with an honest belief that it is a legal method of reducing a tax liability."
6,"Tax avoidance refers to conduct that, while still within the letter of the law, generally involves the deliberate exploitation of weaknesses in the tax system."
7,"In recent years, tax evasion, and the facilitation thereof, have gained greater global prominence from tax authorities, regulators and the public."
8,"This increase in prominence has led to FIs seeking to enhance their focus on their customers’ tax affairs, as well as conduct by persons acting on the FIs’ behalf that may constitute the facilitation of tax evasion."
9,While tax evasion facilitation risk is distinct from the underlying predicate offence (i.e.
