# Extracting Keywords using Natural Language Processing (NLP)

First we load all the necessary Python libraries (i.e., nltk and textract)

In [10]:
import nltk
from nltk.corpus import stopwords
import textract
import re

from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
import io


### Setting Options and Loading the Document to Parse

In [27]:
stop = stopwords.words('english') # Setting Stopwords for English Language
text = textract.process("./Hartford invoice.pdf") # Loading a PDF-Document and converting it into plain text

In [18]:
# Converting the PDF into a JPEG
tool = pyocr.get_available_tools()[0]
lang = tool.get_available_languages()[0]
req_image = []
final_text = []

image_pdf = Image(filename="./Hartford invoice.pdf", resolution=300)
image_jpeg = image_pdf.convert('jpeg')

for img in image_jpeg.sequence:
    img_page = Image(image=img)
    req_image.append(img_page.make_blob('jpeg'))

In [45]:
# Run OCR
for img in req_image: 
    txt = tool.image_to_string(
        PI.open(io.BytesIO(img)),
        lang=lang,
        builder=pyocr.builders.TextBuilder()
    )
    final_text.append(txt)

In [46]:
text = str(final_text[0])
print(text)

Insurance Bill Page 1

Pay Online: www.thehartford.comlservic_g£r_rt_e£
For Customer Service Call:1-866—467-8730
7 am. to 7 pm. Central Time (Mon — Fri)

 

THE
HARTFORD

Billing Company:
Hartford Fire Insurance Company

 

Report Claims 24 hours a day. 1-800—3273636

 

Bill Date: 03I24I17 Billing Account #: 15092109

[To Pay in Full: $529.54 jMinimum Due: $31.24 {Due Date: 04l13l17 ]

By paying the Pay in Full amount shown above. you will not pay the service fee assessed on this bill. If your payment is not received
by the due date. a late fee of $10.00 will be assessed.

 

 

Named insured: CCAR CONSULTANTS LLC
Your Agent: NUTMEG INSURANCE AGENCY INCIPHS

004306 1/1

 

 

ACCOUNT SUMMARY IMPORTANT MESSAGES

Previous Account Balance . Thank you for placing your insurance with The Hartford.
Payments & Adjustments $53.46

Premium Activity $583.00

New Fee(s) $7.00

Account Balance $536.54

 

l TRANSACTION DETAILS (since your last bill)

 

 

 

 

 

 

 

 

Transaction i 1 l— WWW

### Processing the data and applying NLP

In [47]:
# Segmenting the Text in order to return an array with sentences
text = ' '.join([i for i in text.split() if i not in stop])
sentences = nltk.sent_tokenize(text)

In [48]:
# Tokenize each sentence in order to break each sentence into an array of words
sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [49]:
# POS Tag each word in each sentence
sentences = [nltk.pos_tag(sent) for sent in sentences]

In [51]:
# Functions to parse types of keywords
def extract_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]

def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

def extract_names(document):
    names = []
    for tagged_sentence in sentences:
        for chunk in nltk.ne_chunk(tagged_sentence):
            if type(chunk) == nltk.tree.Tree:
                if chunk.label() == 'PERSON':
                    names.append(' '.join([c[0] for c in chunk]))
    return names

def extract_locations(document):
    locations = []
    for tagged_sentence in sentences:
        for chunk in nltk.ne_chunk(tagged_sentence):
            if type(chunk) == nltk.tree.Tree:
                if chunk.label() == 'GPE':
                    locations.append(' '.join([c[0] for c in chunk]))
    return locations

def extract_dates(string):
    r = re.compile(r'[A-Z]\w+\s\d+')
    return r.findall(string)    

### Execution and Results 

In [52]:
names = extract_names(text)
locations = extract_locations(text)
dates = extract_dates(text)

In [55]:
print(names)
print(locations)
print(dates)

['Insurance', 'Bill', 'Hartford Fire Insurance Company Report', 'Bill Date', 'Full', 'Due Date', 'Pay Full', 'Fee Dale', 'Account Number', 'Changes Amount']
['Thank', 'New', 'Check', 'Full']
['Page 1', 'Claims 24', 'INCIPHS 004306', 'Transaction 1', 'Dale 1', 'Description 1', 'Activity 03124117', 'Fee 57', 'NJPLIGA 02', 'Owners 52', 'Businas 02', 'Compensation 5222', 'Fund 02', 'You 621', 'TOTALS 653', 'AB 01', 'Box 660916', 'TX 75266', 'NJ 08852']
