# Extracting Keywords using Natural Language Processing (NLP)

First we load all the necessary Python libraries (i.e., nltk and textract)

In [98]:
import nltk
from nltk.corpus import stopwords
import textract
import re

### Setting Options and Loading the Document to Parse

In [102]:
stop = stopwords.words('english') # Setting Stopwords for English Language
text = textract.process("./letter.pdf") # Loading a PDF-Document and converting it into plain text

In [103]:
# Convert to String and Print scrambled text
text = str(text)
print(text)

b'Name__________________________\n\nDate_________________\n\n(Example of a Business Letter)\n\nMs. Emily Smith 6999 Main Street Chicago, IL 88998\n\n(Return Address \xc2\xad of letter writer)\n\nJanuary 2, 2000\n\n(Date: month, day, year)\n\nMr. Sam Jones President KR Toys 3444 Elm Drive Wichita, KS 66500\n\n(Inside Address \xc2\xad recipient\'s "formal" name, title, company\nname, address, city, state, zip)\n\nDear Mr. Jones:\n\n(Salutation \xc2\xad formal)\n\nThis letter is to inform you of a problem I have had with a toy that I purchased at your\n\nstore.\n\n(Body of Letter \xc2\xad purpose and details)\n\nI purchased a karaoke machine at your store on December 20, 1999. It was to be used for a New Years Eve party that my family had for some of my friends. That evening, before the party, we plugged in the machine and set it up. When I first tried to test the machine, the CD started to play sound, but there were no words on the screen. We read the directions and tried other CD\'s, bu

### Processing the data and applying NLP

In [80]:
# Segmenting the Text in order to return an array with sentences
text = ' '.join([i for i in text.split() if i not in stop])
sentences = nltk.sent_tokenize(text)

In [81]:
# Tokenize each sentence in order to break each sentence into an array of words
sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [82]:
# POS Tag each word in each sentence
sentences = [nltk.pos_tag(sent) for sent in sentences]

In [95]:
# Functions to parse types of keywords
def extract_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]

def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

def extract_names(document):
    names = []
    sentences = ie_preprocess(document)
    for tagged_sentence in sentences:
        for chunk in nltk.ne_chunk(tagged_sentence):
            if type(chunk) == nltk.tree.Tree:
                if chunk.label() == 'PERSON':
                    names.append(' '.join([c[0] for c in chunk]))
    return names

def extract_locations(document):
    locations = []
    sentences = ie_preprocess(document)
    for tagged_sentence in sentences:
        for chunk in nltk.ne_chunk(tagged_sentence):
            if type(chunk) == nltk.tree.Tree:
                if chunk.label() == 'GPE':
                    locations.append(' '.join([c[0] for c in chunk]))
    return locations

def extract_dates(string):
    r = re.compile(r'[A-Z]\w+\s\d+')
    return r.findall(string)    

### Execution and Results 

In [None]:
names = extract_names(text)
locations = extract_locations(text)
dates = extract_dates(text)