# Extracting Keywords using Natural Language Processing (NLP)

First we load all the necessary Python libraries (i.e., nltk and textract)

In [98]:
import nltk
from nltk.corpus import stopwords
import textract
import re

### Setting Options and Loading the Document to Parse

In [74]:
stop = stopwords.words('english') # Setting Stopwords for English Language
text = textract.process("/Users/lmeyer/Documents/Freelance Work/Proposals/letter.pdf") # Loading a PDF-Document and converting it into plain text

In [79]:
text = str(text)

### Processing the data and applying NLP

In [80]:
# Segmenting the Text in order to return an array with sentences
text = ' '.join([i for i in text.split() if i not in stop])
sentences = nltk.sent_tokenize(text)

In [81]:
# Tokenize each sentence in order to break each sentence into an array of words
sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [82]:
# POS Tag each word in each sentence
sentences = [nltk.pos_tag(sent) for sent in sentences]

In [95]:
# Functions to parse types of keywords
def extract_phone_numbers(string):
    r = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    phone_numbers = r.findall(string)
    return [re.sub(r'\D', '', number) for number in phone_numbers]

def extract_email_addresses(string):
    r = re.compile(r'[\w\.-]+@[\w\.-]+')
    return r.findall(string)

def extract_names(document):
    names = []
    sentences = ie_preprocess(document)
    for tagged_sentence in sentences:
        for chunk in nltk.ne_chunk(tagged_sentence):
            if type(chunk) == nltk.tree.Tree:
                if chunk.label() == 'PERSON':
                    names.append(' '.join([c[0] for c in chunk]))
    return names

def extract_locations(document):
    locations = []
    sentences = ie_preprocess(document)
    for tagged_sentence in sentences:
        for chunk in nltk.ne_chunk(tagged_sentence):
            if type(chunk) == nltk.tree.Tree:
                if chunk.label() == 'GPE':
                    locations.append(' '.join([c[0] for c in chunk]))
    return locations

def extract_dates(string):
    r = re.compile(r'[A-Z]\w+\s\d+')
    return r.findall(string)    

### Execution and Results 

In [None]:
names = extract_names(text)
locations = extract_locations(text)
dates = extract_dates(text)

In [99]:
print(names)
print(locations)
print(dates)

['Sam', 'Jones', 'Mr. Jones', 'Regards', 'Yours', 'Typed']
['New']
['Smith 6999', 'IL 88998', 'January 2', 'Toys 3444', 'KS 66500', 'December 20', 'Copyright 2005']
