# Python NLTK library demo

### First, import the dependencies



In [None]:
from PIL import Image
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer


## Let's look at an example invoice

In [None]:

# Open the sample invoice image; the relative path is resolved against the
# current working directory.
raw_invoice_image_path = 'invoice.jpg'
raw_invoice_image = Image.open(raw_invoice_image_path)
# Bare expression at the end of the cell so the notebook renders the image.
raw_invoice_image

## After some OCR text scraping, we assume the data is in an unformatted text file.

### Here I've manually created the file; OCR is outside the scope of this demo.

In [None]:
# Open a second image, 'ocr_scraped.png' (presumably a picture of the scraped
# text -- confirm against the file itself).
scraped_image_path = 'ocr_scraped.png'
scraped_image = Image.open(scraped_image_path)
# Bare expression at the end of the cell so the notebook renders the image.
scraped_image

## Let's read the text file and tokenize it

In [None]:
def readFile(path):
    """Return the full contents of the text file at *path* as one string.

    The file is opened in text mode ("r") with the platform's default
    encoding and is closed before this function returns.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager guarantees the handle is released even when
    # read() raises; the previous version never closed the file.
    with open(path, "r") as file:
        return file.read()

# Read the scraped text from disk and split it into tokens with
# nltk.word_tokenize.
text = readFile('scraped_text.txt')
tokens = nltk.word_tokenize(text)
# Bare expression at the end of the cell so the notebook shows the token list.
tokens


### Let's clean the data by removing punctuation, stop words, and very short words.  In this case I've also chosen to remove tokens that are purely numeric, which may or may not be the right choice.

In [None]:
def tokenize_alpha_num(text):
    """Split *text* into tokens made only of word characters.

    Each maximal run of word characters (the regex word class) becomes one
    token, so punctuation and other symbols never appear in the result.
    """
    return RegexpTokenizer(r'\w+').tokenize(text)
                                
def remove_stop_words(tokens):
    """Return *tokens* with English stop words removed.

    Order and original casing of the surviving tokens are preserved.

    NLTK's English stop-word list is lowercase, so each token is lowercased
    for the membership test only. Without that, capitalised stop words such
    as "The" or "And" would pass through the filter untouched.
    """
    # Build the set once so each membership test is a constant-time lookup.
    swords = set(stopwords.words('english'))
    return [t for t in tokens if t.lower() not in swords]

def remove_short_words(tokens, min):
    """Keep only the tokens whose length is at least *min* characters.

    The parameter name shadows the builtin ``min`` inside this function; it
    is left as-is so existing keyword callers keep working.
    """
    return [word for word in tokens if not len(word) < min]

def remove_digits(tokens):
    """Drop every token that consists solely of digit characters.

    Tokens that mix digits with other characters (and empty strings, for
    which ``str.isdigit`` is false) are kept.
    """
    return [word for word in tokens if not word.isdigit()]

# Re-tokenize from the raw text (discarding the earlier word_tokenize result),
# then apply the cleaning steps in sequence.
tokens = tokenize_alpha_num(text)
tokens = remove_stop_words(tokens)
# Keep only tokens of three or more characters.
tokens = remove_short_words(tokens, 3)
tokens = remove_digits(tokens)
# Bare expression at the end of the cell so the notebook shows the cleaned list.
tokens


## Here we have an opportunity to use machine learning to recognize tokens as items that are relevant to the customer's business.  For example, we could identify tokens in the format PXXXX as Product IDs in our system.

### NOTE that I'm not really doing machine learning here, only faking it.

### To simulate this, I'll cheat and use simple functions to replace product numbers with 'PRODUCTID' and invoice numbers with 'INVOICEID'.

In [None]:
def is_product_id(token):
    """Report whether *token* has the product-ID shape.

    A match is exactly five characters: an uppercase 'P' followed by four
    characters that ``str.isdigit`` accepts.
    """
    if len(token) != 5:
        return False
    prefix, number = token[:1], token[1:]
    return prefix == 'P' and number.isdigit()

def is_invoice_id(token):
    """Report whether *token* has the invoice-ID shape.

    A match is exactly eight characters: the uppercase prefix 'INV' followed
    by five characters that ``str.isdigit`` accepts.
    """
    if len(token) != 8:
        return False
    prefix, number = token[:3], token[3:]
    return prefix == 'INV' and number.isdigit()

def recognize_my_items(tokens):
    """Replace recognised IDs in *tokens* with generic placeholder labels.

    Product IDs become "PRODUCTID", invoice IDs become "INVOICEID", and every
    other token is passed through unchanged. Returns a new list of the same
    length and order as the input.
    """
    def label_for(token):
        # The product check runs first, matching the original precedence.
        if is_product_id(token):
            return "PRODUCTID"
        if is_invoice_id(token):
            return "INVOICEID"
        return token

    return [label_for(token) for token in tokens]

# Swap concrete product/invoice IDs for their placeholder labels.
tokens = recognize_my_items(tokens)
# Bare expression at the end of the cell so the notebook shows the result.
tokens

## We can use NLTK to tag the items as nouns, verbs, etc.

In [None]:
# Attach a part-of-speech tag to every token.
tagged = nltk.pos_tag(tokens)
# NOTE(review): if this is a single notebook cell, only the final expression
# below is displayed, so this bare `tagged` would produce no output.
tagged

# Run NLTK's named-entity chunker over the tagged tokens and show only the
# element at index 1 of the result.
entities = nltk.ne_chunk(tagged)
entities[1]


# Stem the tokens so we can later combine terms based on their root words

In [None]:
def stem_tokens(tokens):
    """Return a new list with every token reduced to its English Snowball stem."""
    # One stemmer instance is shared across all tokens in this call.
    stem = SnowballStemmer("english").stem
    return [stem(word) for word in tokens]

# Replace the token list with its stemmed form.
tokens = stem_tokens(tokens)
# Bare expression at the end of the cell so the notebook shows the stems.
tokens

## Let's try to retag the stemmed items.

In [None]:
# Re-run the part-of-speech tagger, this time on the stemmed tokens; this
# overwrites the earlier `tagged` result.
tagged = nltk.pos_tag(tokens)
tagged


## It did an okay job, but not perfect.  For example it thinks 'ship' is a Noun, when in this context it's a Verb. 

## Let's look at what these tags mean.

In [None]:

# Print NLTK's built-in reference for the Penn Treebank tag set, so the tag
# abbreviations in the output above can be looked up.
nltk.help.upenn_tagset()

## Let's combine like terms and sort

In [None]:
def combine_tokens(tokens):
    """Count how many times each distinct item occurs in *tokens*.

    Args:
        tokens: Iterable of hashable items, e.g. plain strings or
            (word, tag) tuples.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, keyed in order of first appearance.
    """
    # Function-scope import keeps this cell self-contained.
    from collections import Counter

    # Counter replaces the hand-rolled tally, which also shadowed the builtin
    # `dict`. Convert back to a plain dict so callers get the same return type.
    return dict(Counter(tokens))

# Count occurrences of each element of `tagged` (the pos_tag output), so a
# token is counted separately for each distinct tag it received.
tokens_dict = combine_tokens(tagged)
# Show the (item, count) pairs ordered from most to least frequent.
sorted(tokens_dict.items(), key=lambda x: x[1], reverse=True)

## These tokens are our features that we should use to classify the document type against ones that we already know about