In [1]:
# Imports
import HSToken as tokenizer
import HSGoogleHelper as googleAPI
import HSUtilCSV as csvHelper
import HSDocClassifier as classifier

import unicodedata     
import nltk


In [2]:
def extract_features(image):
    """Build the feature dictionary for one document image.

    The image is handed to the Google text-detection helper. The detected
    text is reduced to ASCII, tokenized, run through the project tokenizer's
    item recognition and stem/tag steps, and the resulting counts are
    combined with the text-area size counts.

    Args:
        image: Image reference forwarded unchanged to
            ``googleAPI.get_text_from_google`` (the notebook passes a file
            path string).

    Returns:
        dict: Counts keyed by ``'product_ids'``, ``'numbers'``,
        ``'invoice_ids'``, ``'nouns'``, ``'verbs'``, ``'adjectives'``,
        ``'conjunctions'``, ``'small_blocks'``, ``'med_blocks'``,
        ``'large_blocks'`` and ``'total_words'``.
    """
    # Call Google API
    google_detection = googleAPI.get_text_from_google(image)
    text_areas = googleAPI.collect_text_areas(google_detection)
    count_low, count_med, count_high = googleAPI.count_text_areas_by_size(text_areas)

    # Tokenize and turn into Features.
    # NFKD decomposition followed by an 'ignore' ASCII encode drops every
    # character that has no ASCII form.
    # NOTE(review): on Python 3 ``.encode`` yields ``bytes`` - confirm that
    # tokenizer.tokenize accepts that type before porting.
    text = unicodedata.normalize('NFKD', google_detection.text).encode('ascii', 'ignore')

    # tokenize() returns a second value that is not needed here: the
    # 'numbers' feature is the count reported by count_types() below.
    tokens, _ = tokenizer.tokenize(text)
    # Word total is taken before recognize_my_items() replaces `tokens`.
    total_len = len(tokens)
    tokens, count_prod, count_invoice = tokenizer.recognize_my_items(tokens)
    # Only the tagged tokens are consumed; the stemmed tokens are discarded.
    _, tagged_tokens = tokenizer.stem_and_tag(tokens)

    nouns, verbs, adjectives, conjunctions, numbers = tokenizer.count_types(tagged_tokens)

    features = {}
    features["product_ids"] = count_prod
    features['numbers'] = numbers
    features['invoice_ids'] = count_invoice
    features['nouns'] = nouns
    features['verbs'] = verbs
    features['adjectives'] = adjectives
    features['conjunctions'] = conjunctions
    features['small_blocks'] = count_low
    features['med_blocks'] = count_med
    features['large_blocks'] = count_high
    features['total_words'] = total_len
    return features

In [3]:
# Extract features from a sample email image and display the resulting dict.
# Alternative sample input (swap in to try it):
#features = extract_features("../RPA/invoice.jpg")
features = extract_features("./emails/email01.png")
features

{'adjectives': 35,
 'conjunctions': 0,
 'invoice_ids': 0,
 'large_blocks': 12,
 'med_blocks': 4,
 'nouns': 82,
 'numbers': 2,
 'product_ids': 0,
 'small_blocks': 1,
 'total_words': 142,
 'verbs': 13}

In [4]:
# Show the nested-list form that classifier.format_features produces from the
# feature dict (displayed for inspection).
# NOTE(review): formatted_features is not referenced by any later cell shown
# in this notebook - presumably classify() formats internally; confirm.
formatted_features = classifier.format_features(features)
formatted_features

[[0, 2, 0, 82, 13, 35, 0, 1, 4, 12, 142]]

In [5]:
# Class names handed to classifier.classify().
# NOTE(review): presumably the order must match the class indices the saved
# model was trained with - confirm against the training code.
labels = ['email','invoice']

In [6]:
# Classify the document; classify() returns the predicted label and a
# confidence value.
# NOTE(review): the raw `features` dict is passed here, not
# `formatted_features` - confirm classify() applies format_features itself.
prediction, confidence = classifier.classify(features, labels)

INFO:tensorflow:Restoring parameters from ./models/classifier.ckpt


In [7]:
# Report the predicted label together with its confidence value.
print("Predicted class '{}' with {} confidence".format(prediction, confidence))

Predicted class 'email' with 0.987806379795 confidence
