In [1]:
from google.cloud import vision
from google.cloud import datastore
from google.oauth2 import service_account
import io
import os # for use with setting env variables
import re
from datetime import datetime


In [None]:
""" TODO
* Discount line logic
* Create an interface where the user can manually give an item its category:
    * Remember ID and update the category_id, keep other values as the same
"""

In [22]:
# When True, the OCR and parsing helpers print verbose tracing of what they find.
debug = False
# When True, query_preparation prints the receipt lines it could not parse.
developing = True
# When True, detect_text reads the image bytes from a local file; otherwise the
# path is handed to the Vision API as an image URI.
local_run = True
# Service-account key file used to build the Google Cloud credentials below.
# NOTE(review): absolute, machine-specific path — consider reading it from an
# environment variable or a config file so the notebook runs on other machines.
key_path = "/Volumes/GoogleDrive/My Drive/00. My Documents/03. Internt/24. Expense analyzer/config_files/expense-analyzer-260008-0cac2ecd3671.json"

In [24]:
# Only used in dev environment: authenticate with an explicit service-account key.
# The key path is also exported through GOOGLE_APPLICATION_CREDENTIALS.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
# Build credentials from the key file, scoped to the Cloud Platform API.
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
# Shared Datastore client, used by fetch_item_category and writeToDatastore.
datastore_client = datastore.Client(
    credentials=credentials
)
# Shared Vision client, used by detect_text.
vision_client = vision.ImageAnnotatorClient(
    credentials=credentials
)

In [4]:
# Minimum vertical overlap (as a fraction of a text box's own height) that
# allocate_lines_coop requires before it places the box on an existing line.
OVERLAPPING_ALLOCATION_THRESHOLD = 0.3

# The entity kind in datastore to query to find previous assignments
DATASTORE_KIND_CATEGORY_ASSIGNMENT = "category_item_mapping"

In [10]:
def crop_and_rotate(path):
    """Placeholder for image pre-processing; not implemented yet.

    :param path: currently ignored
    :return: always the constant 1
    """
    return 1


def detect_text(path):
    """Run Google Vision text detection on a receipt image.

    :param path: location of the image. When the module-level ``local_run``
                 flag is true, this is a local file whose bytes are sent to
                 the API; otherwise it is passed on as an image URI.
    :return: the response object returned by ``vision_client.text_detection``,
             unmodified
    """
    if not local_run:
        # Remote image: the Vision service fetches it from the URI itself
        image = {'source': {'image_uri': path}}
    else:
        # Local image: read the raw bytes and wrap them in a Vision image
        with io.open(path, 'rb') as image_file:
            content = image_file.read()
        image = vision.types.Image(content=content)

    response = vision_client.text_detection(image=image)
    annotations = response.text_annotations

    if debug:
        # Dump every detected text together with the corners of its bounding box
        print('Texts:')
        for annotation in annotations:
            print('\n"{}"'.format(annotation.description))
            corners = ','.join(
                '({},{})'.format(corner.x, corner.y)
                for corner in annotation.bounding_poly.vertices
            )
            print('bounds: {}'.format(corners))

    return response

In [11]:
#  ### PARSER FOR COOP
def article_lines_coop(response):
    """Locate the item region of a Coop receipt and read its header line.

    The item region lies between the bottom edge of the text containing
    "Salgskv" and the top edge of the text containing "Totalt". Any annotation
    longer than 100 characters is treated as the full receipt text; the header
    (receipt id, date and time) is the rest of the line after
    "Salgskvittering" in that text.

    :param response: Vision response whose ``text_annotations`` entries carry
                     a ``description`` and a ``bounding_poly.vertices`` list
    :return: tuple ``(relevant_items, receipt_id_and_datetime)``:
             the annotations whose first vertex lies strictly inside the item
             region, and the stripped header text ("" when the header marker
             is not present in the full text)
    """
    search_term = "Salgskvittering"

    start_y_coordinate = -1     # y below which item lines start
    end_y_coordinate = -1       # y above which item lines end
    receipt_id_and_datetime = ""

    for text in response.text_annotations:
        description = text.description
        if len(description) > 100:
            # Full receipt text: the header follows the search term on the same line
            term_index = description.find(search_term)
            if term_index == -1:
                # Marker missing: do not slice at a bogus offset, keep what we have
                continue
            remainder = description[term_index + len(search_term):]
            # Cut at the next newline; if there is none, the header runs to the end
            receipt_id_and_datetime = remainder.split("\n", 1)[0].strip()
        elif "Salgskv" in description:
            start_y_coordinate = max(vertex.y for vertex in text.bounding_poly.vertices)
            if debug:
                print("Found starting point at {}, after text {}".format(start_y_coordinate, description))
        elif "Totalt" in description:
            end_y_coordinate = min(vertex.y for vertex in text.bounding_poly.vertices)
            if debug:
                print("Found ending point at {}, after text {}".format(end_y_coordinate, description))

    # Keep only the annotations whose first vertex is strictly between start and end
    relevant_items = [
        text for text in response.text_annotations
        if start_y_coordinate < text.bounding_poly.vertices[0].y < end_y_coordinate
    ]

    return relevant_items, receipt_id_and_datetime


# Key = line_number, value = item
# Idea: For each bounding box, calculate the mid y coordinate. If this coordinate is inside the bounding box of
# another, then these are on the same line.
def allocate_lines_coop(items):
    """
    Group text boxes into receipt lines based on their vertical position.

    A box joins an existing line when its vertical midpoint lies within the
    first box of that line, or when it overlaps the box most recently added to
    that line by more than OVERLAPPING_ALLOCATION_THRESHOLD (measured as a
    fraction of the new box's own height). Lines are tried in creation order;
    a box that fits none of them opens a new line.

    :param items: list of text boxes exposing ``description`` and
                  ``bounding_poly.vertices`` (vertex 0 and vertex 3 supply the
                  upper and lower y coordinate)
    :return: dict mapping line number to the list of boxes on that line,
             each list sorted by the x coordinate of the box's first vertex
    """
    receipt_lines = {}

    for item in items:
        corners = item.bounding_poly.vertices
        top_y = corners[0].y
        bottom_y = corners[3].y
        centre_y = (top_y + bottom_y) / 2
        box_height = bottom_y - top_y

        # Overlap fractions start below any threshold; they are not reset
        # between candidate lines.
        overlap_below = -1
        overlap_above = -1
        target_line = None

        for line_id, members in receipt_lines.items():
            # Midpoint test against the first box allocated to this line
            anchor = members[0]
            anchor_top = anchor.bounding_poly.vertices[0].y
            anchor_bottom = anchor.bounding_poly.vertices[3].y
            if anchor_top <= centre_y <= anchor_bottom:
                target_line = line_id
                break

            # Overlap test against the box most recently added to this line,
            # to tolerate slightly skewed or squished pictures
            latest = members[-1]
            latest_top = latest.bounding_poly.vertices[0].y
            latest_bottom = latest.bounding_poly.vertices[3].y

            if latest_top <= top_y <= latest_bottom:
                # The top edge of the box falls inside the line: box hangs below it
                overlap_below = float(latest_bottom - top_y) / box_height
                if debug:
                    print("Found {}% overlap under between text {} and {} on line {}".format(overlap_below, item.description, anchor.description, line_id))

            if latest_top <= bottom_y <= latest_bottom:
                # The bottom edge of the box falls inside the line: box sits above it
                overlap_above = float(bottom_y - latest_top) / box_height
                if debug:
                    print("Found {}% overlap over between text {} and {} (first item) on line {}".format(overlap_above, item.description, anchor.description, line_id))

            if overlap_below > OVERLAPPING_ALLOCATION_THRESHOLD or overlap_above > OVERLAPPING_ALLOCATION_THRESHOLD:
                target_line = line_id
                break

        if target_line is None:
            # Nothing matched (or no lines exist yet): open the next line number
            target_line = len(receipt_lines)
            receipt_lines[target_line] = []
        receipt_lines[target_line].append(item)

    # Order the boxes of every line from left to right
    for members in receipt_lines.values():
        members.sort(key=lambda box: box.bounding_poly.vertices[0].x)

    return receipt_lines


def lines_to_text(receipt_lines):
    """
    Flatten allocated receipt lines into plain text.

    :param receipt_lines: dictionary keyed by line_id whose values are lists
            of objects exposing the detected text as ``description``
    :return: one string per line, in dictionary order: the descriptions
             joined by single spaces, lower-cased and stripped of surrounding
             whitespace
    """
    return [
        " ".join(box.description for box in receipt_lines[line_id]).lower().strip()
        for line_id in receipt_lines
    ]

In [161]:
def query_preparation(receipt_text):
    """Turn receipt text lines into a list of item dictionaries.

    Lines are processed in order. A line ending in a price starts a new item;
    a following "antall" (quantity) or "rabatt" (discount) line updates the
    most recently started item. Lines containing "artikler" are skipped.

    :param receipt_text: list of text strings containing article text and price
    :return: a list of dict line items, with keys:
            item_name: The full item name
            item_count: The number of items (1 unless an "antall" line follows)
            price_gross: The total price before discount
            price_net: The total price after discount
            unit_price_net: The unit price after discount
            discount_amt: The total discount (only when a discount line follows)
            discount_type: Currently always "percentage" when a discount is found
    """
    # Price at the end of a line: one or more digits, any single character,
    # one or more digits, then end of line.
    # NOTE(review): the separator "." is unescaped, so it matches any
    # character, not only a decimal point — confirm whether r"\." was intended.
    price_pattern = r"(\d+.\d+)$"
    articles_querified = []

    # The item that "rabatt"/"antall" lines are attached to
    curr_item_line = {}

    for article in receipt_text:
        if debug:
            print("------------")
            print(article)

        if "rabatt" in article:
            discount_line = article.split(" ")
            # Will have the structure: ['rabatt:', 'nok', '13.16', '(40%', 'av', '32.90)']

            if len(discount_line) >= 3:
                # A line starting with "Rabatt" belongs to the last item identified
                curr_item_line["discount_amt"] = discount_line[2]
                curr_item_line["discount_type"] = "percentage"
            else:
                print("Found an unknown discount format: {}".format(article))

            if debug:
                print("Found a discount line")
                print(article.split(" "))

        elif "antall" in article:
            antall_line = article.split(" ")
            # Will have the structure: ['antall:', '2', 'stk', '1.60', 'kr/stk']

            # Validate the quantity line itself (not a previously seen discount line)
            if len(antall_line) >= 4:
                # A line starting with "antall" belongs to the last item identified
                curr_item_line["item_count"] = antall_line[1]
                curr_item_line["unit_price_net"] = antall_line[3]

            if debug:
                print("Found a antall line")
                print(article.split(" "))

        elif "artikler" not in article:
            # Every other line starts a fresh item, whether or not it parses
            curr_item_line = {}
            x = re.split(price_pattern, article, maxsplit=2)

            # On a match the split yields [name, price, ""]; otherwise one element
            if len(x) == 3:
                item = x[0].strip()
                price = x[1].strip()

                curr_item_line["item_name"] = item
                curr_item_line["price_net"] = price
                curr_item_line["price_gross"] = price
                curr_item_line["unit_price_net"] = price
                curr_item_line["item_count"] = 1

                articles_querified.append(curr_item_line)
                if debug:
                    print("len was 3")
                    print("Appending item '{}' with price '{}'. Full split is '{}'".format(item, price, x))
            else:
                # Typically weight times price per kg.
                if developing:
                    print("Unparsable line: {}".format(article))
        else:
            if developing:
                print("Unparsable line 2: {}".format(article))

    # Finally, derive the gross price of discounted items: net price + discount
    for article in articles_querified:
        if "discount_amt" in article and "price_net" in article:
            discount = article.get("discount_amt").strip()
            price_net = article.get("price_net").strip()

            try:
                discount = float(discount)
                price_net = float(price_net)
            except ValueError:
                # Leave price_gross untouched when either amount is not numeric
                print("Unable to parse either {} or {}".format(discount, price_net))
                continue

            # Rounded to two decimals and stored as a string, like the other prices
            article["price_gross"] = str(round((discount + price_net)*100)/100)

    return articles_querified

In [13]:
def fetch_item_category(item_name):
    """Look up the category previously assigned to an item.

    Queries the category-assignment kind for the entity whose key name is
    ``item_name`` and returns its ``cat_id``.

    :param item_name: item name, used as the key name of the mapping entity
    :return: the stored ``cat_id``, or 0 when no mapping entity exists
    """
    lookup = datastore_client.query(kind=DATASTORE_KIND_CATEGORY_ASSIGNMENT)

    # Restrict the query to the single entity keyed by the item name
    lookup.key_filter(
        datastore_client.key(DATASTORE_KIND_CATEGORY_ASSIGNMENT, item_name),
        '='
    )

    # At most one entity is requested; no hit means the item is unknown
    found_ids = [entity["cat_id"] for entity in lookup.fetch(limit=1)]
    return found_ids[-1] if found_ids else 0

In [155]:
def writeToDatastore(articles_querified, added_by, trans_datetime, receipt_id):
    """Store the parsed receipt items as 'transaction' entities.

    For every article the category mapping is looked up first. An item without
    a mapping (lookup returns 0) gets a mapping entity with ``cat_id`` -1
    written immediately so it can be categorised later; the transaction itself
    keeps the looked-up id. All transaction entities are written in one bulk
    call at the end.

    :param articles_querified: list of item dictionaries as produced by
                               query_preparation; missing fields default to 0
    :param added_by: stored on every entity as ``added_by``
    :param trans_datetime: stored on every entity as ``trans_date``
    :param receipt_id: stored on every entity as ``receipt_id``
    :return: none
    """
    kind = 'transaction'  # Entity kind of the stored receipt lines
    now = datetime.now()  # One registration timestamp shared by the whole batch
    pending = []          # Entities collected for the bulk upload

    for article in articles_querified:
        item = article.get("item_name", 0)
        category_id = fetch_item_category(item)
        if debug:
            print("Assignning category {} to item {}".format(category_id, item))

        if category_id == 0:
            # First time this item is seen: register it as unmapped (-1)
            mapping = datastore.Entity(
                key=datastore_client.key(DATASTORE_KIND_CATEGORY_ASSIGNMENT, item)
            )
            mapping["cat_id"] = -1
            datastore_client.put(mapping)
            print('Saved {}: {}'.format(mapping.key.name, mapping['cat_id']))

        # Partial key: only the kind is given
        task = datastore.Entity(key=datastore_client.key(kind))
        fields = (
            ('added_by', added_by),
            ('cat_id', category_id),
            ('discount_amt', article.get("discount_amt", 0)),
            ('discount_type', article.get("discount_type", 0)),
            ('item_id', 0),  # Missing
            ('item_name', item),
            ('price_gross', article.get("price_gross", 0)),
            ('price_net', article.get("price_net", 0)),
            ("item_count", article.get("item_count", 0)),
            ("unit_price_net", article.get("unit_price_net", 0)),
            ('registered_datetime', now),
            ('trans_date', trans_datetime),
            ('receipt_id', receipt_id),
        )
        for field, value in fields:
            task[field] = value

        if debug:
            print("Writting to datastore:", task)

        pending.append(task)
        print('Added {}: {}'.format(task.key.name, task['item_name']))

    # Saves multiple entities at once
    datastore_client.put_multi(pending)

In [112]:
# Candidate test images. Each assignment overwrites the previous one, so only
# the LAST path below is actually used by the cells that follow.
# NOTE(review): absolute, machine-specific paths — consider a configurable image directory.
path = r"C:\Users\NO007454\Documents\03. Internt\24. Expense analyzer\test_images\IMG_1010.JPEG"
path = r"C:\Users\NO007454\Documents\03. Internt\24. Expense analyzer\test_images\IMG_1012.JPEG"
path = "/Volumes/GoogleDrive/My Drive/00. My Documents/03. Internt/24. Expense analyzer/test_images/IMG_1012.JPEG"
path = "/Volumes/GoogleDrive/My Drive/00. My Documents/03. Internt/24. Expense analyzer/test_images/IMG_20200211.png"


In [113]:
# List the available test images (path is relative to the notebook's working directory)
os.listdir("../test_images")

['IMG_1010.JPEG',
 'IMG_1062.JPEG',
 'IMG_1044.JPEG',
 'IMG_1012.JPEG',
 'IMG_1011.JPEG',
 'IMG_0156.HEIC',
 'IMG_20200211.png']

In [162]:
# Run text detection on the selected receipt image (calls the Vision client)
response = detect_text(path)

In [163]:
# Keep only the text boxes inside the item region and read the receipt header text
relevant_lines, receipt_id_and_datetime = article_lines_coop(response)
# Group the boxes into receipt lines, sorted left to right within each line
receipt_lines = allocate_lines_coop(relevant_lines)
# Flatten each line into one lower-case text string
actual_lines = lines_to_text(receipt_lines)

In [164]:
# Parse the text lines into item dictionaries (name, prices, count, discount)
articles_querified = query_preparation(actual_lines)

Unparsable line 2: artikler) 523.02


In [165]:
# Extract the receipt id and datetime before saving to datastore
# Extract the receipt id and date before saving to datastore.
# The header text holds the id, date and time, separated by whitespace;
# splitting on any whitespace also tolerates repeated spaces from the OCR.
receipt_fields = receipt_id_and_datetime.split()
receipt_id = receipt_fields[0]
receipt_date = receipt_fields[1]
# Dotted receipt dates are written day-first (dd.mm.yyyy). A month-first format
# swaps day and month and raises ValueError for any day after the 12th.
datetime_object = datetime.strptime(receipt_date, '%d.%m.%Y')

In [166]:
# Persist the parsed items as 'transaction' entities, attributed to a test user
writeToDatastore(articles_querified, "testuser", datetime_object, receipt_id)

Added None: b.d. oyster sauce
Added None: buer potetlomper
Added None: bærepose mega
Added None: coca cola uts 1.5l
Added None: pant
Added None: coop fiber frøbrød
Added None: fullkornris 500g
Added None: gilde karbonader
Added None: gilde krydderskinke
Added None: grand. nybakt skinke
Added None: mills postei 185g
Added None: naan bread garlic
Added None: penne rigate fullkor
Added None: philadelphia 200g
Added None: plantego paprika
Added None: saritas tandoor saus
Added None: tikka masala saritas
Added None: *zend.fresh+whi.75ml


In [None]:
# Functions that have changed: query_preparation
# Need to change: writeToDatastore, populate fields based on dictionary