In [53]:
# get receipt data

import json

# Load the OCR results. The encoding is given explicitly so that non-ASCII
# characters in the receipt text (e.g. "Ø") are decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("ocr_results.json", encoding="utf-8") as thefile:
    receipts = json.load(thefile)

In [60]:
# search patterns

import re

# Sentinel strings used as the "nothing found" result of the extractors below.
NO_NAME = "_no_name_found_"
# NOTE(review): the value is misspelled ("timestapmp"). It is runtime data, so
# it is left unchanged here; confirm nothing compares against the literal
# before correcting it.
NO_TIMESTAMP = "_no_timestapmp_found_"
NO_ITEMS_FOUND = "_no_items_found_"

def getStoreName(str):
  """Return the label of the first store chain whose marker occurs in the text.

  Parameters:
    str: the receipt text to search (the name shadows the builtin, but it is
      kept because it is part of the function's signature).

  Returns:
    One of the chain labels listed below, or "_no_name_found_" when none of
    the markers occurs. Matching is case-sensitive, and the markers are tried
    in the order they are listed, so the first listed chain wins when several
    markers are present.
  """
  # (label, regex) pairs in priority order
  chains = (
    ("KIWI", r"KIWI"),
    ("SPAR", r"SPAR"),
    ("REMA", r"REMA"),
    ("BUNNPRIS", r"BUNNPRIS"),
    ("COOP", r"COOP"),
    ("MENY", r"MENY"),
    ("LEROYMAT", r"LERØY"),
    ("EUROPRIS", r"EUROPRIS"),
  )
  hits = (label for label, marker in chains if re.search(marker, str))
  return next(hits, "_no_name_found_")

def _buildTimeStampPatterns():
  """Compile the timestamp regexes, in the order getTimeStamp tries them."""
  # day 01-31
  DD = r"0[1-9]|[12][0-9]|3[01]"
  # month 01-12
  MM = r"0[1-9]|1[0-2]"
  # two-digit year 00-29, or the same prefixed with the century "20"
  YY = r"[0-2][0-9]"
  YYYY = "20" + YY
  # hour 00-23
  hh = r"[01][0-9]|2[0-3]"
  # minutes and seconds 00-59
  mm = ss = r"[0-5][0-9]"

  # Raw f-strings keep the regex escapes (\s, \\) intact, and every
  # alternation is wrapped in (?:...) so "|" cannot leak into its neighbours.
  # Day-first dates: DD.MM.YY, DD.MM.YYYY, DD/MM/YYYY (a space is accepted as
  # a separator as well).
  DDMMYY_YY = rf"(?:{DD})[ ./](?:{MM})[ ./](?:{YYYY}|{YY})"
  # Year-first dates: YYYY-MM-DD; "/" and "\" are tolerated as separators too.
  YYYYMMDD = rf"(?:{YYYY})[-/\\](?:{MM})[-/\\](?:{DD})"
  # Time of day after at least one whitespace character: HH:MM with optional
  # seconds; "." is accepted in place of ":".
  hhmm_ss = rf"\s+(?:{hh})[:.](?:{mm})(?:[:.](?:{ss}))?"

  return (
    re.compile(DDMMYY_YY + hhmm_ss),
    re.compile(YYYYMMDD + hhmm_ss),
  )


# Built once instead of on every call.
_TIMESTAMP_PATTERNS = _buildTimeStampPatterns()


def getTimeStamp(str):
  """Return the first date-and-time stamp found in the receipt text.

  Recognised formats (any amount of whitespace between date and time):
    DD.MM.YY HH:MM
    DD.MM.YY HH:MM:SS
    DD.MM.YYYY HH:MM:SS
    DD/MM/YYYY HH:MM
    YYYY-MM-DD HH:MM

  Parameters:
    str: the receipt text to search.

  Returns:
    The matched substring exactly as it appears in the text, or NO_TIMESTAMP
    when no pattern matches. Day-first dates are tried before year-first
    dates, so a day-first stamp anywhere in the text takes precedence.
  """
  for pattern in _TIMESTAMP_PATTERNS:
    match = pattern.search(str)
    if match:
      return match[0]
  return NO_TIMESTAMP

def getItems(str):
  """Extract (item text, price) pairs from the receipt text.

  A line is treated as an item when it contains some text, a space, a one- or
  two-digit percentage ("15%"), an optional whitespace character and a price
  with up to three integer digits, a "," or "." and two decimals.

  Parameters:
    str: the receipt text to search.

  Returns:
    A list of (item text, price) string tuples in order of appearance, or the
    NO_ITEMS_FOUND sentinel when nothing matches. The sentinel is a plain
    string, so callers should check for it before iterating the result.
  """
  # Raw string so \d and \s reach the regex engine untouched. The decimals are
  # any two digits (the previous "\d0" only accepted prices ending in 0).
  ITEM_MVA_COST = r"(.*) \d?\d%\s?(\d{0,3}[,.]\d{2})"
  entry = re.findall(ITEM_MVA_COST, str)

  return entry if entry else NO_ITEMS_FOUND



In [61]:
# Spot-check the three extractors on one receipt (index 66, text at position 1);
# each result is printed on its own line.
print(*(extract(receipts[66][1]) for extract in (getStoreName, getTimeStamp, getItems)), sep="\n")


KIWI
02.04.22 19:26
[('KNEIPP 7506 FIRST PRICE', '6,90'), ('SØRLANDSIS KROKAN 2L', '39,90'), ('KOKESJOKOLADE LYS 1006', '9,40'), ('HARSPRAY 300ML F.PRICE', '12,40'), ('Bete US U/SUKKER BX', '6,90'), ('TACOSAUS STERK 2306', '9,90'), ('TONATER HAKKEDE 3906 F.', '8,40')]


In [64]:
# Group the parsed receipts by detected store name:
# {store name: [one record dict per receipt, in input order]}
per_store = {}

for receipt in receipts:
  # position 0 is used as the source filename, position 1 as the receipt text
  filename, receipt_str = receipt[0], receipt[1]

  name = getStoreName(receipt_str)
  time_stamp = getTimeStamp(receipt_str)
  items = getItems(receipt_str)

  if name not in per_store:
    per_store[name] = []
  per_store[name].append({
    "ts": time_stamp,
    "name": name,
    "filename": filename,
    "items": items,
  })

In [73]:
# Rank the stores by the number of receipts attributed to each, largest first
# (ties keep the order in which the stores were first seen).
by_name = sorted(per_store.items(), key=lambda pair: len(pair[1]), reverse=True)

print(f"receits n={len(receipts)}")

# one (store, receipt count) tuple per line
for name, data in by_name:
  print((name, len(data)))

receits n=974
('KIWI', 458)
('REMA', 150)
('SPAR', 112)
('_no_name_found_', 80)
('MENY', 75)
('COOP', 70)
('BUNNPRIS', 25)
('LEROYMAT', 4)


In [83]:
# Dump every receipt attributed to one store, to eyeball the extraction.
for name, data in by_name:
  if name != "LEROYMAT":
    continue
  print(f"---- {name} ----")
  for entry in data:
    print(f"----{entry['ts']}----")
    if isinstance(entry["items"], str):
      # getItems returns a sentinel string when nothing matched; print it
      # whole instead of iterating over it character by character.
      print(entry["items"])
      continue
    for item in entry["items"]:
      print(item)


---- LEROYMAT ----
----01.12.22 16:20----
('+ Pant', '2,00')
('ALOHA TROPISK IPA 0,33L', '41,90')
('+ Pant', '2,00')
----16.04.21 15:52----
('DRAUMUR LAKRIS&MELKESJO', '16,90')
('SCONES M/ROSIN', '35,00')
('SMOOTHIE 75CL GUAVA&GOJ', '36,90')
('+ Pant', '3,00')
----02.12.20 12:34----
('HOLIDAY HAZE 0,33L BX', '44,90')
('JARRULL VEGETAR 80G 12s', '49,20')
----19.12.19 18:01----
('+ Pant', '2,00')
('KLOKK&CO JUICY IPA 0,33', '39,90')
