<a href="https://colab.research.google.com/github/nsajwan/line_item_extraction/blob/master/Line_Item_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bank statements Line Item Extraction
Based on Cascade TableNet - https://github.com/DevashishPrasad/CascadeTabNet 

## 1. Install all the prerequisite
This will take a while, Remember to **Restart Runtime** after this step is done 

In [None]:
!pip install torch==1.4.0+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
!pip install -q mmcv terminaltables
!git clone --branch v1.2.0 'https://github.com/open-mmlab/mmdetection.git'
%cd "mmdetection"
!pip install -r "/content/mmdetection/requirements/optional.txt"
!python setup.py install
!python setup.py develop
!pip install -r {"requirements.txt"}
!pip install pillow==6.2.1 
!pip install mmcv==0.4.3
%cd "/content"
!apt-get install poppler-utils
!pip install pdf2image
!pip install Pillow
!pip install pdfrw

## 2. Clone the Repository

In [1]:
!rm -rf line_item_extraction
!git clone https://github.com/strangest-quark/line_item_extraction.git

Cloning into 'line_item_extraction'...
remote: Enumerating objects: 172, done.[K
remote: Counting objects: 100% (172/172), done.[K
remote: Compressing objects: 100% (147/147), done.[K
remote: Total 172 (delta 73), reused 98 (delta 24), pack-reused 0[K
Receiving objects: 100% (172/172), 12.91 MiB | 4.64 MiB/s, done.
Resolving deltas: 100% (73/73), done.


## 3. Pdf to Images and XML


In [None]:
!rm -rf /content/line_item_extraction/results
import os
from pdf2image import convert_from_path, convert_from_bytes
from PIL import Image
import subprocess

pdf_directory = '/content/line_item_extraction/sample_docs'
res_directory = '/content/line_item_extraction/results'
cmd = 'pdftohtml -i -c -noframes -xml '
cmd2 = 'pdf2txt.py -t xml --line-marg 1.0 --char-margin 1.0 '
results = []

for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
      os.makedirs(res_directory+'/'+filename[:-4])
      full_cmd = cmd+pdf_directory+'/'+filename+' '+res_directory+'/'+filename[:-4] + '/' +'poppler.xml'
      #full_cmd = cmd2+ '-o ' + res_directory+'/'+filename[:-4] + '/' +'poppler.xml ' + pdf_directory+'/'+filename
      # Generate poppler xml
      !{full_cmd}
      images = convert_from_path(pdf_directory+'/'+filename)
      i = 0
      # create result dirs
      os.makedirs(res_directory+'/'+filename[:-4]+'/input_img')
      os.makedirs(res_directory+'/'+filename[:-4]+'/output_img')
      os.makedirs(res_directory+'/'+filename[:-4]+'/output_xml')
      for image in images:
        # save images
        image.save(res_directory+'/'+filename[:-4]+'/input_img/'+str(i)+'.png', format='PNG')
        i=i+1



## 3. Download the Pretrained Model


In [3]:
!gdown "https://drive.google.com/u/0/uc?id=1-QieHkR1Q7CXuBu4fp3rYrvDG9j26eFT"

Downloading...
From: https://drive.google.com/u/0/uc?id=1-QieHkR1Q7CXuBu4fp3rYrvDG9j26eFT
To: /content/epoch_36.pth
664MB [00:06, 97.4MB/s]


## 4. Run the Predictions

In [4]:
from mmdet.apis import init_detector, inference_detector, show_result
import matplotlib.pyplot as plt 
import mmcv

def show_result_pyplot(img, result, class_names, out_file, score_thr=0.3, fig_size=(15, 10)):
    print(img,"---" , class_names, "---" ,out_file,"---" , score_thr,"---" , fig_size)
    img = show_result(img, result, class_names, score_thr=score_thr, show=False, out_file=out_file)

# Load model
config_file = '/content/line_item_extraction/config/cascade_mask_rcnn_hrnetv2p_w32_20e.py'
checkpoint_file = '/content/epoch_36.pth'
# build the model from a config file and a checkpoint file
model = init_detector(config_file, checkpoint_file, device='cuda:0')

directory = '/content/line_item_extraction/results/'

results = dict()

for folder in os.listdir(directory):
  result = dict()
  for filename in os.listdir(directory+'/'+folder+'/input_img'):
      if filename.endswith(".png"):
        # Run Inference
        res = inference_detector(model, directory+'/'+folder+'/input_img/'+filename)
        result[filename[:-4]] = res
        # Visualize results - stored in /results/filename/output_img/
        try:
          show_result_pyplot(directory+folder+'/input_img/'+filename, res,('Bordered', 'cell', 'Borderless'), out_file=directory+folder+'/output_img/'+filename, score_thr=0.85)
        except:
          print(filename, folder, " Error - probably empty image")
  results[folder] = result
  print("Success! Segmentation results stored in results/filename/output_img")

/content/line_item_extraction/results/RedactedPayPal_sample3/input_img/2.png --- ('Bordered', 'cell', 'Borderless') --- /content/line_item_extraction/results/RedactedPayPal_sample3/output_img/2.png --- 0.85 --- (15, 10)
/content/line_item_extraction/results/RedactedPayPal_sample3/input_img/1.png --- ('Bordered', 'cell', 'Borderless') --- /content/line_item_extraction/results/RedactedPayPal_sample3/output_img/1.png --- 0.85 --- (15, 10)
/content/line_item_extraction/results/RedactedPayPal_sample3/input_img/0.png --- ('Bordered', 'cell', 'Borderless') --- /content/line_item_extraction/results/RedactedPayPal_sample3/output_img/0.png --- 0.85 --- (15, 10)
Success! Segmentation results stored in results/filename/output_img
/content/line_item_extraction/results/Composite_ATB_Financial_Sample2Acct/input_img/1.png --- ('Bordered', 'cell', 'Borderless') --- /content/line_item_extraction/results/Composite_ATB_Financial_Sample2Acct/output_img/1.png --- 0.85 --- (15, 10)
/content/line_item_extract

After this, use TSR postprocessing for best result

In [None]:
%cd "/content"
from line_item_extraction.border_main import border
import lxml.etree as etree
import cv2
from google.colab.patches import cv2_imshow
from mmdet.apis import inference_detector, show_result, init_detector
import glob
from line_item_extraction.cell_text import borderless

config_fname = "/content/line_item_extraction/config/cascade_mask_rcnn_hrnetv2p_w32_20e.py" 
checkpoint_path = "/content/"
epoch = 'epoch_36.pth'

model = init_detector(config_fname, checkpoint_path+epoch)

for folder in os.listdir(directory):
  for filename in os.listdir(directory+folder+'/input_img'):
      i = directory+folder+'/input_img/'+filename
      print("Full path of image i=",i)
      result = results[folder][filename[:-4]]
      try:
        result = inference_detector(model, i)
        res_border = []
        res_bless = []
        res_cell = []
        root = etree.Element("document")
        ## for border
        for r in result[0][0]:
          if r[4] > .85:
              res_border.append(r[:4].astype(int))
        ## for cells
        for r in result[0][1]:
            if r[4] > .85:
                r[4] = r[4] * 100
                res_cell.append(r.astype(int))
        ## for borderless
        for r in result[0][2]:
            if r[4] > .85:
                res_bless.append(r[:4].astype(int))

        ## if border tables detected
        if len(res_border) != 0:
            ## call border script for each table in image
            for res in res_border:
                try:
                    root.append(border(res, cv2.imread(i)))
                except:
                    pass
        if len(res_bless) != 0:
            if len(res_cell) != 0:
                for no, res in enumerate(res_bless):
                    root.append(borderless(res, cv2.imread(i), res_cell))
        print("myfile=",directory+folder+'/output_xml/'+ i.split('/')[-1][:-3] + 'xml')            
        myfile = open(directory+folder+'/output_xml/'+ i.split('/')[-1][:-3] + 'xml', "w")
        myfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        myfile.write(etree.tostring(root, pretty_print=True, encoding="unicode"))
        myfile.close()
      except:
        print("Error")

In [114]:
import lxml.etree as et
import numpy

import lxml.etree as et
import numpy

def combine_page_xmls():
  for folder in os.listdir(directory):
    document = etree.Element("document")
    for filename in sorted(os.listdir(directory+folder+'/output_xml')):
      print("filename--",filename)
      tree = et.ElementTree(file=directory+folder+'/output_xml/'+filename)
      pageNode = etree.SubElement(document, 'page')
      tables = tree.getroot().getchildren()
      for table in tables:
        pageNode.append(table)

    myfile = open(directory + folder + '/combinedOutput.xml', "w")
    myfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    myfile.write(etree.tostring(document, pretty_print=True, encoding="unicode"))
    myfile.close()
    print(etree.tostring(document, pretty_print=True).decode())

combine_page_xmls()



def getText(textbox):
  if textbox.text !='' and textbox.text is not None:
    return textbox.text
  else:
    return textbox.getchildren()[0].text


def getTextFromPdf(points, pdf, page, folder):
  found = False
  tree = et.ElementTree(file=directory+folder+'/poppler.xml')
  page = tree.getroot().getchildren()[page]
  text = page.getchildren()
  for textbox in text:
    if textbox.tag == 'text':
      top = textbox.attrib['top']
      left = textbox.attrib['left']
      width = textbox.attrib['width']
      height = textbox.attrib['height']
      #print("points1",points)
      coods = (top, left, width, height)
      #print("coods1",coods)
      coods = [float(i) for i in coods]
      #print("coods2",coods)
      points = [float(i) for i in points]
      diff = numpy.sum((numpy.asarray(points)-numpy.asarray(coods))**2)
      if diff<500:
        found = True
        txt = getText(textbox)
        return txt
  if not found:
    return found


filename-- 0.xml
filename-- 1.xml
filename-- 2.xml
<document>
  <page><table>
    <Coords points="46,1356 46,1996 1613,1996 1613,1356"/>
    <cell end-col="0" end-row="0" start-col="0" start-row="0">
      <Coords points="83,1435 83,1455 185,1455 185,1435"/>
    </cell>
    <cell end-col="1" end-row="0" start-col="1" start-row="0">
      <Coords points="220,1356 220,1459 388,1459 388,1356"/>
    </cell>
    <cell end-col="2" end-row="0" start-col="2" start-row="0">
      <Coords points="415,1438 415,1455 615,1455 615,1438"/>
    </cell>
    <cell end-col="4" end-row="0" start-col="3" start-row="0">
      <Coords points="730,1433 730,1457 851,1457 851,1433"/>
    </cell>
    <cell end-col="6" end-row="0" start-col="5" start-row="0">
      <Coords points="1543,1435 1543,1455 1624,1455 1624,1435"/>
    </cell>
    <cell end-col="0" end-row="1" start-col="0" start-row="1">
      <Coords points="84,1473 84,1489 173,1489 173,1473"/>
    </cell>
    <cell end-col="1" end-row="1" start-col="1"

In [118]:
import xml.etree.ElementTree as et
import lxml.etree as etree
from pdfrw import PdfReader
from PIL import Image


for folder in os.listdir(directory):
  try:
    tree = et.ElementTree(file=directory+folder+'/combinedOutput.xml')
  except:
    print(folder+' xml parse error')
    continue
  # if folder != 'Composite_ATB_Financial_Sample2Acct': #Composite_ATB_Financial_Sample2Acct  RedactedPayPal_sample3
  #   continue
  pages = tree.getroot().getchildren()
  foundCount = 0
  notFoundCount = 0
  bank_statement = etree.Element("bank_statement")
  for page in pages:
    for table in page:
      cells = table.getchildren()[1:]
      maxCol = 1;
      for cell in cells:
        if int(cell.attrib['end-col']) > maxCol:
          maxCol = int(cell.attrib['end-col'])
      if maxCol > 2:
        table.set("lineItems", "true")
      else:
        table.set("lineItems", "false")

  pageNo = 0
  for page in pages:
    for table in page:
      if table.attrib['lineItems'] == "true":
        line_items = etree.SubElement(bank_statement, 'line_items')
      else:
        line_items = etree.SubElement(bank_statement, 'others')
      cells = table.getchildren()[1:]
      key = ''
      startRow = 999
      headers = etree.SubElement(line_items, 'headers')
      headerStr = ""
      for cell in cells:
        coords = cell.getchildren()
        points = coords[0].attrib['points']
        point0 = int(points.split(' ')[0].split(',')[0])
        point1 = int(points.split(' ')[0].split(',')[1])
        point2 = int(points.split(' ')[2].split(',')[0])
        point3 = int(points.split(' ')[2].split(',')[1])


        pdf = PdfReader(pdf_directory+'/' + folder + '.pdf')
        try:
          im = Image.open(directory + folder+'/output_img/'+str(pageNo)+'.png')
        except:
          print(folder+' page missing in output')
        text = getTextFromPdf(TranslateFromMachineCode((point0, point1, point2, point3), pdf, im, pageNo), pdf, pageNo, folder)
        if text==-1:
          continue
        try:
          if table.attrib['lineItems'] == "true":
            if cell.attrib['start-row'] == '0':
              if text == False or text is None:
                text = 'unknown'
              headerStr = "".join((headerStr, ', '+str(text)))
            else:
              if startRow != int(cell.attrib['start-row']):
                row = etree.SubElement(line_items, 'row')
              startRow = int(cell.attrib['start-row'])
              if text == False or text is None:
                keys = etree.SubElement(row, 'key')
                keys.text='unknown'
              else:
                keys = etree.SubElement(row, 'key')
                keys.text=text.strip()
          else:
            if cell.attrib['start-col'] == '0':
              if text == False or text is None:
                key = etree.SubElement(line_items, 'unknown')
              else:
                tmpKey = ''.join(filter(str.isalpha, str(text)))
                if tmpKey == '':
                  key = etree.SubElement(line_items, 'unknown')
                else:
                  key = etree.SubElement(line_items, tmpKey)
            else:
              if text == False or text is None:
                key.text='unknown'
              else:
                key.text=text
        except:
          print(folder+' Post processing error')
          continue
        if text != False:
          foundCount = foundCount+1
          textNode = et.Element("text")
          textNode.text = text
          cell.append(textNode)
        else:
          notFoundCount = notFoundCount+1
      if len(headerStr) > 0:
        headers.text =  headerStr
    pageNo = pageNo +1
  print('mapped='+str(foundCount)+' no matches='+str(notFoundCount))
  myfile = open(directory+folder+'/bank_statement.xml', "w")
  myfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
  myfile.write(etree.tostring(bank_statement, pretty_print=True, encoding="unicode"))
  myfile.close()
  print(etree.tostring(bank_statement, pretty_print=True).decode())


def TranslateFromMachineCode(src, pdf, im, pageNo):
        #print('src--->',src, 'pdf-->',pdf,' and im--->', im,'and pageNo-->', pageNo)
        sx0, sy0, sx1, sy1 = src
        pdfWidth, pdfHeight = (int(float(pdf.pages[pageNo].MediaBox[2])),int(float(pdf.pages[pageNo].MediaBox[3])))
        imageWidth, imageHeight = im.size # size of png image 612x1008.  #1700x2800
        sy01 = imageHeight-sy1
        sy11 = imageHeight-sy0
        #print("pdfWidth, pdfHeight-->",pdfWidth, pdfHeight)
        #print("imageWidth, imageHeight-->",imageWidth, imageHeight)
        x0 = (sx0/imageWidth)*pdfWidth
        x1 = (sx1/imageWidth)*pdfWidth
        y0 = (sy0/imageHeight)*pdfHeight
        y1 = (sy1/imageHeight)*pdfHeight
        #converting pixels to points
        top = (y0 * 144 / 96)
        left = (x0 * 144 / 96)
        bottomX = (x1 * 144 / 96)
        bottomY = (y1 * 144 / 96)
        width =  bottomX - left
        height = bottomY - top
        #print('returned coordinates--x0,y0,x1,y1, bottomX, bottomY===>',x0, y0, x1, y1, bottomX, bottomY)
        #print('for coord=',src,'  matched coordinates are --top, left,width,heigth===>',top, left, width, height)
        return (top, left, width, height)

mapped=120 no matches=8
<bank_statement>
  <line_items>
    <headers>, Tran Date, unknown, Reference Number, Description,  Amount</headers>
    <row>
      <key>Tran Date</key>
      <key>05/15/20</key>
      <key>P928300GS01DXNVTM</key>
      <key>unknown</key>
      <key>Amount</key>
      <key>unknown</key>
    </row>
    <row>
      <key>-$276.59</key>
    </row>
    <row>
      <key>unknown</key>
    </row>
    <row>
      <key>Tran Date</key>
      <key>Posting Date</key>
      <key>Reference Number</key>
      <key>Type</key>
      <key>Description</key>
      <key>Amount</key>
    </row>
    <row>
      <key>Tran Date</key>
      <key>04/30/20</key>
      <key>P928300GAEHM6YX6A</key>
      <key>Standard</key>
      <key>KEEPCALLING</key>
      <key>$7.65</key>
    </row>
    <row>
      <key>04/30/20</key>
      <key>04/30/20</key>
      <key>P928300GAEHM6YX6A</key>
      <key>Standard</key>
      <key>KEEPCALLING</key>
      <key>$7.65</key>
    </row>
    <row>
      <key>04/

In [119]:
import pandas as pd
import xml.etree.ElementTree as ET
metrics = []


labelled_data_dir = '/content/line_item_extraction/labelled_data'

for filename in os.listdir(labelled_data_dir):
  if filename.startswith('.'):
    continue
  result = ET.parse('/content/line_item_extraction/results/'+filename[:-4]+'/bank_statement.xml')
  ground_truth = ET.parse(labelled_data_dir+'/'+filename)

  li1 = result.findall('line_items')
  li2 = ground_truth.findall('line_items')

  cell_match = 0
  cell_mismatch = 0
  missing_tables = 0
  missing_rows = 0
  missing_cells = 0
  extra_cells = 0
  recognized_cell_count = 0

  for i in range(len(li2)):
      rows2 = li2[i].findall('row')
      try:
          rows1 = li1[i].findall('row')
      except:
          missing_tables = missing_tables + 1
          continue
      for idx, row in enumerate(rows2):
          cells1 = list(row.itertext())
          cells1 = list(map(str.strip, cells1))
          cells1 = list(map(str.lstrip, cells1))
          cells1 = [x for x in cells1 if x.strip()]
          try:
              cells2 = list(rows1[idx].itertext())
              cells2 = list(map(str.strip, cells2))
              cells2 = [x for x in cells2 if x.strip()]
          except:
              missing_rows = missing_rows + 1
              continue
          print(cells1, cells2)
          for cell in cells2:
              if cell in cells1 and cell != 'unknown':
                  cell_match = cell_match + 1
              else:
                  cell_mismatch = cell_mismatch + 1
          if len(cells2)-len(cells1) > 0:
              missing_cells = missing_cells + len(cells2)-len(cells1)
          elif len(cells2)-len(cells1) < 0:
              extra_cells = extra_cells + len(cells1) - len(cells2)
          recognized_cell_count = recognized_cell_count + len(cells1)
      metrics.append([filename[:-4],missing_tables, missing_rows, missing_cells, extra_cells, recognized_cell_count, cell_match, cell_mismatch])
df = pd.DataFrame(metrics, columns = ['filename', 'missing_tables', 'missing_rows', 'missing_cells', 'extra_cells', 'recognized_cell_count', 'cell_match', 'cell_mismatch'])
df

['04/01/2020', 'Beginning Balance', '$12,449.70'] ['04/01/2020', 'Beginning Balance', '$12,449.70']
['04/01/2020', 'BARRIER PEST MAN SALE', '-$54.13', '$12,395.57'] ['04/01/2020', 'BARRIER PEST MAN SALE', '-$54.13', '$12,395.57']
['04/01/2020', 'BARRIER PEST MAN SALE', '-$54.13', '$12,341.44'] ['04/01/2020', 'BARRIER PEST MAN SALE', '-$54.13', '$12,341.44']
['04/02/2020', '123 ASD FDSALLCSettlement000008054108954', '-$61.23', '$12,280.21'] ['04/02/2020', '123 ASD FDSALLCSettlement000008054108954', '-$61.23', '$12,280.21']
['04/07/2020', 'Owl House Proper Net Settle 000008038342406', '$3,995.00', '$16,275.21'] ['04/07/2020', 'Owl House Proper Net Settle 000008038342406', '$3,995.00', '$16,275.21']
['04/10/2020', 'MVWA UTILITY PMT DIRECT PAY 1-XX-XXXX6-01', '-$98.62', '$16,176.59'] ['04/10/2020', 'MVWA UTILITY PMT DIRECT PAY 1-XX-XXXX6-01', '-$98.62', '$16,176.59']
['04/14/2020', 'WELLINGTON RISK DEBIT WWS XXXXX1159', '-$146.08', '$16,030.51'] ['04/14/2020', 'WELLINGTON RISK DEBIT WWS XX

Unnamed: 0,filename,missing_tables,missing_rows,missing_cells,extra_cells,recognized_cell_count,cell_match,cell_mismatch
0,Redacted_Bank7,0,0,0,0,42,42,0
1,Redacted_Bank7,0,0,0,1,51,45,5
2,RedactedPayPal_sample3,0,1,10,5,69,53,21
3,Composite_ATB_Financial_Sample2Acct,0,0,0,0,10,10,0
