In [68]:
import re
import os
import numpy as np
import pandas as pd
import pdfplumber
import time
from nltk.tokenize import word_tokenize

from IPython.display import display, HTML
from notebook.services.config import ConfigManager

display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Get all words on page in a list of lists. Each word is represented by:
# [x0, y0, x1, y1, word, bno, lno, wno]
# The first 4 entries are the word's rectangle coordinates, the last 3 are just
# technical info (block number, line number, word number).
# The term 'word' here stands for any string without space.

In [69]:
all_mmv_chassis_sheet = pd.read_csv('../pdf_data_extraction/all_policy/data/ChasisAll.csv')
ALL_MAKES = list( all_mmv_chassis_sheet["make"].unique() )
ALL_MAKE_MODEL = {}

for make in ALL_MAKES:
    ALL_MAKE_MODEL[ make ] = list( all_mmv_chassis_sheet[ all_mmv_chassis_sheet["make"] == "Mahindra" ]["model"].unique() )

In [70]:
def find_insurer_by_site(page):
    INSURERS = {
                "ACKO": "acko.com", "BAJAJ": "bajajallianz.com", "BHARTI": "bharti-axagi.co.in", "CHOLA": "cholainsurance.com",
                "EDELWEISS": "edelweissinsurance.com", "GODIGIT": "godigit.com", "HDFC": "hdfcergo.com", "HDFC": "hdfcgi.com", "ICICI": "icicilombard.com",
                "IFFICO": "iffcotokio.co.in", "KOTAK": "kotakgeneralinsurance.com", "LIBERTY": "libertyinsurance.in",
                "MAGMA": "magmahdi.com" , "NAVI": "navi.com", "NEW INDIA": "newindia.co.in", "NATIONAL": "nationalinsuranceindia.nic.co.in",
                "ORIENTAL": "orientalinsurance.org.in", "RELIANCE": "reliancegeneral.co.in", "ROYAL": "royalsundaram.in", "SBI": "sbigeneral.in",
                "SHRIRAM": "shriramgi.com", "TATA": "tataaig.com", "UNITED": "uiic.co.in"
               }
    
    for insurer in INSURERS:
        page_data = page.search( INSURERS[insurer], regex=True, case=False )
        if len( page_data ) and "insurer" not in data:
            data["insurer"] = insurer

In [71]:
def get_insurer_by_name(page):
    INSURERS = [
        "ACKO", "BAJAJ", "BHARTI", "CHOLA", "EDELWEISS", "GODIGIT", "HDFC", "ICICI", "IFFICO", "KOTAK", "LIBERTY",
        "MAGMA", "NAVI", "NEW INDIA", "NATIONAL", "ORIENTAL", "RELIANCE", "ROYAL", "SBI", "SHRIRAM", "TATA", "UNITED"
    ]
  
    for insurer in INSURERS:
        page_data = page.search( r'for\s[A-Z0-9]*' + insurer, regex=True, case=False )
        if len( page_data ) and "insurer" not in data:
            data["insurer"] = insurer

In [72]:
def find_mobile( page ):
    page_data = page.search( r'\s(\+91\-|0)?[6-9X][0-9X]{9}(,|\s)', regex=True, case=False )
    if len( page_data ) and "mobile" not in data:
        data["mobile"] = page_data[0]["text"].strip()
        data["mobile"] = data["mobile"][0:-1] if data["mobile"][-1] == "," else data["mobile"]

In [73]:
def find_email(page):
    page_data = page.search( r'[a-z0-9\.\-]+@(gmail|yahoo|outlook|hotmail)(\.com|\.co\.in)', regex=True, case=False )
    if len( page_data ) and "email" not in data:
        data["email"] = page_data[0]["text"].strip() if len( page_data ) else None

In [74]:
def find_vehicle_type( page ):
    type_regex = [ r'(two|2)\s?\-?\s?wheeler\s?', r'(four|4)\s?\-?\s?wheeler\s?', r'(MOTORCYCLE|SCOOTER|BIKE)', r'(private)?\s?\-?\s?(car\s|car$)' ]
    for i in range(4):
        if len( page.search( type_regex[i], regex=True, case=False ) ) > 0:
            data["type"] = "4W" if i%2 else "2W" 
            break

In [28]:
def find_policy_type( page ):
    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Policy)?\s?\-?\s?Stand\s?\-?\s?alone\s?\-?\s?O(wn)?\s?D(amage)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Stand Alone Own Damage"
        return

    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?(Package|Comprehensive)', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Package Policy"
        return
    
#     page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?Comprehensive', regex=True, case=False )
#     if len( page_data ) and "insurance_type" not in data:
#         data["type"] = "2W"
#         data["policy_type"] = "Package Policy"
#         return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Liability\s?\-?\s?(Only)?\s?\-?\s?(Policy)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Third Party"
        return
    
    page_data = page.search( r'Liability\s?\-?\s?(Only)?\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?\s?\-?\s?(for)?\s?Private\s?\-?\s?Car', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Third Party"
        return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Package|Comprehensive)', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Package Policy"
        return

#     page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?Comprehensive', regex=True, case=False )
#     if len( page_data ) and "insurance_type" not in data:
#         data["type"] = "4W"
#         data["policy_type"] = "Package Policy"
#         return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Bundled\s?\-?\s?(cover)', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Bundled Policy"
        return

    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Stand\s?\-?\s?alone\s?\-?\s?O(wn)?\s?D(amage)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Stand alone own damage"
        return
 
    page_data = page.search( r'(Package|Comprehensive)\s?\-?\s?(Bike|Car|Two\s?\-?\s?Wheeler)?\s?\-?\s?(Insurance)?\s?\-?\s?Policy', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Package Policy"
        return
    
#     page_data = page.search( r'Package\s?\-?\s?(Bike|Car|Two\s?\-?\s?Wheeler)?\s?\-?\s?(Insurance)?\s?\-?\s?Policy', regex=True, case=False )
#     if len( page_data ) and "insurance_type" not in data:
#         data["policy_type"] = "Package Policy"
#         return

    page_data = page.search( r'Liability\s?\-?\s?(Only)?\s?\-?\s?(Bike|Car|Two\s?\-?\s?Wheeler)?\s?\-?\s?Policy', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Third Party"
        return
    
    page_data = page.search( r'Bundled', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Bundled"
        return
    
    page_data = page.search( r'Act\s?\-?\s?(Only)?\s?\-?\s?(Bike|Car|Two\s?\-?\s?Wheeler)?\s?\-?\s?Policy', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Third Party"
        return


In [29]:
def find_package_v( page ):
    
    found_word = page.search( "Coverage", regex=True, case=False )
    
    if len( found_word ) > 0:
        next_bbox = [ found_word[0]["x0"] , found_word[0]["bottom"] + 1, found_word[0]["x1"], found_word[0]["bottom"] + 2 ]
    
        found_text, next_bbox = find_top_of_first_value_below( page, next_bbox )
    
        start_loc = { "x0": next_bbox[0], "top": next_bbox[1] }

        # find bottom of value 
        # |________________|__> top of value, start_loc_x0, start_loc_top 
        # |_____value______|__> end_loc_x1, end_loc_bottom
        # |----------------|--> bbox of h=2 from start_loc
        # |________________|__> top of first word below

        blank_space, next_bbox = find_bottom_of_value( page, next_bbox )
        end_loc = { "x1": next_bbox[2], "bottom": next_bbox[3] }

        # find left end of world
        next_bbox = [ start_loc["x0"] - 1, start_loc["top"], start_loc["x0"], end_loc["bottom"]  ]
        start_loc["x0"] = find_left_end_of_value( page, next_bbox )
    
        # find right end of world
        next_bbox = [ end_loc["x1"], start_loc["top"], end_loc["x1"] + 1, end_loc["bottom"]  ]
        end_loc["x1"] = find_right_end_of_value( page, next_bbox )
        
        data[ "policy_type" ] = page.crop( [ start_loc["x0"], start_loc["top"], end_loc["x1"], end_loc["bottom"] ] ).extract_text()

In [30]:
def find_reg_no( page ):
    page_data = page.search( r'(Vehicle|Reg(istration)?)(\s?\.?)\s?(No\s?\.?|Number|Mark)\s*:?\s*' + 
                             r'([A-Z]{2}\s?\-?\s?[0-9]{1,2}\s?\-?\s?[A-Z]{1,3}\s?\-?\s?[0-9]{2,5}|new)', 
                             regex=True, case=False
                           )

    if len( page_data ) > 0:
        reg_number = re.split('\s|\:', page_data[0]["text"] )[-1]
        data["reg_no"] = reg_number.replace( '-', "" )
        data["data_loc"] = "h" if page_data[0]['bottom'] - page_data[0]['top'] < 10 else "v"

In [31]:
def find_reg_no_using_bbox( page ):
    
    page_data = page.search( r'Registration\s?(No\s?\.?|Number|Mark)?', regex=True, case=False )
    
    if len( page_data ) > 0:
        for i in range( len( page_data ) ):
            for j in range(10):
                h_offset = ( page_data[i]["x1"] - page_data[i]["x0"] ) / len( page_data[i]["text"] ) * j
                v_offset = ( page_data[i]['bottom'] - page_data[i]['top'] ) * 5
        
                bbox_text = page.crop( [
                     max( page_data[i]["x0"] - h_offset, 0 ),
                     page_data[i]['top'],
                     min( page_data[i]["x1"] + h_offset, page.width ),
                     min( page_data[i]['bottom'] + v_offset, page.height )
                 ], strict=True )
            
                text = bbox_text.search( r"([A-Z]{2}\s?\-?\s?[0-9]{1,2}\s?\-?\s?[A-Z]{1,3}\s?\-?\s?[0-9]{2,5}|new)", regex=True, case=False )

                if len( text ):
                    word = text[0]["text"].replace("\n|\-", "")
                    if word.lower() == "new" or word[-4:].isnumeric():
                        data["reg_no"] = word
                        data["data_loc"] = "v"
                        return

In [32]:
def find_chassis_engine_h( page ):
    word_data = page.search( r'Engine\s(No\s?\.?|Number)?\s*\/\s*Chass?is\s?(No\s?\.?|Number)?\s?:?', regex=True, case=False )
    if len( word_data ) > 0:
        print( "1 ", word_data[0]["text"] )
        found_data = find_chassis_engine_no_h_bbox( page, word_data )
        data["engine_no"], data["chassis_no"] = re.split(r'\/', found_data)

    word_data = page.search( r'Chass?is\s(No\s?\.?|Number)?\s*\/\s*Engine\s?(No\s?\.?|Number)?\s?:?', regex=True, case=False )
    if len( word_data ) > 0:
        print( "2 ", word_data[0]["text"] )
        found_data = find_chassis_engine_no_h_bbox( page, word_data )
        data["chassis_no"], data["engine_no"] = re.split(r'\/', found_data)

    word_data = page.search( r'Engine\s(No\s?\.?|Number)\s*:?', regex=True, case=False )
    if "engine_no" not in data and len( word_data ) > 0:
        print( "3 ", word_data[0]["text"] )
        found_data = find_chassis_engine_no_h_bbox( page, word_data )
        found_data = re.sub( r'\s|\/|\n', "", found_data )
        if not found_data.isalpha():
            data["engine_no"] = found_data
            
    word_data = page.search( r'Chass?is\s(No\s?\.?|Number)\s*:?', regex=True, case=False )
    if "chassis_no" not in data and len( word_data ) > 0:
        print( "4 ", word_data[0]["text"] )
        found_data = find_chassis_engine_no_h_bbox( page, word_data )
        found_data = re.sub( r'\s|\/|\n', "", found_data )
        if not found_data.isalpha():
            data["chassis_no"] = found_data

In [33]:
def find_chassis_engine_no_h_bbox( page, word_data ):
    
    prev_right, prev_bottom = int(word_data[0]['x1']) ,int(word_data[0]['bottom']) 

    start_loc = { "x0": int(word_data[0]['x1']) + 2, "top": int(word_data[0]['top']) }
    end_loc = {}

    next_bbox = [ prev_right + 2, start_loc["top"], prev_right + 4, prev_bottom ]
    
    #find first word after space
    while True:
        found_text = page.crop(next_bbox, strict=True).extract_text()
        if len( found_text ) > 0 or next_bbox[0] == next_bbox[2]:
            break
        next_bbox[0], next_bbox[2] = next_bbox[2], min(next_bbox[2] + 2, page.width)

    #find last word
    consecutive_blank = 0
    while consecutive_blank < 2 and next_bbox[0] != next_bbox[2]:
        found_char = page.crop(next_bbox, strict=True).extract_text()
        if len( found_char ) == 0:
            consecutive_blank += 1
            end_loc = { "x1": min(next_bbox[2] + 2, page.width), "bottom": prev_bottom }
        next_bbox[0], next_bbox[2] = next_bbox[2], min(next_bbox[2] + 2, page.width)
    
    next_char = [ start_loc["x0"], end_loc["bottom"], next_bbox[2], end_loc["bottom"] + 1 ]
    
    #move down
    consecutive_blank = 0
    while consecutive_blank < 2 and next_bbox[1] != next_bbox[3]:
        found_char = page.crop(next_bbox, strict=True).extract_text()
        if len( found_char ) == 0:
            consecutive_blank += 1
        next_bbox[1], next_bbox[3] = next_bbox[3], min(next_bbox[3] + 1, page.height)
        
    end_loc = { "x1": next_bbox[2], "bottom": next_bbox[3] }
        
    bbox_text = page.crop([start_loc["x0"], start_loc["top"], end_loc["x1"], end_loc["bottom"]]).extract_text()
    return re.sub( r'\.|\s|\xa0|\n', "", bbox_text )

In [34]:
def find_chassis_engine_v( page ):
    
    if "engine_no" not in data:
        word_data = page.search( r'Engine\s*(No\s?\.?|Number)\s*:?', regex=True, case=False )
        if len( word_data ) > 0:
            found_data = find_chassis_engine_no_v_bbox( page, word_data )
            found_data = (re.sub( r'[^A-Za-z0-9]+', "", found_data )).strip()
            if found_data != "wrong_data":
                data["engine_no"] = found_data

    if "chassis_no" not in data:
        word_data = page.search( r'Chass?is\s*(No\s?\.?|Number)\s*:?', regex=True, case=False )
        if len( word_data ) > 0:
            found_data = find_chassis_engine_no_v_bbox( page, word_data )
            found_data = (re.sub( r'[^A-Za-z0-9]+', "", found_data )).strip()
            if found_data != "wrong_data":
                data["chassis_no"] = found_data
                
    if "engine_no" not in data:
        word_data = page.search( r'Engine', regex=True, case=False )
        if len( word_data ) > 0:
            found_data = find_chassis_engine_no_v_bbox( page, word_data )
            found_data = (re.sub( r'[^A-Za-z0-9]+', "", found_data )).strip()
            if found_data != "wrong_data":
                data["engine_no"] = found_data

    if "chassis_no" not in data:
        word_data = page.search( r'Chass?is', regex=True, case=False )
        if len( word_data ) > 0:
            found_data = find_chassis_engine_no_v_bbox( page, word_data )
            found_data = (re.sub( r'[^A-Za-z0-9]+', "", found_data )).strip()
            if found_data != "wrong_data":
                data["chassis_no"] = found_data

In [35]:
def find_top_of_first_value_below( page, next_bbox ):
    while next_bbox[1] != next_bbox[3]:
        found_word = page.crop( next_bbox, strict=True ).extract_text()
        if len( found_word ) > 0 or next_bbox[0] == next_bbox[2]:
            found_word = re.sub( r'[^A-Za-z0-9]+', "", found_word )
            return found_word, next_bbox
        next_bbox[1], next_bbox[3] = next_bbox[3], min(next_bbox[3] + 1, page.height)
    return "", next_bbox

def find_bottom_of_value( page, next_bbox ):
    while next_bbox[1] != next_bbox[3]:
        found_word = page.crop( next_bbox, strict=True ).extract_text()
        if len( found_word ) == 0:
            found_word = re.sub( r'[^A-Za-z0-9]+', "", found_word )
            return found_word, next_bbox
        next_bbox[1], next_bbox[3] = next_bbox[3], min(next_bbox[3] + 1, page.height)
    return "", next_bbox
    
def find_left_end_of_value( papge, next_bbox ):
    while next_bbox[0] != next_bbox[2]:
        found_word = page.crop( next_bbox, strict=True ).extract_text()
        if len( found_word ) == 0:
            return next_bbox[2]
        next_bbox[0], next_bbox[2] = max(next_bbox[0] - 1, 0), next_bbox[0]
    return next_bbox[0]
            
def find_right_end_of_value( papge, next_bbox ):
    while next_bbox[0] != next_bbox[2]:
        found_word = page.crop( next_bbox, strict=True ).extract_text()
        if len( found_word ) == 0:
            return next_bbox[2]
        next_bbox[0], next_bbox[2] = next_bbox[2], min(next_bbox[2] + 1, page.width)
    return next_bbox[2]

In [36]:
def find_chassis_engine_no_v_bbox( page, word_data ):
    
    next_bbox = [ word_data[0]['x0'], word_data[0]['bottom']+1, word_data[0]["x1"], word_data[0]["bottom"] + 2 ]
    # step 1: finding top of first key below
    # bbox of h=2 from bottom of key 
    #  ________________
    # |_______key______|__> start_loc_x0, start_loc_bottom
    # |----------------|--> bbox of h=2 from start_loc
    # |________________|__> top of first word below
        
    found_text, next_bbox = find_top_of_first_value_below( page, next_bbox )
    
    # step 2 checking if first below is wraped_key, if wrapped_key find next word below and mark as top for value
    # |_______key______|
    # |___wrapped_key__|__> bottom of wrapped_key (Number or no).
    # |----------------|--> bbox of h=2 from bottom of wrapped_key
    # |________________|__> top of first word below
    
    if found_text.isalpha():
        blank_, next_bbox = find_bottom_of_value( page, next_bbox )
        found_text, next_bbox = find_top_of_first_value_below( page, next_bbox )
    
    start_loc = { "x0": next_bbox[0], "top": next_bbox[1] }
    
    # step 3 find bottom of value 
    # |________________|__> top of value, start_loc_x0, start_loc_top 
    # |_____value______|__> end_loc_x1, end_loc_bottom
    # |----------------|--> bbox of h=2 from start_loc
    # |________________|__> top of first word below
    
    blank_space, next_bbox = find_bottom_of_value( page, next_bbox )
    end_loc = { "x1": next_bbox[2], "bottom": next_bbox[3] }

    # step 4 checking if value is wrapped, if wrapped_value find_bottom of text and update end_loc
    # |______value______|
    # |__wrapped_value__|__> bottom of wrapped_value, end_loc_x1, end_loc_bottom
    
    found_text, next_bbox = find_top_of_first_value_below( page, next_bbox )
    
    if not found_text.isalpha():
        found_text, next_bbox = find_bottom_of_value( page, next_bbox )
        end_loc = { "x1": next_bbox[2], "bottom": next_bbox[3] }
        
# step 5 find left end of world
    next_bbox = [ start_loc["x0"] - 1, start_loc["top"], start_loc["x0"], end_loc["bottom"]  ]
    start_loc["x0"] = find_left_end_of_value( page, next_bbox )
    
# step 6 find right end of world
    next_bbox = [ end_loc["x1"], start_loc["top"], end_loc["x1"] + 1, end_loc["bottom"]  ]
    end_loc["x1"] = find_right_end_of_value( page, next_bbox )
    
    return page.crop( [ start_loc["x0"], start_loc["top"], end_loc["x1"], end_loc["bottom"] ], strict=True ).extract_text()

In [17]:
def find_insured_name( page ):
    page_data = page.search( r'(Mr|Mrs?|M\/s|Ms\.)(\.\s?|\s)[A-Z]{1,20}\s?[A-Z]{0,20}\s?[A-Z]{0,20}', regex=True, case=False )
    if len( page_data ) and "insurers_name" not in data:
        data["insurers_name"] = page_data[0]["text"].replace( "\n", "" )
        return
    
    page_data = page.search( r'(insured|Proposer)(\'s)?\s*name(\s|\s?:\s?)(Mr|Mrs?|M\/s|Ms\.)?(\.\s?|\s)?\s?[A-Z]{1,40}\s?[A-Z]{0,20}\s?[A-Z]{0,20}', regex=True, case=False )
    if len( page_data ) and "insurers_name" not in data:
        insurer_name = re.sub( "Insured(\'s)?\s*Name\s*:?\s*", "", page_data[0]["text"] )
        data["insurers_name"] = insurer_name.replace( "\n", "" )

In [18]:
def find_policy_duration( page ):
    #date_time dd/mm/yyyy hh:mm:ss
    start_date = page.search( r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?([0-9]{4})(\s*|T)(\[|\()?\s*' + 
                              r'(00|12):(00|01)(\]|\))?\s?(:00|:01)?(\]|\))?\s?(hrs|hours|AM)?',
                             regex=True, case=False 
                            )    
    end_date  =  page.search( r'[0-9]{1,2}[\-\/\]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?([0-9]{4})(\s*|T)(\[|\()?s?' +
                              r'(\(?Mid\s?night\)?|11:59|11:59:59|23:59:59)\s?(\]|\))?\s?(hrs|hours|PM)?(\(?Mid\s?night\)?)?',
                             regex=True, case=False
                            )
    if len( start_date ) > 0 and "start" not in data :
        data["start"] = start_date[0]['text']
    if len( end_date ) > 0 and "end" not in data :
        data["end"] = end_date[0]['text']
        
    
    #time_data  hh:mm:ss dd/mm/yyyy
    start_date = page.search( r'(\[|\()?\s?(00|12):(00|01)(\]|\))?\s?(:00|:01)?\s?(hrs|hours|AM)?\s?(\]|\))?\s*(T|on|of)\s*' +
                              r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?[0-9]{4}',
                             regex=True, case=False 
                            )
    end_date = page.search( r'(\[|\()?s?(\(?Mid\s?night\)?|11:59|11:59:59|23:59:59)s?(\]|\))?\s?(hrs|hours|PM)?\s*(T|on|of)\s*' +
                            r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?[0-9]{4}(\(?Mid\s?night\)?)?',
                           regex=True, case=False
                          )
    if len( start_date ) > 0 and "start" not in data :
        data["start"] = start_date[0]['text']
    if len( end_date ) > 0 and "end" not in data :
        data["end"] = end_date[0]['text']
    

In [19]:
# def find_mmv_using_mmv_list( page ):
        
#     for make in MAKES:
#         found_data = page.search( make, regex=True, case=False )
#         if len( found_data ) > 0:
#             for model in 
            
    

In [20]:
def find_mmv_using_chassis( page, chassis_no ):
    found_mmv = all_mmv_chassis_sheet[ all_mmv_chassis_sheet["chasis number"] == chassis_no ]
    for index, row in found_mmv.iterrows():
#         print( "mmv ", row["make"], row["model"], row["variant"], row["fuel type"] )
        data["mmv"] = f'{row["make"]} {row["model"]} {row["variant"]} {row["fuel type"]}'
    

In [None]:
def find_idv_h( page ):
    found_word = page.search( r'Total\s?(Value|IDV)' ?)
    

In [66]:
def find_premium_value( page ):
    found_word = page.search( r'(TOTAL\sPREMIUM\sPayable|FINAL\sAMOUNT)', case=False, regex=True )
    if len( found_word ) > 0:
        for i in range( len( found_word ) ):
            print( page,  found_word[i]["text"], re.sub( r'[^A-Za-z0-9\.]+', "", page.crop( [found_word[i]['x0'], found_word[i]['top'], page.width, found_word[i]['bottom'] ] ) ) )
            return
                
    found_word = page.search( r'((Net|Total)\s?Premium)', case=False, regex=True )
    if len( found_word ) > 0:
        for i in range( len( found_word ) ):
            print( page,  
                  found_word[i]["text"], 
                  re.sub( r'[^A-Za-z0-9\.]+', "", page.crop( [found_word[i]['x0'], found_word[i]['top'], page.width, found_word[i]['bottom'] ] ).extract_text() ) 
                 )
            return

In [67]:
df = pd.DataFrame( columns=[ "file", "pages", "data_loc", "policy_no", "insurers_name", "type", "reg_no", "insurer", "phone", "email",
                             "policy_type", "chassis_no", "engine_no", "mmv", "start", "end", "time" 
                           ] )
files = os.listdir("./all_policy/Motor/all_policy")

count = 0

for file in sorted(files):
# for file in [ "bajaj_3p_4w.pdf", "bajaj_4w_od.pdf", "bajaj_4w_pack.pdf", "bharti_hybrid_4w.pdf", "magma-4w-comp.pdf", "united-2w-comp.pdf", "united-4w-comp.pdf", "united-4w-tp.pdf" ]:
# for file in [ "acko-4w-comp.pdf" ]:
    count += 1
    print( count, file )

    start_time = time.time()
#     if count == 9:
#         break
    
    policy = pdfplumber.open( f'all_policy/Motor/all_policy/{file}' )
    
    insurer_count = {}
    data = {}

    for page in policy.pages:

#         find_insured_name( page ) if "insurers_name" not in data else None
#         find_email( page ) if "email" not in data else None
#         find_mobile( page ) if "mobile" not in data else None
#         find_reg_no( page ) if "reg_no" not in data else None
#         find_vehicle_type( page ) if "type" not in data else None
#         find_insurer_by_site( page ) if "insurer" not in data else None
#         find_policy_type( page ) if "policy_type" not in data else None
#         find_make( page ) if "make" not in data else None
        find_premium_value( page ) if "premium_value" not in data else None

        
#     if "reg_no" not in data:
#         for page in policy.pages:
#             find_reg_no_using_bbox( page )
        
#     for page in policy.pages:
#         if "insurer" not in data:
#             get_insurer_by_name( page )
        
#         if "policy_type" not in data:
#             find_package_v( page )
            
#         if "data_loc" not in data:
#             continue
            
#         if "chassis_no" not in data and data["data_loc"] == "h":
#             find_chassis_engine_h( page ) if "chassis_no" not in data else None
#             find_mmv_using_chassis( page, data["chassis_no"] ) if "chassis_no" in data else None
            
#         if "chassis_no" not in data and data["data_loc"] == "v":
#             find_chassis_engine_v( page ) if "chassis_no" not in data else None
#             find_mmv_using_chassis( page, data["chassis_no"] ) if "chassis_no" in data else None
        

#     for page in policy.pages:
#         if data["policy"] != "Third Party" or data["policy"] != "Liability Only":
#             find_idv_h( page ) if "idv" in data else None

            
#         find_policy_duration( page ) if "duration" not in data else None
            
        
    print( "data", data, "\n\n" )
    end_time = time.time()

    df.loc[ len( df.index ) ] = [ 
        file,
        len( policy.pages ),
        data["data_loc"] if "data_loc" in data else None,
        data["policy_no"] if "policy_no" in data else None,
        data["insurers_name"] if "insurers_name" in data else None,
        data["type"] if "type" in data else None,
        data["reg_no"] if "reg_no" in data else None,
        data["insurer"] if "insurer" in data else None,
        data["mobile"] if "mobile" in data else None,        
        data["email"] if "email" in data else None,
        data["policy_type"] if "policy_type" in data else None,
        data["chassis_no"] if "chassis_no" in data else None,
        data["engine_no"] if "engine_no" in data else None,
        data["mmv"] if "mmv" in data else None,
        data["start"] if "start" in data else None,
        data["end"] if "end" in data else None,
        ( end_time - start_time )
    ]
    
#     https://github.com/jsvine/pdfplumber

1 acko-2w-comp.pdf
<Page:1> Total Premium TotalPremium1538.00
data {} 


2 acko-4w-comp.pdf
<Page:3> Total Premium TotalPremium3611.00
<Page:4> Total Premium TotalPremium3611.00
data {} 


3 acko-4w-tp.pdf
<Page:1> Total Premium TotalPremium2445.00
data {} 


4 bajaj_3p_4w.pdf
<Page:3> Totalpremium Totalpremium3221.00
data {} 


5 bajaj_4w_hybrd.pdf
<Page:3> NetPremium NetPremiumAB68330.00
data {} 


6 bajaj_4w_od.pdf
<Page:3> TotalPremium TotalPremiumexcludingGSTforODcoveragesquotedandagreeduponis
data {} 


7 bajaj_4w_pack.pdf
<Page:2> TotalPremium TotalPremiumexcludingGoodsandServiceTaxGSTforLiabilityandODcoveragesquotedandagreeduponis
<Page:6> TotalPremium TotalPremiumNetPremiumAB5611.00LLtopersonforPaiddriverOpera50.00
data {} 


8 bharti_3p_4w.pdf
<Page:1> Total Premium TotalPremium3800.78
data {} 


9 bharti_4w_comp.pdf
<Page:1> Total Premium TotalPremium7033.68
data {} 


10 bharti_hybrid_4w.pdf


TypeError: expected string or bytes-like object

In [380]:
# df.filter( ["file", "mmv", "pages", "data_loc", "insurers name", "type", "phone", "email", "reg_no", "chassis_no", "engine_no", "policy_type", "insurer" ] ).sort_values( "make" )
# df.filter( ["file", "pages", "reg_no", "mmv", "chassis_no", "engine_no" ] ).sort_values( "chassis_no" )
df.sort_values("policy_type")

Unnamed: 0,file,pages,data_loc,policy_no,insurers_name,type,reg_no,insurer,phone,email,policy_type,chassis_no,engine_no,mmv,start,end,time
56,kotak_hybrid_4w.pdf,5,v,,Mrs. Suniti Tyagi,4W,NEW,KOTAK,7409548465,ayushtyagsa@gmail.com,Bundled,MA3RFL41SLJ193564,K10BN2336503,Maruti S Presso VXI AMT PETROL,,,1.827969
4,bajaj_4w_hybrd.pdf,7,,,GIRISHKUMARTHANKAPPANNAIR Zone,,,BAJAJ,,,Bundled,,,,,,1.3191
83,reliance_hybrid_4w.pdf,9,h,,Mr. SHIVA KUMARA S,4W,NEW,RELIANCE,9426110267,shivakumar267@gmail.com,Bundled,REVTRN11FYXK52821,MAT627223MLF40268,,,,1.385721
9,bharti_hybrid_4w.pdf,2,v,,Mr. Subasish Mohapatra,4W,NEW,BHARTI,9008811448,mohapatrasubasish@yahoo.co.in,Bundled Policy,MEERBC006M7103584,B4DA417E085205,Renault Triber RXZ MT PETROL,,,2.252494
41,iffico_3p_4w.pdf,3,v,,,4W,MH27AC9244,IFFICO,XXXXXXX569,,Liability Only,MA3EAA61S01885909,F8DN4656405,Maruti Alto LXI PETROL,,,2.006417
32,iffco1_4w.pdf,3,v,,,4W,UP93U5817,IFFICO,XXXXXXX741,,Liability Only,1035744,3972486,,,,1.69925
30,iffco10_2w.pdf,3,v,,RAJASEKHAR CH,2W,AP05BK0106,IFFICO,XXXXXXX689,,Liability Only,MBLHA10EG8GJ08102,HA10EB8GJ49879,Hero Honda Passion Plus STD PETROL,,,2.290863
31,iffco11_2w.pdf,4,v,,SUDHIR JAIN,2W,DL12SJ2108,IFFICO,XXXXXXX830,,Package,ME3U3S5C1GL015515,U3S5C1GL343915,Royal Enfield Classic 350 350 PETROL,,,3.694563
33,iffco2_4w.pdf,4,v,,,4W,KL27A1810,IFFICO,XXXXXXX114,,Package,MA6TF694M9HB19867,F14D37255301,Chevrolet Aveo 1.4 PETROL,,,2.327534
34,iffco3_2w.pdf,4,v,,N VARADARAJAN,2W,TN09BV8596,IFFICO,XXXXXXX101,,Package,ME4KC09CBE8664023,KC09E86674245,Honda Cb Unicorn DAZZLER BASIC PETROL,,,2.729211


In [315]:
# x = df[ df["data_loc"] == 'h' ].filter( ["file"] )
# x = list( x["file"].unique() )
# print( x )

In [72]:
def test( page ):
    page_data = page.search( r'Coverage', regex=True, case=False )
    
    if len( page_data ) > 0:
        for i in range( len( page_data ) ):
            for j in range(14):
#                 print( page, page_data[i]['text'] )
                h_offset = ( page_data[i]["x1"] - page_data[i]["x0"] ) / len( page_data[i]["text"] ) * j
                v_offset = ( page_data[i]['bottom'] - page_data[i]['top'] ) * 5
        
                bbox_text = page.crop( [
                     max( page_data[i]["x0"] - h_offset, 0 ),
                     page_data[i]['top'],
                     min( page_data[i]["x1"] + h_offset, page.width ),
                     min( page_data[i]['bottom'] + v_offset, page.height )
                 ], strict=True ).extract_text()
            
                print( repr(bbox_text) )
#                 text = re.findall( r"\n?M[A-EZ][A-HJ-NPR-Z0-9]{14,19}\n?", bbox_text, re.IGNORECASE )

#                 if len( text ):
#                     word = text[0].replace("\n\-", "")
#                     if word.lower() == "new" or word[-4:].isnumeric():
#                         data["chassis_no"] = word
# #                         print( "\n", page, "\t\t\t\t\t\tfound chassis_no ", repr(word), "\n" )
#                         return

In [91]:
# files = list( df[ df["policy_type"].isnull() == True ]["file"] )
# i = 0
# for file in files:
# # for file in ["national-4w-comp.pdf"]:
#     i = i + 1
#     print( i, file )
#     policy = pdfplumber.open( f'all_policy/Motor/all_policy/{file}' )

# # repr(policy.pages[0].extract_text() )
# # policy.pages[0].search( r'chassis no.\sM[A-EZ][A-HJ-NPR-Z0-9]{9}', regex=True, case=False )[0]['text']
#     for page in policy.pages:
#         print(page)
#         test( page )


In [None]:
!git add PDF_Reading_Sheet.ipynb
!git commit -m "adding "
!git push