In [1]:
import re
import os
import numpy as np
import pandas as pd
import pdfplumber
import time
from nltk.tokenize import word_tokenize

from IPython.display import display, HTML
from notebook.services.config import ConfigManager

display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Get all words on page in a list of lists. Each word is represented by:
# [x0, y0, x1, y1, word, bno, lno, wno]
# The first 4 entries are the word's rectangle coordinates, the last 3 are just
# technical info (block number, line number, word number).
# The term 'word' here stands for any string without space.

In [2]:
def find_insurer_by_site(page):
    INSURERS = {
                "ACKO": "acko.com", "BAJAJ": "bajajallianz.com", "BHARTI": "bhartiaxa.com", "CHOLA": "cholainsurance.com",
                "EDELWEISS": "edelweissinsurance.com", "GODIGIT": "godigit.com", "HDFC": "hdfcergo.com", "ICICI": "icicilombard.com",
                "IFFICO-TOKIO": "iffcotokio.co.in", "KOTAK": "kotakgeneralinsurance.com", "LIBERTY": "libertyinsurance.in",
                "MAGMA": "magmahdi.com" , "NAVI": "navi.com", "NEW INDIA": "newindia.co.in", "NATIONAL": "nationalinsuranceindia.nic.co.in",
                "ORIENTAL": "orientalinsurance.org.in", "RELIANCE": "reliancegeneral.co.in", "ROYAL": "royalsundaram.in", "SBI": "sbigeneral.in",
                "SHRIRAM": "shriramgi.com", "TATA": "tataaig.com", "UNITED": "uiic.co.in"
               }
    
    for insurer in INSURERS:
        page_data = page.search( INSURERS[insurer], regex=True, case=False )
        if len( page_data ) and "insurer" not in data:
            data["insurer"] = insurer

In [3]:
def get_insurer_by_name(page):
    INSURERS = [
        "ACKO", "BAJAJ", "BHARTI", "CHOLA", "EDELWEISS", "GODIGIT", "HDFC", "ICICI", "IFFICO-TOKIO", "KOTAK", "LIBERTY",
        "MAGMA", "NAVI", "NEW INDIA", "NATIONAL", "ORIENTAL", "RELIANCE", "ROYAL", "SBI", "SHRIRAM", "TATA", "UNITED"
    ]
  
    for insurer in INSURERS:
        page_data = page.search( insurer, regex=True, case=False )
        if len( page_data ) and "insurer" not in data:
            data["insurer"] = insurer
            

In [4]:
def find_mobile( page ):
    page_data = page.search( r'\s(\+91\-|0)?[6-9X][0-9X]{9}(,|\s)', regex=True, case=False )z
    if len( page_data ) and "mobile" not in data:
        data["mobile"] = page_data[0]["text"].strip()
        data["mobile"] = data["mobile"][0:-1] if data["mobile"][-1] == "," else data["mobile"]

In [5]:
def find_email(page):
    page_data = page.search( r'[a-z0-9\.\-]+@+(gmail|yahoo|outlook|hotmail)+(\.com|\.co\.in)+', regex=True, case=False )
    if len( page_data ) and "email" not in data:
        data["email"] = page_data[0]["text"].strip() if len( data ) else None

In [6]:
def find_vehicle_type( page ):
    type_regex = [ r'(two|2)\s?\-?\s?wheeler\s?', r'(four|4)\s?\-?\s?wheeler\s?', r'(MOTORCYCLE|SCOOTER|BIKE)', r'(private)?\s?\-?\s?(car\s|car$)' ]
    for i in range(4):
        if len( page.search( type_regex[i], regex=True, case=False ) ) > 0:
            data["type"] = "4W" if i%2 else "2W" 
            break
    
#     (policy|package|hybrid|comprehensive|certificate|secure)

In [7]:
def find_policy_type( page ):
    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Policy)?\s?\-?\s?Stand\s?\-?\s?alone\s?\-?\s?O(wn)?\s?D(amage)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Stand Alone Own Damage"
        return

    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?Package\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Package Policy"
        return
    
    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?Comprehensive\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Comprehensive Policy"
        return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Liability\s?\-?\s?(Only)?\s?\-?\s?(Policy)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Liability Policy"
        return
    
    page_data = page.search( r'Liability\s?\-?\s?(Only)?\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?\s?\-?\s?Private\s?\-?\s?Car', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Liability Policy"
        return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Package\s?\-?\s?(Policy)?\s?\-?\s?(Insurance)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Package Policy"
        return

    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Comprehensive\s?\-?\s?(Policy)?\s?\-?\s?(Insurance)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Comprehensive Policy"
        return
 
    page_data = page.search( r'Comprehensive\s?\-?\s?(Bike|Car)?\s?\-?\s?(Insurance)?\s?\-?\s?Policy', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Comprehensive Policy"
        return
    
    page_data = page.search( r'Package\s?\-?\s?(Bike|Car)?\s?\-?\s?(Insurance)?\s?\-?\s?Policy', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Package Policy"
        return

In [8]:
def find_reg_no( page ):
    page_data = page.search( r'(Vehicle|Reg(istration)?|\s?\.?)\s?(No\s?\.?|Number|Mark)\s*:?\s*' + 
                             r'([A-Z]{2}\s?\-?\s?[0-9]{1,2}\s?\-?\s?[A-Z]{1,3}\s?\-?\s?[0-9]{2,5}|new)', 
                             regex=True, case=False
                           )
    if len( page_data ) > 0:
        reg_number = re.split('\s|\:', page_data[0]["text"] )[-1]
        data["reg_no"] = reg_number.replace( '-', "" )

In [9]:
def find_chassis_no( page ):
    page_data = page.search( r'Chass?is\s(No\s?\.?|Number)\s*:?\s*M[A-EZ][A-HJ-NPR-Z0-9]{14,19}', regex=True, case=False )
    if len( page_data ) > 0:
        chassis_number = re.split('\s|:', page_data[0]["text"] )[-1]
        data["chassis_no"] = chassis_number.replace( '-', "" )

In [10]:
def find_chassis_no_with_slash( page ):
    key = page.search( r'Chass?is\s(No\s?\.?|Number)\s?\/', regex=True, case=False )
    if len( key ):
        chassis_number = page.search( r'M[A-EZ][A-HJ-NPR-Z0-9]{15,19}\s?\/', regex=True, case=False )
        if len( chassis_number ) > 0:
            data["chassis_no"] = chassis_number[0]["text"].replace( r'\/', "" )
            return
    
    key = page.search( r'/\s?Chass?is\s(No\s?\.?|Number)', regex=True, case=False )
    if len( key ):
        chassis_number = page.search( r'/\s?M[A-EZ][A-HJ-NPR-Z0-9]{15,19}', regex=True, case=False )
        if len( chassis_number ) > 0:
            data["chassis_no"] = chassis_number[0]["text"].replace( r'\/', "" )
            return

In [11]:
def find_insured_name( page ):
    page_data = page.search( r'(Mr|Mrs?|M\/s|Ms\.)(\.\s?|\s)[A-Z]{1,20}\s?[A-Z]{0,20}\s?[A-Z]{0,20}', regex=True, case=False )
    if len( page_data ) and "insurers_name" not in data:
        data["insurers_name"] = page_data[0]["text"].replace( "\n", "" )
        return
    
    page_data = page.search( r'insured(\'s)?\s*name(\s|\s?:\s?)(Mr|Mrs?|M\/s|Ms\.)?(\.\s?|\s)?\s?[A-Z]{1,40}\s?[A-Z]{0,20}\s?[A-Z]{0,20}', regex=True, case=False )
    if len( page_data ) and "insurers_name" not in data:
        insurer_name = re.sub( "Insured(\'s)?\s*Name\s*:?\s*", "", page_data[0]["text"] )
        data["insurers_name"] = insurer_name.replace( "\n", "" )

In [12]:
def find_policy_duration( page ):
    #date_time dd/mm/yyyy hh:mm:ss
    start_date = page.search( r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?([0-9]{4})(\s*|T)(\[|\()?\s*' + 
                              r'(00|12):(00|01)(\]|\))?\s?(:00|:01)?(\]|\))?\s?(hrs|hours|AM)?',
                             regex=True, case=False 
                            )    
    end_date  =  page.search( r'[0-9]{1,2}[\-\/\]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?([0-9]{4})(\s*|T)(\[|\()?s?' +
                              r'(\(?Mid\s?night\)?|11:59|11:59:59|23:59:59)\s?(\]|\))?\s?(hrs|hours|PM)?(\(?Mid\s?night\))?',
                             regex=True, case=False
                            )
    if len( start_date ) > 0 and "start" not in data :
        data["start"] = start_date[0]['text']
    if len( end_date ) > 0 and "end" not in data :
        data["end"] = end_date[0]['text']
        
    
    #time_data  hh:mm:ss dd/mm/yyyy
    start_date = page.search( r'(\[|\()?\s?(00|12):(00|01)(\]|\))?\s?(:00|:01)?\s?(hrs|hours|AM)?\s?(\]|\))?\s*(T|on|of)\s*' +
                              r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?[0-9]{4}',
                             regex=True, case=False 
                            )
    end_date = page.search( r'(\[|\()?s?(\(?Mid\s?night\)?|11:59|11:59:59|23:59:59)s?(\]|\))?\s?(hrs|hours|PM)?\s*(T|on|of)\s*' +
                            r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?[0-9]{4}(\(?Mid\s?night\))?',
                           regex=True, case=False
                          )
    if len( start_date ) > 0 and "start" not in data :
        data["start"] = start_date[0]['text']
    if len( end_date ) > 0 and "end" not in data :
        data["end"] = end_date[0]['text']
    

In [13]:
def find_reg_no_using_bbox( page ):
    
    page_data = page.search( r'Registration\s?(No\s?\.?|Number|Mark)?', regex=True, case=False )
    
    if len( page_data ) > 0:
        for i in range( len( page_data ) ):
            for j in range(10):
#                 print( page, page_data[i]['text'] )
                h_offset = ( page_data[i]["x1"] - page_data[i]["x0"] ) / len( page_data[i]["text"] ) * j
                v_offset = ( page_data[i]['bottom'] - page_data[i]['top'] ) * 5
        
                bbox_text = page.crop( [
                     max( page_data[i]["x0"] - h_offset, 0 ),
                     page_data[i]['top'],
                     min( page_data[i]["x1"] + h_offset, page.width ),
                     min( page_data[i]['bottom'] + v_offset, page.height )
                 ], strict=True ).extract_text()
            
#                 print( repr(bbox_text) )
                text = re.findall( r"\n([A-Z]{2}\s?\-?\s?[0-9]{1,2}\s?\-?\s?[A-Z]{1,3}\s?\-?\s?[0-9]{0,5}\n?[0-9]{0,5}|new)", bbox_text, re.IGNORECASE )

                if len( text ):
                    word = text[0].replace("\n\-", "")
                    if word.lower() == "new" or word[-4:].isnumeric():
                        data["reg_no"] = word
#                         print( "\n", page, "\t\t\t\t\t\tfound reg_no ", repr(word), "\n" )
                        return

In [14]:
def find_chassis_no_bbox( page ):
    
    page_data = page.search( r'Chass?is\s?(No\s?\.?|Number)\s?:?', regex=True, case=False )
    
    if len( page_data ) > 0:
        for i in range( len( page_data ) ):
            for j in range(14):
#                 print( page, page_data[i]['text'] )
                h_offset = ( page_data[i]["x1"] - page_data[i]["x0"] ) / len( page_data[i]["text"] ) * j
                v_offset = ( page_data[i]['bottom'] - page_data[i]['top'] ) * 5
        
                bbox_text = page.crop( [
                     max( page_data[i]["x0"] - h_offset, 0 ),
                     page_data[i]['top'],
                     min( page_data[i]["x1"] + h_offset, page.width ),
                     min( page_data[i]['bottom'] + v_offset, page.height )
                 ], strict=True ).extract_text()
            
#                 print( repr(bbox_text) )
                text = re.findall( r"\n?M[A-EZ][A-HJ-NPR-Z0-9]{14,19}\n?", bbox_text, re.IGNORECASE )

                if len( text ):
                    word = text[0].replace("\n\-", "")
                    if word.lower() == "new" or word[-4:].isnumeric():
                        data["chassis_no"] = word
#                         print( "\n", page, "\t\t\t\t\t\tfound chassis_no ", repr(word), "\n" )
                        return

In [16]:
df = pd.DataFrame( columns=[ "file", "pages", "insurers name", "type", "reg_no", "insurer", "phone", "email", "policy_type", "chassis_no", "start", "end", "time" ] )
files = os.listdir("./all_policy/Motor/all_policy")

count = 0

for file in files:
# for file in [ "hdfc10.pdf" ]:
    count += 1
    print( count, file )
    
    start_time = time.time()
    
    policy = pdfplumber.open( f'all_policy/Motor/all_policy/{file}' )
    
    insurer_count = {}
    data = {}

    for page in policy.pages:
        
        page_text = page.extract_text()
        
        find_insured_name( page ) if "insurers_name" not in data else None
        find_insurer_by_site( page ) if "insurer" not in data else None
        find_mobile( page ) if "mobile" not in data else None
        find_email( page ) if "email" not in data else None
        find_policy_type( page ) if "policy_type" not in data else None
        find_reg_no( page ) if "reg_no" not in data else None
        find_chassis_no( page ) if "chassis_no" not in data else None
        find_vehicle_type( page ) if "type" not in data else None
        find_policy_duration( page ) if "duration" not in data else None
        
    for page in policy.pages:
        if "insurer" not in data:
            get_insurer_by_name( page )
            
        if "chassis_no" not in data:
            find_chassis_no_with_slash( page )
        
        if "chassis_no" not in data:
            find_chassis_no_bbox( page )
            
        if "reg_no" not in data:
            find_reg_no_using_bbox( page )
            
            
    print( "data", data, "\n" )
    end_time = time.time()

    df.loc[ len( df.index ) ] = [ 
        file,
        len( policy.pages ),
        data["insurers_name"] if "insurers_name" in data else None,
        data["type"] if "type" in data else None,
        data["reg_no"] if "reg_no" in data else None,
        data["insurer"] if "insurer" in data else None,
        data["mobile"] if "mobile" in data else None,        
        data["email"] if "email" in data else None,
        data["policy_type"] if "policy_type" in data else None,
        data["chassis_no"] if "chassis_no" in data else None,
        data["start"] if "start" in data else None,
        data["end"] if "end" in data else None,
        ( end_time - start_time )
    ]
#     break
    
#     https://github.com/jsvine/pdfplumber

1 hdfc13_4w.pdf
data {'insurers_name': 'MR MAYUR MADHUKAR PATIL', 'mobile': '9096749797', 'email': 'mayur.m.patil9@gmail.com', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'MH11AW9594', 'chassis_no': 'MCA11835E07029773EMZ', 'start': '03 Nov, 2021 00:01 hrs', 'insurer': 'HDFC'} 

2 reliance11_4w.pdf
data {'insurers_name': 'MS.PRIYA AWASTHI ', 'mobile': '8433707105', 'email': 'dhirajawasthi@yahoo.com', 'reg_no': 'GJ15CK8239', 'type': '4W', 'start': ' 00:00 Hrs on 21-Jan-2022', 'insurer': 'RELIANCE', 'policy_type': 'Package Policy', 'end': 'Mid Night of 20/01/2023'} 

3 hdf_4w_comp.pdf
data {'mobile': '8861598378', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'KA04MS3235', 'start': '13 Oct, 2021 00:01 hrs', 'insurer': 'HDFC'} 

4 iffco1_4w.pdf
data {'mobile': 'XXXXXXX741', 'start': '07/02/2022 00:00:00\n', 'end': '06/02/2023 23:59:59\n ', 'insurer': 'IFFICO-TOKIO', 'type': '4W', 'reg_no': 'UP93U5817'} 

5 icici1_2w.pdf
data {'insurer': 'ICICI', 'mobile': '935627870

data {'insurers_name': 'MR.SITA RAM BAIRWA', 'mobile': '8433707105', 'email': 'jp9433344@gmail.com', 'type': '2W', 'policy_type': 'Stand Alone Own Damage', 'reg_no': 'RJ14AJ7933', 'start': ' 00:00 Hrs on 04-Feb-2022', 'insurer': 'RELIANCE', 'chassis_no': 'ME4JC67DBK8003293', 'end': 'Mid Night of 03/02/2023'} 

33 acko-4w-comp.pdf
data {'insurer': 'ACKO', 'email': 'iXXXXXXXXXXXXXXXXXXXa@yahoo.co.in', 'type': '4W', 'policy_type': 'Package Policy', 'insurers_name': 'MANJU ISHWERI NARAYAN', 'reg_no': 'MH03BS0277', 'chassis_no': 'MAT612354EKD08300', 'mobile': 'XXXXXX2426'} 

34 iffco5_2w.pdf
data {'insurers_name': 'ALAGAPPAN AL ', 'mobile': 'XXXXXXX195', 'reg_no': 'KC20E80135', 'type': '2W', 'start': '02/01/2022 00:00:00\n', 'end': '01/01/2023 23:59:59\n ', 'insurer': 'IFFICO-TOKIO'} 

35 reliance10_2w.pdf
data {'insurers_name': 'MR.AKASH KUMAR SINGH', 'mobile': '8433707105', 'email': 'akashikka@gmail.com', 'type': '2W', 'policy_type': 'Package Policy', 'reg_no': 'KA51Y5671', 'start': ' 00:

data {'insurers_name': 'Mr. LAVJIBHAI MOHANBHAI HAPALIYA', 'insurer': 'ROYAL', 'mobile': 'xxxxxx5540', 'start': ' 00:00:00 hours on 07/04/2022', 'end': '23:59:59 hours on 06/04/2023', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'GJ5JP5327'} 

65 hdfc12_2w.pdf
data {'insurer': 'HDFC', 'mobile': '9902059738', 'email': 'jagadeeshae7272@gmail.com', 'type': '2W', 'policy_type': 'Comprehensive Policy', 'reg_no': 'KA53EM8787', 'start': '16 Jan, 2022 00:01 hrs'} 

66 hdfc11_4w.pdf
data {'insurers_name': 'MR BEERAIAH B ', 'mobile': '9731138386', 'email': 'prakashmbangalore@gmail.com', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'KA53C7581', 'chassis_no': 'MAT600185GPJ34888', 'start': '03 Feb, 2022 00:01 hrs', 'insurer': 'HDFC'} 

67 iffco8_2w.pdf
data {'insurers_name': 'MADHAV RADHEYSHYAM KATIYAR', 'mobile': 'XXXXXXX720', 'reg_no': 'JF50EU80864', 'type': '2W', 'start': '27/01/2022 00:00:00\n', 'end': '26/01/2023 23:59:59\n ', 'insurer': 'IFFICO-TOKIO', 'policy_type': '

In [417]:
df = df.sort_values("reg_no")
# df.filter( ["file", "start", "end"] )
l = list( set( df[ df["chassis_no"].isnull() == True ]["file"] ) )
print( l )
print( len( l ) )

['iffco3_2w.pdf', 'iffco10_2w.pdf', 'bajaj_3p_4w.pdf', 'reliance11_4w.pdf', 'bharti_od_4w.pdf', 'iffco5_2w.pdf', 'united-2w-comp.pdf', 'bharti_hybrid_4w.pdf', 'bajaj_4w_pack.pdf', 'kotak_4w_comp.pdf', 'iffco11_2w.pdf', 'kotak_hybrid_4w.pdf', 'iffco6_2w.pdf', 'national-2w-comp.pdf', 'hdfc_4w_tp.pdf', 'iffico_4w_package.pdf', 'united-4w-tp.pdf', 'hdfc2_4w.pdf', 'magma-4w-comp.pdf', 'edel-4w-comp.pdf', 'bharti_4w_comp.pdf', 'united-4w-comp.pdf', 'iffco2_4w.pdf', 'royalsundaram-4w-comp.pdf', 'iffco4_2w.pdf', 'kotak_4w_3p.pdf', 'iffco1_4w.pdf', 'bajaj_4w_od.pdf', 'reliance9_car.pdf', 'newindia-4w-comp.pdf', 'bharti_3p_4w.pdf', 'kotak6_2w.pdf', 'iffico_3p_4w.pdf', 'kotak8_4w.pdf', 'iffico_4w_package2.pdf', 'kotak11_2w.pdf', 'reliance13_4w.pdf', 'iffco9_2w.pdf', 'iffico_4w_od.pdf', 'hdf_4w_comp.pdf', 'kotak5_2w.pdf', 'hdfc12_2w.pdf', 'kotak9_2w.pdf', 'iffco7_4w.pdf', 'kotak2_2w.pdf', 'kotak7_2w.pdf', 'kotak3_2w.pdf', 'national-4w-comp.pdf', 'kotak10_2w.pdf', 'acko-4w-tp.pdf', 'hdfc1_4w.pdf', 

In [678]:
def test( page ):
    
    page_data = page.search( r'Chass?is\s?(No\s?\.?|Number)\s?:?', regex=True, case=False )
    print( page, len( page_data ) )
    if len( page_data ) > 0:
        for i in range( len( page_data ) ):
            for j in range(14):
#             print( page, page_data[i]['text'] )
                h_offset = ( page_data[i]["x1"] - page_data[i]["x0"] ) / len( page_data[i]["text"] ) * j
                v_offset = ( page_data[i]['bottom'] - page_data[i]['top'] ) * 5
        
                bbox_text = page.crop( [
                     max( page_data[i]["x0"] - h_offset, 0 ),
                     page_data[i]['top'],
                     min( page_data[i]["x1"] + h_offset, page.width ),
                     min( page_data[i]['bottom'] + v_offset, page.height )
                 ], strict=True ).extract_text()
            
                print( repr(bbox_text) )
                text = re.findall( r"M[A-EZ][A-HJ-NPR-Z0-9\n]{14,19}", bbox_text, re.IGNORECASE )

                if len( text ):
                    word = text[0].replace("\n\-", "")
                    if word.lower() == "new" or word[-4:].isnumeric():
                        data["chassis_no"] = word
                        print( "\n", page, "\t\t\t\t\t\tfound chassis_no ", repr(word), "\n" )
                        return


In [683]:
# files = list( df[ df["chassis_no"].isnull() == True ]["file"] )
i = 0
# for file in files:
for file in ["national-4w-comp.pdf"]:
    i = i + 1
    print( i, file )
    policy = pdfplumber.open( f'all_policy/Motor/all_policy/{file}' )

# repr(policy.pages[0].extract_text() )
# policy.pages[0].search( r'chassis no.\sM[A-EZ][A-HJ-NPR-Z0-9]{9}', regex=True, case=False )[0]['text']
    for page in policy.pages:
        print(page)
        test( page )


1 national-4w-comp.pdf
<Page:1>
<Page:1> 1
'Chassis No\nB51BLBM382\nEngine No\n4LABM79389'
'Chassis No\nBB51BLBM382\nEngine No\nG4LABM793892'
'Chassis No\nBB51BLBM3824\nEngine No\nG4LABM793892'
'Chassis No\nLBB51BLBM38243\nEngine No\nG4LABM793892'
'Chassis No\nALBB51BLBM382438\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
'Chassis No\nMALBB51BLBM382438M\nEngine No\nG4LABM793892'
<Page:2>
<Page:2> 0


In [21]:
# df.sort_values("chassis_no")

In [22]:
# df[ df["chassis_no"].isnull() == True ]['file'] 

In [None]:
 'bharti_3p_4w.pdf',   yeah with after/
 'national-4w-comp.pdf', yes 18

In [682]:
len("MALBB51BLBM382438M")

18

In [None]:
# !git add PDF_Reading_Sheet.ipynb
# !git commit -m "added policy_duration extractor"
# !git push