In [180]:
import re
import os
import numpy as np
import pandas as pd
import pdfplumber
import time
from nltk.tokenize import word_tokenize

from IPython.display import display, HTML
from notebook.services.config import ConfigManager

display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Get all words on page in a list of lists. Each word is represented by:
# [x0, y0, x1, y1, word, bno, lno, wno]
# The first 4 entries are the word's rectangle coordinates, the last 3 are just
# technical info (block number, line number, word number).
# The term 'word' here stands for any string without space.

In [117]:
def find_insurer_by_site(page):
    INSURERS = {
                "ACKO": "acko.com", "BAJAJ": "bajajallianz.com", "BHARTI": "bhartiaxa.com", "CHOLA": "cholainsurance.com",
                "EDELWEISS": "edelweissinsurance.com", "GODIGIT": "godigit.com", "HDFC": "hdfcergo.com", "ICICI": "icicilombard.com",
                "IFFICO-TOKIO": "iffcotokio.co.in", "KOTAK": "kotakgeneralinsurance.com", "LIBERTY": "libertyinsurance.in",
                "MAGMA": "magmahdi.com" , "NAVI": "navi.com", "NEW INDIA": "newindia.co.in", "NATIONAL": "nationalinsuranceindia.nic.co.in",
                "ORIENTAL": "orientalinsurance.org.in", "RELIANCE": "reliancegeneral.co.in", "ROYAL": "royalsundaram.in", "SBI": "sbigeneral.in",
                "SHRIRAM": "shriramgi.com", "TATA": "tataaig.com", "UNITED": "uiic.co.in"
               }
    
    for insurer in INSURERS:
        page_data = page.search( INSURERS[insurer], regex=True, case=False )
        if len( page_data ) and "insurer" not in data:
            data["insurer"] = insurer

In [118]:
def get_insurer_by_name(page):
    INSURERS = [
        "ACKO", "BAJAJ", "BHARTI", "CHOLA", "EDELWEISS", "GODIGIT", "HDFC", "ICICI", "IFFICO-TOKIO", "KOTAK", "LIBERTY",
        "MAGMA", "NAVI", "NEW INDIA", "NATIONAL", "ORIENTAL", "RELIANCE", "ROYAL", "SBI", "SHRIRAM", "TATA", "UNITED"
    ]
  
    for insurer in INSURERS:
        page_data = page.search( insurer, regex=True, case=False )
        if len( page_data ) and "insurer" not in data:
            data["insurer"] = insurer
            

In [119]:
def find_mobile( page ):
    page_data = page.search( r'\s(\+91\-|0)?[6-9X][0-9X]{9}(,|\s)', regex=True, case=False )
    if len( page_data ) and "mobile" not in data:
        data["mobile"] = page_data[0]["text"].strip()
        data["mobile"] = data["mobile"][0:-1] if data["mobile"][-1] == "," else data["mobile"]

In [120]:
def find_email(page):
    page_data = page.search( r'[a-z0-9\.\-]+@+(gmail|yahoo|outlook|hotmail)+(\.com|\.co\.in)+', regex=True, case=False )
    if len( page_data ) and "email" not in data:
        data["email"] = page_data[0]["text"].strip() if len( data ) else None

In [121]:
def find_vehicle_type( page ):
    type_regex = [ r'(two|2)\s?\-?\s?wheeler\s?', r'(four|4)\s?\-?\s?wheeler\s?', r'(MOTORCYCLE|SCOOTER|BIKE)', r'(private)?\s?\-?\s?(car\s|car$)' ]
    for i in range(4):
        if len( page.search( type_regex[i], regex=True, case=False ) ) > 0:
            data["type"] = "4W" if i%2 else "2W" 
            break
    
#     (policy|package|hybrid|comprehensive|certificate|secure)

In [122]:
def find_policy_type( page ):
    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Policy)?\s?\-?\s?Stand\s?\-?\s?alone\s?\-?\s?O(wn)?\s?D(amage)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Stand Alone Own Damage"
        return

    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?Package\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Package Policy"
        return
    
    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?Comprehensive\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Comprehensive Policy"
        return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Liability\s?\-?\s?(Only)?\s?\-?\s?(Policy)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Liability Policy"
        return
    
    page_data = page.search( r'Liability\s?\-?\s?(Only)?\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?\s?\-?\s?Private\s?\-?\s?Car', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Liability Policy"
        return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Package\s?\-?\s?(Policy)?\s?\-?\s?(Insurance)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Package Policy"
        return

    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Comprehensive\s?\-?\s?(Policy)?\s?\-?\s?(Insurance)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Comprehensive Policy"
        return
 
    page_data = page.search( r'Comprehensive\s?\-?\s?(Bike|Car)?\s?\-?\s?(Insurance)?\s?\-?\s?Policy', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Comprehensive Policy"
        return
    
    page_data = page.search( r'Package\s?\-?\s?(Bike|Car)?\s?\-?\s?(Insurance)?\s?\-?\s?Policy', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Package Policy"
        return

In [123]:
def find_reg_no( page ):
    page_data = page.search( r'Registration\s(No\s?\.?|Number|Mark)\s*:?\s*[A-Z]{2}\s?\-?\s?[0-9]{1,2}\s?\-?\s?[A-Z]{1,3}\s?\-?\s?[0-9]{2,5}', regex=True, case=False )
    if len( page_data ) > 0:
        reg_number = re.split('\s|\:', page_data[0]["text"] )[-1]
        data["reg_no"] = reg_number.replace( '-', "" )

In [124]:
def find_chassis_no( page ):
    page_data = page.search( r'Chass?is\s(No\s?\.?|Number)\s*:?\s*M[A-EZ][A-HJ-NPR-Z0-9]{14,19}', regex=True, case=False )
    if len( page_data ) > 0:
        chassis_number = re.split('\s|:', page_data[0]["text"] )[-1]
        data["chassis_no"] = chassis_number.replace( '-', "" )

In [125]:
def find_chassis_no_with_slash( page ):
    key = page.search( r'Chass?is\s(No\s?\.?|Number)\s?/', regex=True, case=False )
    if len( key ):
        chassis_number = page.search( r'M[A-EZ][A-HJ-NPR-Z0-9]{15,19}\s?/', regex=True, case=False )
        if len( chassis_number ) > 0:
            data["chassis_no"] = chassis_number[0]["text"].replace( r'\/', "" )
            return
    
    key = page.search( r'/\s?Chass?is\s(No\s?\.?|Number)', regex=True, case=False )
    if len( key ):
        chassis_number = page.search( r'/\s?M[A-EZ][A-HJ-NPR-Z0-9]{15,19}', regex=True, case=False )
        if len( chassis_number ) > 0:
            data["chassis_no"] = chassis_number[0]["text"].replace( r'\/', "" )
            return

In [126]:
def find_insured_name( page ):
    page_data = page.search( r'(Mr|Mrs?|M\/s)(\.\s?|\s)[A-Z]{1,20}\s[A-Z]{0,20}\s[A-Z]{0,20}', regex=True, case=False )
    if len( page_data ) and "insurers_name" not in data:
        data["insurers_name"] = page_data[0]["text"]
        return
    
    page_data = page.search( r'insured(\'s)?\s*name(\s|\s?:\s?)(Mr|Mrs?|M\/s|Ms\.)?(\.\s?|\s)?s*[A-Z]{1,20}\s[A-Z]{0,20}\s[A-Z]{0,20}', regex=True, case=False )
    if len( page_data ) and "insurers_name" not in data:
        insurer_name = re.sub( "Insured(\'s)?\s*Name\s*:?\s*", "", page_data[0]["text"] )
        data["insurers_name"] = insurer_name

In [211]:
def find_policy_duration( page ):
    #date_time dd/mm/yyyy hh:mm:ss
    start_date = page.search( r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?([0-9]{4})(\s*|T)(\[|\()?\s*' + 
                              r'(00|12):(00|01)(\]|\))?\s?(:00|:01)?(\]|\))?\s?(hrs|hours|AM)?',
                             regex=True, case=False 
                            )    
    end_date  =  page.search( r'[0-9]{1,2}[\-\/\]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?([0-9]{4})(\s*|T)(\[|\()?s?' +
                              r'(\(?Mid\s?night\)?|11:59|11:59:59|23:59:59)\s?(\]|\))?\s?(hrs|hours|PM)?(\(?Mid\s?night\))?',
                             regex=True, case=False
                            )
    if len( start_date ) > 0 and "start" not in data :
        data["start"] = start_date[0]['text']
    if len( end_date ) > 0 and "end" not in data :
        data["end"] = end_date[0]['text']
        
    
    #time_data  hh:mm:ss dd/mm/yyyy
    start_date = page.search( r'(\[|\()?\s?(00|12):(00|01)(\]|\))?\s?(:00|:01)?\s?(hrs|hours|AM)?\s?(\]|\))?\s*(T|on|of)\s*' +
                              r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?[0-9]{4}',
                             regex=True, case=False 
                            )
    end_date = page.search( r'(\[|\()?s?(\(?Mid\s?night\)?|11:59|11:59:59|23:59:59)s?(\]|\))?\s?(hrs|hours|PM)?\s*(T|on|of)\s*' +
                            r'[0-9]{1,2}[\-\/\,]?\s?[A-Z0-9]{1,3}[\-\/\,]?\s?[0-9]{4}(\(?Mid\s?night\))?',
                           regex=True, case=False
                          )
    if len( start_date ) > 0 and "start" not in data :
        data["start"] = start_date[0]['text']
    if len( end_date ) > 0 and "end" not in data :
        data["end"] = end_date[0]['text']
    

In [215]:
df = pd.DataFrame( columns=[ "file", "pages", "insurers name", "type", "reg_no", "insurer", "phone", "email", "policy_type", "chassis_no", "start", "end", "time" ] )
files = os.listdir("./all_policy/Motor/misc")

count = 0

for file in files:
# for file in [ "hdfc10.pdf" ]:
    count += 1
    print( count, file )
    
    start_time = time.time()
    
    policy = pdfplumber.open( f'all_policy/Motor/misc/{file}' )
    
    insurer_count = {}
    data = {}

    for page in policy.pages:
        find_insured_name( page )
        find_insurer_by_site( page ) if "insurer" not in data else None
        find_mobile( page ) if "mobile" not in data else None
        find_email( page ) if "email" not in data else None
        find_policy_type( page ) if "policy_type" not in data else None
        find_reg_no( page ) if "reg_no" not in data else None
        find_chassis_no( page ) if "chassis_no" not in data else None
        find_vehicle_type( page ) if "type" not in data else None
        find_policy_duration( page ) if "duration" not in data else None
        
    for page in policy.pages:
        if "insurer" not in data:
            get_insurer_by_name( page )
            
        if "chassis_no" not in data:
            find_chassis_no_with_slash( page )


    print( "data", data, "\n" )
    end_time = time.time()

    df.loc[ len( df.index ) ] = [ 
        file,
        len( policy.pages ),
        data["insurers_name"] if "insurers_name" in data else None,
        data["type"] if "type" in data else None,
        data["reg_no"] if "reg_no" in data else None,
        data["insurer"] if "insurer" in data else None,
        data["mobile"] if "mobile" in data else None,        
        data["email"] if "email" in data else None,
        data["policy_type"] if "policy_type" in data else None,
        data["chassis_no"] if "chassis_no" in data else None,
        data["start"] if "start" in data else None,
        data["end"] if "end" in data else None,
        ( end_time - start_time )
    ]
#     break
    
#     https://github.com/jsvine/pdfplumber

1 hdfc13_4w.pdf
data {'insurers_name': 'MR MAYUR MADHUKAR PATIL', 'mobile': '9096749797', 'email': 'mayur.m.patil9@gmail.com', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'MH11AW9594', 'chassis_no': 'MCA11835E07029773EMZ', 'start': '03 Nov, 2021 00:01 hrs', 'insurer': 'HDFC'} 

2 reliance11_4w.pdf
data {'insurers_name': 'MS.PRIYA AWASTHI ', 'mobile': '8433707105', 'email': 'dhirajawasthi@yahoo.com', 'reg_no': 'GJ15CK8239', 'type': '4W', 'start': ' 00:00 Hrs on 21-Jan-2022', 'insurer': 'RELIANCE', 'policy_type': 'Package Policy', 'end': 'Mid Night of 20/01/2023'} 

3 iffco1_4w.pdf
data {'mobile': 'XXXXXXX741', 'start': '07/02/2022 00:00:00\n', 'end': '06/02/2023 23:59:59\n ', 'insurer': 'IFFICO-TOKIO', 'type': '4W'} 

4 icici1_2w.pdf
data {'insurer': 'ICICI', 'mobile': '9356278702', 'reg_no': 'TS09EE9725', 'chassis_no': 'ME4JF502LET601661', 'type': '2W', 'email': '2000@YAHOO.COM', 'policy_type': 'Package Policy', 'start': '02, 2022 12:00:00 '} 

5 reliance2_2w.pdf
data {'in

data {'insurers_name': 'Mr. Bharat Yashwant Bangar', 'insurer': 'KOTAK', 'mobile': '9594880123', 'type': '2W', 'email': 'DOSTITOUR@GMAIL.COM', 'policy_type': 'Comprehensive Policy', 'start': '11/10/2021 00:00 ', 'end': '10/10/2022Midnight\n '} 

33 hdfc9_2w.pdf
data {'insurer': 'HDFC', 'mobile': '7979971843', 'email': 'navneetbhg2020@gmail.com', 'type': '2W', 'policy_type': 'Comprehensive Policy', 'reg_no': 'JH01BC4249', 'chassis_no': 'ME4JC36KKD7311586', 'start': '16 Jan, 2022 00:01 hrs'} 

34 kotak5_2w.pdf
data {'insurers_name': 'Mr. Sudesh R Morajkar', 'insurer': 'KOTAK', 'mobile': '9822122510', 'type': '2W', 'email': 'SUDESH.ARTIS@GMAIL.COM', 'policy_type': 'Comprehensive Policy', 'start': '07/01/2022 00:00 ', 'end': '06/01/2023Midnight\n '} 

35 united-4w-tp.pdf
data {'insurer': 'UNITED', 'mobile': '7904149939', 'type': '4W', 'start': ' 00:00 Hrs on 03-Mar-2022', 'end': 'Midnight on 02-Mar-2023', 'email': 'veludivxece2018@gmail.com', 'policy_type': 'Liability Policy'} 

36 newindi

data {'insurers_name': 'MS.ASTALAKSHMI R ', 'mobile': '8433707105', 'email': 'SKFINANCE01@GMAIL.COM', 'type': '2W', 'policy_type': 'Stand Alone Own Damage', 'reg_no': 'TN11AK9898', 'start': ' 00:00 Hrs on 18-Feb-2022', 'insurer': 'RELIANCE', 'chassis_no': 'ME1SED14AJ0042881', 'end': 'Mid Night of 17/02/2023'} 

67 reliance6_4w.pdf
data {'insurers_name': 'MR.RAKESH BALDEVBHAI PATEL', 'mobile': '8433707105', 'email': 'rakesh11566@gmail.com', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'GJ06DG2771', 'start': ' 00:00 Hrs on 21-Jan-2022', 'insurer': 'RELIANCE', 'chassis_no': 'MALBB51BR9M122064M'} 



In [216]:
df = df.sort_values("file")
# df.filter( ["file", "start", "end"] )
df


Unnamed: 0,file,pages,insurers name,type,reg_no,insurer,phone,email,policy_type,chassis_no,start,end,time
52,acko-2w-comp.pdf,2,Geetinder Singh,2W,PB10FS2630,ACKO,XXXXXX9111,gXXXXXXXXXXXXi@yahoo.com,Comprehensive Policy,ME3U3K5C0GD042682,,,0.635562
23,acko-4w-comp.pdf,6,MANJU ISHWERI NARAYAN,4W,MH03BS0277,ACKO,XXXXXX2426,iXXXXXXXXXXXXXXXXXXXa@yahoo.co.in,Package Policy,MAT612354EKD08300,,,3.486937
17,acko-4w-tp.pdf,2,TEISOVINUO LOTHA,4W,NL07T0267,ACKO,XXXXXX8324,sXXXXXXXXXXXXXXXXXXXX6@gmail.com,Liability Policy,,,,0.584177
15,chola-2w-comp.pdf,2,,2W,TN10BC4732,CHOLA,9841198260,,,MD626AG78J2D14142,17/04/2022 00:01 hours,midnight on 16/04/2023,0.470728
37,edel-4w-comp.pdf,3,Mr. Pronojit Sen,4W,,EDELWEISS,,,Package Policy,,[00:00] of 12-Apr-2022,,1.364286
62,hdfc10_4w.pdf,2,MR MANJU DEVI,4W,HP12D2172,HDFC,9816086889,chandan.ranote@gmail.com,Liability Policy,MALAA51HR9M486097L,"16 Jan, 2022 00:01 hrs",,0.7089
46,hdfc11_4w.pdf,3,MR BEERAIAH B,4W,KA53C7581,HDFC,9731138386,prakashmbangalore@gmail.com,Package Policy,MAT600185GPJ34888,"03 Feb, 2022 00:01 hrs",,1.364823
45,hdfc12_2w.pdf,3,,2W,KA53EM8787,HDFC,9902059738,jagadeeshae7272@gmail.com,Comprehensive Policy,,"16 Jan, 2022 00:01 hrs",,2.69736
0,hdfc13_4w.pdf,3,MR MAYUR MADHUKAR PATIL,4W,MH11AW9594,HDFC,9096749797,mayur.m.patil9@gmail.com,Package Policy,MCA11835E07029773EMZ,"03 Nov, 2021 00:01 hrs",,2.554466
59,hdfc1_4w.pdf,3,MR PARIKSHIT MALHOTRA,4W,HP33F5772,HDFC,9418450772,drpm1972@gmail.com,Comprehensive Policy,,"18 Jan, 2022 00:01 hrs",17/01/2023 Midnight,1.230346


In [202]:
len( "17/04/2022" )

10