In [75]:
import re
import os
import numpy as np
import pandas as pd
import pdfplumber
import time
from nltk.tokenize import word_tokenize

from IPython.display import display, HTML
from notebook.services.config import ConfigManager

display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Get all words on page in a list of lists. Each word is represented by:
# [x0, y0, x1, y1, word, bno, lno, wno]
# The first 4 entries are the word's rectangle coordinates, the last 3 are just
# technical info (block number, line number, word number).
# The term 'word' here stands for any string without space.

In [76]:
def get_insurer_by_site(page):
    INSURERS = {
                "ACKO": "acko.com", "BAJAJ": "bajajallianz.com", "BHARTI": "bhartiaxa.com", "CHOLA": "cholainsurance.com",
                "EDELWEISS": "edelweissinsurance.com", "GODIGIT": "godigit.com", "HDFC": "hdfcergo.com", "ICICI": "icicilombard.com",
                "IFFICO-TOKIO": "iffcotokio.co.in", "KOTAK": "kotakgeneralinsurance.com", "LIBERTY": "libertyinsurance.in",
                "MAGMA": "magmahdi.com" , "NAVI": "navi.com", "NEW INDIA": "newindia.co.in", "NATIONAL": "nationalinsuranceindia.nic.co.in",
                "ORIENTAL": "orientalinsurance.org.in", "RELIANCE": "reliancegeneral.co.in", "ROYAL": "royalsundaram.in", "SBI": "sbigeneral.in",
                "SHRIRAM": "shriramgi.com", "TATA": "tataaig.com", "UNITED": "uiic.co.in"
               }
    
    for insurer in INSURERS:
        page_data = page.search( INSURERS[insurer], regex=True, case=False )
        if len( page_data ) and "insurer" not in data:
            data["insurer"] = insurer

In [77]:
def get_insurer_by_name(page):
    INSURERS = [
        "ACKO", "BAJAJ", "BHARTI", "CHOLA", "EDELWEISS", "GODIGIT", "HDFC", "ICICI", "IFFICO-TOKIO", "KOTAK", "LIBERTY",
        "MAGMA", "NAVI", "NEW INDIA", "NATIONAL", "ORIENTAL", "RELIANCE", "ROYAL", "SBI", "SHRIRAM", "TATA", "UNITED"
    ]
  
    for insurer in INSURERS:
        page_data = page.search( insurer, regex=True, case=False )
        if len( page_data ) and "insurer" not in data:
            data["insurer"] = insurer
            

In [78]:
def find_mobile( page ):
    page_data = page.search( r'\s(\+91\-|0)?[6-9X][0-9X]{9}(,|\s)', regex=True, case=False )
    if len( page_data ) and "mobile" not in data:
        data["mobile"] = page_data[0]["text"].strip()
        data["mobile"] = data["mobile"][0:-1] if data["mobile"][-1] == "," else data["mobile"]

In [79]:
def find_email(page):
    page_data = page.search( r'[a-z0-9\.\-]+@+(gmail|yahoo|outlook|hotmail)+(\.com|\.co\.in)+', regex=True, case=False )
    if len( page_data ) and "email" not in data:
        data["email"] = page_data[0]["text"].strip() if len( data ) else None

In [80]:
def find_vehicle_type( page ):
    type_regex = [ r'(two|2)\s?\-?\s?wheeler\s?', r'(four|4)\s?\-?\s?wheeler\s?', r'(MOTORCYCLE|SCOOTER|BIKE)', r'(private)?\s?\-?\s?(car\s|car$)' ]
    for i in range(4):
        if len( page.search( type_regex[i], regex=True, case=False ) ) > 0:
            data["type"] = "4W" if i%2 else "2W" 
            break
    
#     (policy|package|hybrid|comprehensive|certificate|secure)

In [81]:
def find_policy_type( page ):
    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Policy)?\s?\-?\s?Stand\s?\-?\s?alone\s?\-?\s?O(wn)?\s?D(amage)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
        data["policy_type"] = "Stand Alone Own Damage"
        return

#     page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?Package\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?', regex=True, case=False )
    page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?(Package|Comprehensive)\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "2W"
#         data["policy_type"] = "Package Policy"
        data["policy_type"] = "Comprehensive Policy" if "comprehensive" in str(page_data[0]["text"]).lower else "Package Policy"
        return
    
#     page_data = page.search( r'Two\s?\-?\s?Wheeler\s?\-?\s?(Vehicles)?\s?\-?\s?\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?', regex=True, case=False )
#     if len( page_data ) and "insurance_type" not in data:
#         data["policy_type"] = "Comprehensive Policy"
#         return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Liability\s?\-?\s?(Only)?\s?\-?\s?(Policy)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Liability Policy"
        return
    
    page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?(Package|Comprehensive)\s?\-?\s?(Policy)?\s?\-?\s?(Insurance)?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["type"] = "4W"
        data["policy_type"] = "Comprehensive Policy" if "comprehensive" in str(page_data[0]["text"]).lower else "Package Policy"
        return

#     page_data = page.search( r'Private\s?\-?\s?Car\s?\-?\s?Comprehensive\s?\-?\s?(Policy)?\s?\-?\s?(Insurance)?', regex=True, case=False )
#     if len( page_data ) and "insurance_type" not in data:
#         data["policy_type"] = "Comprehensive Policy"
#         return
    
    page_data = page.search( r'Liability\s?\-?\s?(Only)?\s?\-?\s?(Insurance)?\s?\-?\s?(Policy)?\s?\-?\s?(Only)?\s?\-?\s?Private\s?\-?\s?Car', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Liability Policy"
        return
    
    page_data = page.search( r'Comprehensive\s?\-?\s?Policy\s?\-?\s?', regex=True, case=False )
    if len( page_data ) and "insurance_type" not in data:
        data["policy_type"] = "Comprehensive Policy"
        return

In [82]:
def find_reg_no( page ):
    page_data = page.search( r'Registration\s(No\s?\.?|Number|Mark)\s*:?\s*[A-Z]{2}\s?\-?\s?[0-9]{1,2}\s?\-?\s?[A-Z]{1,3}\s?\-?\s?[0-9]{2,5}', regex=True, case=False )
    if len( page_data ) > 0:
        reg_number = re.split('\s|\:', page_data[0]["text"] )[-1]
        data["reg_no"] = reg_number.replace( '-', "" )

In [83]:
def find_chassis_no( page ):
    page_data = page.search( r'Chass?is\s(No\s?\.?|Number)\s*:?\s*M[A-EZ][A-HJ-NPR-Z0-9]{14,19}', regex=True, case=False )
    if len( page_data ) > 0:
#         data["chassis_no"] = page_data[0]["text"]
#         print( "chassis", page_data[0]["text"] )
        chassis_number = re.split('\s|:', page_data[0]["text"] )[-1]
        data["chassis_no"] = chassis_number.replace( '-', "" )

In [84]:
def find_chassis_no_with_slash( page ):
    key = page.search( r'Chass?is\s(No\s?\.?|Number)\s?/', regex=True, case=False )
    if len( key ):
        chassis_number = page.search( r'M[A-EZ][A-HJ-NPR-Z0-9]{15,19}\s?/', regex=True, case=False )
        if len( chassis_number ) > 0:
            data["chassis_no"] = chassis_number[0]["text"].replace( r'\/', "" )
            return
    
    key = page.search( r'/\s?Chass?is\s(No\s?\.?|Number)', regex=True, case=False )
    if len( key ):
        chassis_number = page.search( r'/\s?M[A-EZ][A-HJ-NPR-Z0-9]{15,19}', regex=True, case=False )
        if len( chassis_number ) > 0:
            data["chassis_no"] = chassis_number[0]["text"].replace( r'\/', "" )
            return

In [102]:
def find_insured_name( page ):
    page_data = page.search( r'(Mr|Mrs?|M\/s)(\.\s?|\s)[A-Z]{1,20}\s[A-Z]{0,20}\s[A-Z]{0,20}', regex=True, case=False )
    if len( page_data ) and "insurers_name" not in data:
#         print( page_data[0]["text"] )
        data["insurers_name"] = page_data[0]["text"]
        return
    
    page_data = page.search( r'insured(\'s)?\s*name(\s|\s?:\s?)(Mr|Mrs?|M\/s|Ms\.)?(\.\s?|\s)?s*[A-Z]{1,20}\s[A-Z]{0,20}\s[A-Z]{0,20}', regex=True, case=False )
    if len( page_data ) and "insurers_name" not in data:
        insurer_name = re.sub( "Insured(\'s)?\s*Name\s*:?\s*", "", page_data[0]["text"] )
#         str(page_data[0]["text"]).replace( "", "", regex=True )
        data["insurers_name"] = insurer_name
#         print( "insurer name :  ", insurer_name )


In [86]:

def find_period(page):
#     date_time_regex = /([0-9]{2}[\-|\/|\s][[0-9]{2}|[a-z]{3}]+[\-|\/|\s|\,]+[20]?[1,2]+[0-9]+)[\s|\-|t]+([0-9]{2}\:[0-9]{2}\:?[0-9]{2}?)?\s?[hrs|hours]*/
#     time_date_regex = /([0-9]{2}\:[0-9]{2}\:?[0-9]{2}?)?\s?[hrs|hours]*\s?[of|on]*\s([0-9]{2}[\-|\/|\s][[0-9]{2}|[a-z]{3}]+[\-|\/|\s|\,]+[20]?[1,2]+[0-9]+)[\s|\-|\n]+/
    
    x = page.search( 'from', regex=True, case=False )
    print( page, [x[0]["text"], x[0]["x0"], x[0]["top"],  x[0]["x1"],  x[0]["bottom"] ] if len( x ) else "" )
    
    x = page.search( 'to', regex=True, case=False )
    print( page, [x[0]["text"], x[0]["x0"], x[0]["top"],  x[0]["x1"],  x[0]["bottom"] ] if len( x ) else "" )

In [103]:
df = pd.DataFrame( columns=[ "file", "pages", "insurers name", "type", "reg_no", "insurer", "phone", "email", "policy_type", "chassis_no", "time" ] )
files = os.listdir("./all_policy/Motor/misc")

count = 0

for file in files:
# for file in [ "hdfc10.pdf" ]:
    count += 1
    print( count, file )
    
    start_time = time.time()
    
    policy = pdfplumber.open( f'all_policy/Motor/misc/{file}' )
    
    insurer_count = {}
    data = {}

    for page in policy.pages:
        find_insured_name( page )
        get_insurer_by_site( page ) if "insurer" not in data else None
        find_mobile( page ) if "mobile" not in data else None
        find_email( page ) if "email" not in data else None
        find_vehicle_type( page ) if "type" not in data else None
        find_policy_type( page ) if "policy_type" not in data else None
        find_reg_no( page ) if "reg_no" not in data else None
        find_chassis_no( page ) if "chassis_no" not in data else None

        
    for page in policy.pages:
        if "insurer" not in data:
            get_insurer_by_name( page )
            
        if "chassis_no" not in data:
            find_chassis_no_with_slash( page )


    print( "data", data, "\n" )
    end_time = time.time()

    df.loc[ len( df.index ) ] = [ 
        file,
        len( policy.pages ),
        data["insurers_name"] if "insurers_name" in data else None,
        data["type"] if "type" in data else None,
        data["reg_no"] if "reg_no" in data else None,
        data["insurer"] if "insurer" in data else None,
        data["mobile"] if "mobile" in data else None,        
        data["email"] if "email" in data else None,
        data["policy_type"] if "policy_type" in data else None,
        data["chassis_no"] if "chassis_no" in data else None,
        ( end_time - start_time )
    ]
#     break
    
#     https://github.com/jsvine/pdfplumber

1 hdfc13_4w.pdf
data {'insurers_name': 'MR MAYUR MADHUKAR PATIL', 'mobile': '9096749797', 'email': 'mayur.m.patil9@gmail.com', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'MH11AW9594', 'chassis_no': 'MCA11835E07029773EMZ', 'insurer': 'HDFC'} 

2 reliance11_4w.pdf
data {'insurers_name': 'MS.PRIYA AWASTHI ', 'mobile': '8433707105', 'email': 'dhirajawasthi@yahoo.com', 'type': '4W', 'reg_no': 'GJ15CK8239', 'insurer': 'RELIANCE', 'policy_type': 'Package Policy'} 

3 iffco1_4w.pdf
data {'mobile': 'XXXXXXX741', 'insurer': 'IFFICO-TOKIO', 'type': '4W'} 

4 icici1_2w.pdf
data {'insurer': 'ICICI', 'mobile': '9356278702', 'type': '2W', 'reg_no': 'TS09EE9725', 'chassis_no': 'ME4JF502LET601661', 'email': '2000@YAHOO.COM', 'policy_type': 'Package Policy'} 

5 reliance2_2w.pdf
data {'insurers_name': 'MR.B SENTHIL KUMAR', 'mobile': '8433707105', 'email': 'senthilkumar@gmail.com', 'type': '2W', 'policy_type': 'Stand Alone Own Damage', 'reg_no': 'TN12AH5335', 'insurer': 'RELIANCE', 'chassis

data {'insurers_name': 'M/S RAJ KUMAR KAIN', 'mobile': '7982382088', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'DL08CNB8249', 'insurer': 'HDFC'} 

43 iffco10_2w.pdf
data {'insurers_name': 'RAJASEKHAR CH ', 'mobile': 'XXXXXXX689', 'type': '2W', 'insurer': 'IFFICO-TOKIO'} 

44 iffco4_2w.pdf
data {'insurers_name': 'RAKESH KUMAR MAURYA', 'mobile': 'XXXXXXX540', 'type': '2W', 'insurer': 'IFFICO-TOKIO'} 

45 royalsundaram-4w-comp.pdf
data {'insurers_name': 'Mr. LAVJIBHAI MOHANBHAI HAPALIYA', 'insurer': 'ROYAL', 'mobile': 'xxxxxx5540', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'GJ5JP5327'} 

46 hdfc12_2w.pdf
data {'insurer': 'HDFC', 'mobile': '9902059738', 'email': 'jagadeeshae7272@gmail.com', 'type': '2W', 'policy_type': 'Comprehensive Policy', 'reg_no': 'KA53EM8787'} 

47 hdfc11_4w.pdf
data {'insurers_name': 'MR BEERAIAH B ', 'mobile': '9731138386', 'email': 'prakashmbangalore@gmail.com', 'type': '4W', 'policy_type': 'Package Policy', 'reg_no': 'KA53C7581', 'ch

In [47]:
re.split('(Insured|Name|:\s)', "Insured's Name: MADHAV RADHEYSHYAM KATIYAR" )[-1]

'MADHAV RADHEYSHYAM KATIYAR'

In [108]:
df = df.sort_values("time")
# df.filter( ["file", "chassis_no"] )
df


Unnamed: 0,file,pages,insurers name,type,reg_no,insurer,phone,email,policy_type,chassis_no,time
26,newindia-2w-comp.pdf,3,RUPAM DAS,2W,AS01DB1446,NEW INDIA,XXXXXX6455,noonmati020@gmail.com,Package Policy,MB8NG4BAHG8239880/,0.422167
15,chola-2w-comp.pdf,2,,2W,TN10BC4732,CHOLA,9841198260,,,MD626AG78J2D14142,0.486808
17,acko-4w-tp.pdf,2,TEISOVINUO LOTHA,4W,NL07T0267,ACKO,XXXXXX8324,sXXXXXXXXXXXXXXXXXXXX6@gmail.com,Liability Policy,,0.579974
35,newindia-4w-comp.pdf,7,GAURAV GOEL,4W,HR26CD8127,NEW INDIA,XXXXXX8030,goel209@yahoo.co.in,Package Policy,,0.616491
11,magma-4w-comp.pdf,2,Mrs. Mrsjayshreeben Kamleshkumar Prajapati,4W,,MAGMA,9725056102,primekamlesh@gmail.com,Package Policy,,0.631324
5,newindia-4w-tp.pdf,7,BIKRAM HAZARIKA,4W,AS12M6347,NEW INDIA,XXXXXX0262,akashchetry799@gmail.com,Liability Policy,MA3ERLF1S00399960/,0.633295
52,acko-2w-comp.pdf,2,Geetinder Singh,2W,PB10FS2630,ACKO,XXXXXX9111,gXXXXXXXXXXXXi@yahoo.com,,ME3U3K5C0GD042682,0.650385
3,icici1_2w.pdf,2,,2W,TS09EE9725,ICICI,9356278702,2000@YAHOO.COM,Package Policy,ME4JF502LET601661,0.67928
20,navi-4w-tp.pdf,8,,4W,KA03ML1507,NAVI,,,Liability Policy,MALAM51CR9M333424,0.760944
6,national-4w-comp.pdf,2,Mr. Edward I,4W,,NATIONAL,9916098981,edwardlawson87@gmail.com,Package Policy,,0.788348


In [73]:
len( "ASHWATHANARAYAN" )

15