In [1]:
#import the required modules


import re
from PIL import Image
import cv2
import pytesseract
from pytesseract import pytesseract, Output
import PIL
from PIL import Image, ImageFilter
from datetime import datetime
from dateutil import parser
import spacy
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Load the sample image

In [2]:
# Load the input image of the document.
# Only image formats such as jpg, jpeg and png are supported.

def load_image(img_path):
    """Read the document image stored at ``img_path`` with ``cv2.imread``.

    Parameters
    ----------
    img_path : str
        Path of the image file to load.

    Returns
    -------
    The image object returned by ``cv2.imread``.

    Raises
    ------
    FileNotFoundError
        If OpenCV could not read an image from ``img_path``.
    """
    image = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface later as an obscure error in the preprocessing.
    if image is None:
        raise FileNotFoundError("Could not read image: {!r}".format(img_path))
    return image

## Preprocessing

###  Apply Blur to smooth image

In [3]:
def apply_smoothing(img):
    """Return ``img`` blurred by ``cv2.blur`` using a 3x3 kernel."""
    kernel_size = (3, 3)
    return cv2.blur(img, kernel_size)

### Convert to Grayscale

In [4]:
def convert_grayscale(img):
    """Return ``img`` converted with the ``cv2.COLOR_BGR2GRAY`` colour code."""
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return gray

### Adjust the brightness and contrast 


In [5]:
def adjust_saturation(img, alpha=1.7, beta=10):
    """Adjust the contrast and brightness of ``img``.

    Despite its name, this function does not touch colour saturation: it
    forwards ``alpha`` and ``beta`` to ``cv2.convertScaleAbs``.

    Parameters
    ----------
    img :
        Image to adjust.
    alpha : float, optional
        Contrast control (1.0-3.0). Defaults to 1.7.
    beta : float, optional
        Brightness control (0-100). Defaults to 10.

    Returns
    -------
    The image returned by ``cv2.convertScaleAbs``.
    """
    return cv2.convertScaleAbs(img, alpha=alpha, beta=beta)

### Apply Adaptive Threshold on image

In [6]:
def apply_threshold(img):
    """Binarise ``img`` with ``cv2.adaptiveThreshold``.

    Uses ``cv2.ADAPTIVE_THRESH_MEAN_C`` with ``cv2.THRESH_BINARY``, a maximum
    value of 255, a block size of 11 and a constant of 2.
    """
    max_value = 255
    block_size = 11
    constant = 2
    return cv2.adaptiveThreshold(
        img,
        max_value,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        block_size,
        constant,
    )

### Combine preprocessing

In [7]:
# Combine all of the preprocessing methods above and return the final preprocessed image for text extraction

def combine_preprocessing(img):
    """Run the full preprocessing chain on ``img`` and return the result.

    The steps are applied in this order: grayscale conversion,
    contrast/brightness adjustment, adaptive thresholding, smoothing.
    """
    pipeline = (
        convert_grayscale,
        adjust_saturation,
        apply_threshold,
        apply_smoothing,
    )
    for step in pipeline:
        img = step(img)
    return img

## Text Extraction from preprocessed image

In [8]:
# Use the pytesseract module to extract the text from the preprocessed image

def extract_text(img):
    """Extract text from ``img`` with two Tesseract passes.

    The first pass uses pytesseract's default settings, the second the custom
    ``--oem 3 --psm 6`` configuration. Both outputs are returned as a single
    string so that the field extractors can search either of them.

    Parameters
    ----------
    img :
        Preprocessed image handed to ``pytesseract.image_to_string``.

    Returns
    -------
    str
        Output of the first pass, a newline, then output of the second pass.
    """
    custom_config = r'--oem 3 --psm 6 '
    default_text = pytesseract.image_to_string(img)
    custom_text = pytesseract.image_to_string(img, config=custom_config)
    # Join with an explicit newline: plain concatenation fuses the last line
    # of the first pass with the first line of the second pass whenever the
    # first output does not end in a line break. Callers split on '\n' and
    # drop empty lines, so an extra blank line is harmless.
    return '\n'.join((default_text, custom_text))
    
    

## Specific methods for license information extraction from the extracted text

### Extract Citizenship Number

In [9]:
def extract_citizenship_no(text, text_list):
    """Extract the citizenship number from the OCR output.

    Three strategies are tried in order and the first hit is returned:

    1. a ``dd-dd-dd-ddddd`` number ('.' or '-' as separator) anywhere in ``text``;
    2. a ``digits/digits`` number anywhere in ``text``;
    3. the first run of digits on the first line of ``text_list`` that
       contains 'Citizen' and has any digits.

    Parameters
    ----------
    text : str
        Full OCR text.
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    str or None
        The citizenship number, or ``None`` when nothing matches (including
        when ``text_list`` is empty).
    """
    for pattern in (r"\d{2}[.-]\d{2}[.-]\d{2}[.-]\d{5}", r"\d+[/]\d+"):
        match = re.search(pattern, text)
        if match:
            return match.group(0)

    for line in text_list:
        if 'Citizen' in line:
            match = re.search(r'\d+', line)
            if match:
                return match.group(0)

    # Explicit default: previously the result variable could be left unbound.
    return None

### Extract License Number

In [10]:
def extract_lic_no(text):
    """Extract the licence number from the OCR text.

    Looks for ``dd-dd-dddddddd`` first and falls back to ``ddd-dddddd``
    ('.' or '-' as separator in both cases).

    Parameters
    ----------
    text : str
        Full OCR text.

    Returns
    -------
    str or None
        The first match of the first pattern that matches, else ``None``.
    """
    # Raw strings: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    patterns = (r"\d{2}[.-]\d{2}[.-]\d{8}", r"\d{3}[.-]\d{6}")
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

### Extract Phone Number

In [11]:
def extract_ph_no(text):
    """Extract a phone number (10 digits, or 9 as a fallback) from the OCR text.

    Digit runs of exactly 10 and then exactly 9 digits are preferred, so that
    a longer number appearing earlier in the text is not cut down to 10
    digits and reported as the phone number. If no run has exactly that
    length, the first 10 digits of a longer run are returned, which keeps the
    earlier lenient behaviour.

    Parameters
    ----------
    text : str
        Full OCR text.

    Returns
    -------
    str or None
        The phone number, or ``None`` when the text has no run of at least
        nine digits.
    """
    patterns = (
        r"(?<!\d)\d{10}(?!\d)",  # stand-alone 10-digit run
        r"(?<!\d)\d{9}(?!\d)",   # stand-alone 9-digit run
        r"\d{10}",               # lenient: first 10 digits of a longer run
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return None

### Extract Blood Group

In [12]:
def extract_bg(text, text_list):
    """Extract the blood group from the OCR output.

    First looks anywhere in ``text`` for A, B, AB or O followed by '+' or '-'.
    If that fails, scans the lines containing 'BG' for A, B, AB or O followed
    by a space and returns the letters without a sign.

    Parameters
    ----------
    text : str
        Full OCR text.
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    str or None
        The blood group (e.g. ``'AB+'`` or ``'O'``), or ``None`` when nothing
        matches (including when ``text_list`` is empty).
    """
    match = re.search(r"(A|B|AB|O)(\+|-)", text)
    if match:
        return match.group(1) + match.group(2)

    for line in text_list:
        if 'BG' in line:
            match = re.search(r'(A|B|AB|O) ', line)
            if match:
                return match.group(1)

    # Explicit default: previously the result variable could be left unbound.
    return None

### Extract the name of the License Holder

In [13]:
def extract_name(text, text_list):
    """Extract the holder's first, middle and last name.

    The name is taken from the first line of ``text_list`` that contains
    'Name' and has at least one word of three or more consecutive capital
    letters; shorter capital runs (such as the 'N' of 'Name') are ignored.

    * three or more words: first, middle, and the rest joined as last name;
    * two words: first and last name, empty middle name;
    * one word: first name only.

    Parameters
    ----------
    text : str
        Full OCR text (unused, kept for a uniform extractor signature).
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    tuple of (str, str, str)
        ``(first_name, middle_name, last_name)``; all three are empty strings
        when no name is found.
    """
    for line in text_list:
        if 'Name' not in line:
            continue
        # '[A-Z]{3,}' is equivalent to matching every capital run and then
        # discarding those of length <= 2.
        words = re.findall(r'[A-Z]{3,}', line)
        if len(words) > 2:
            return words[0], words[1], ' '.join(words[2:])
        if len(words) == 2:
            return words[0], '', words[1]
        if words:
            return words[0], '', ''
        # A 'Name' line without usable words used to raise IndexError; keep
        # looking on the following lines instead.

    return '', '', ''

### Extract Address

In [14]:
def extract_address(text, text_list):
    """Extract the address from the OCR output.

    Uses the first line of ``text_list`` that contains 'Address'. Colons are
    removed, the words after the 'Address' label that are longer than one
    character are kept, and the following line (when there is one) is
    appended as the continuation of the address.

    Parameters
    ----------
    text : str
        Full OCR text (unused, kept for a uniform extractor signature).
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    str or None
        The address, or ``None`` when no line contains 'Address'.
    """
    for line_pos, line in enumerate(text_list):
        if 'Address' not in line:
            continue

        tokens = [token.replace(':', '') for token in line.split(' ')]
        try:
            label_pos = tokens.index('Address')
        except ValueError:
            # The label is fused with other characters (e.g. 'Address-'); the
            # line contains 'Address', so some token is guaranteed to hold it.
            label_pos = next(
                pos for pos, token in enumerate(tokens) if 'Address' in token
            )

        parts = [token for token in tokens[label_pos + 1:] if len(token) > 1]
        address = ' '.join(parts)
        # 'Address' may be the last line: only append a continuation line
        # when one exists instead of raising IndexError.
        if line_pos + 1 < len(text_list):
            address = address + ' ' + text_list[line_pos + 1]
        return address

    return None

### Extract License Office

In [15]:
def extract_lisence_office(text, text_list):
    """Extract the issuing licence office from the OCR output.

    Uses the first line of ``text_list`` that contains 'Office'. Colons are
    removed (as the address and category extractors do) and the words after
    the 'Office' label that are longer than one character are returned.

    Parameters
    ----------
    text : str
        Full OCR text (unused, kept for a uniform extractor signature).
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    str or None
        The office name, or ``None`` when no line contains 'Office'.
    """
    for line in text_list:
        if 'Office' not in line:
            continue

        # Without stripping ':' a label written as 'Office:' raised ValueError.
        tokens = [token.replace(':', '') for token in line.split(' ')]
        try:
            label_pos = tokens.index('Office')
        except ValueError:
            # Label fused with other characters; the line contains 'Office',
            # so some token is guaranteed to hold it.
            label_pos = next(
                pos for pos, token in enumerate(tokens) if 'Office' in token
            )

        return ' '.join(
            token for token in tokens[label_pos + 1:] if len(token) > 1
        )

    return None

### Extract License Category

In [16]:
def extract_category(text, text_list):
    """Extract the licence category letters from the OCR output.

    Uses the first line of ``text_list`` that contains 'Category'. After
    removing colons, everything following the 'Category' label is searched
    for single capital letters or pairs such as ``A,B`` / ``A-B`` / ``A.B``,
    and the hits are joined with ', '.

    Parameters
    ----------
    text : str
        Full OCR text (unused, kept for a uniform extractor signature).
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    str or None
        The categories (possibly an empty string when the label is followed
        by no capital letter), or ``None`` when no line contains 'Category'
        or the label cannot be isolated as its own word.
    """
    for line in text_list:
        if 'Category' not in line:
            continue

        tokens = [token.replace(':', '') for token in line.split()]
        try:
            label_pos = tokens.index('Category')
        except ValueError:
            # Label fused with other characters: give up, as before, but
            # without a bare `except` hiding unrelated errors.
            return None

        remainder = ' '.join(tokens[label_pos + 1:])
        found = re.findall(r'[A-Z][,.-][A-Z]|[A-Z]', remainder)
        return ', '.join(found)

    # Explicit default: with an empty text_list the result was left unbound.
    return None

### Extract Date of Expiry

In [17]:
def extract_doe(text, text_list):
    """Extract the date of expiry from the OCR output.

    1. The first line of ``text_list`` that contains 'D.O.E' and a
       ``dd-dd-dddd`` date ('.' or '-' as separator) wins; the date is
       returned exactly as it was read.
    2. Otherwise every such date in ``text`` is parsed day-first and the
       first one lying in the future is returned as ``YYYY-MM-DD``.

    Parameters
    ----------
    text : str
        Full OCR text.
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    str or None
        The expiry date, or ``None`` when none is found.
    """
    date_pattern = r"\d{2}[.-]\d{2}[.-]\d{4}"

    for line in text_list:
        if "D.O.E" in line:
            match = re.search(date_pattern, line)
            if match:
                return match.group(0)

    # The fallback runs once, after all lines were checked. It used to sit
    # inside the loop, where it ran for every line and could read the result
    # variable before it was assigned.
    now = datetime.now()
    for candidate in re.findall(date_pattern, text):
        try:
            # Dates on the document are day-first (e.g. 31-12-2017); parse
            # them explicitly so 10-11-1993 is not read as October 11th.
            parsed = datetime.strptime(
                re.sub(r"[.-]", "-", candidate), "%d-%m-%Y"
            )
        except ValueError:
            # OCR noise such as 39.42-2022 is not a real date.
            continue
        if parsed > now:
            return parsed.date().isoformat()

    return None

### Extract Date of Birth

In [18]:
from dateutil.relativedelta import relativedelta

def get_date_difference(date1, date2):
    """Return the absolute ``years`` component of ``relativedelta(date1, date2)``."""
    delta = relativedelta(date1, date2)
    return abs(delta.years)


def extract_dob(text, text_list):
    """Extract the date of birth from the OCR output.

    1. The first line of ``text_list`` that contains 'D.O.B' and a
       ``dd-dd-dddd`` date ('.' or '-' as separator) wins; the date is
       returned exactly as it was read.
    2. Otherwise every such date in ``text`` is parsed day-first, in order of
       appearance, and the first past date that is more than 18 and less
       than 100 years old (per ``get_date_difference``) is returned as
       ``YYYY-MM-DD``.

    Parameters
    ----------
    text : str
        Full OCR text.
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    str or None
        The date of birth, or ``None`` when none is found.
    """
    date_pattern = r"\d{2}[.-]\d{2}[.-]\d{4}"

    for line in text_list:
        if "D.O.B" in line:
            match = re.search(date_pattern, line)
            if match:
                return match.group(0)

    now = datetime.now()
    # dict.fromkeys removes duplicates while keeping the order of appearance;
    # a set made the chosen date vary from run to run.
    for candidate in dict.fromkeys(re.findall(date_pattern, text)):
        try:
            # Dates on the document are day-first (e.g. 10-11-1993 is the
            # 10th of November), so parse them explicitly.
            parsed = datetime.strptime(
                re.sub(r"[.-]", "-", candidate), "%d-%m-%Y"
            )
        except ValueError:
            # OCR noise that does not form a real date.
            continue
        age = get_date_difference(parsed, now)
        # get_date_difference is absolute, so also require a past date: a
        # far-future date must not be mistaken for a date of birth.
        if parsed < now and 18 < age < 100:
            return parsed.date().isoformat()

    return None

### Extract Date of Issue

In [19]:
def extract_doi(text, text_list):
    """Extract the date of issue from the OCR output.

    1. The first line of ``text_list`` that contains 'D.O.I' and a
       ``dd-dd-dddd`` date ('.' or '-' as separator) wins; the date is
       returned exactly as it was read.
    2. Otherwise every such date in ``text`` is parsed day-first, in order of
       appearance, and the first past date less than 25 years old (per
       ``get_date_difference``) is returned as ``YYYY-MM-DD``. This is a
       heuristic: any other recent past date in the text qualifies too.

    Parameters
    ----------
    text : str
        Full OCR text.
    text_list : list of str
        The OCR text split into lines.

    Returns
    -------
    str or None
        The date of issue, or ``None`` when none is found.
    """
    date_pattern = r"\d{2}[.-]\d{2}[.-]\d{4}"

    for line in text_list:
        if "D.O.I" in line:
            match = re.search(date_pattern, line)
            if match:
                return match.group(0)

    now = datetime.now()
    # dict.fromkeys removes duplicates while keeping the order of appearance;
    # a set made the chosen date vary from run to run.
    for candidate in dict.fromkeys(re.findall(date_pattern, text)):
        try:
            # Dates on the document are day-first (e.g. 31-12-2017), so parse
            # them explicitly rather than guessing the field order.
            parsed = datetime.strptime(
                re.sub(r"[.-]", "-", candidate), "%d-%m-%Y"
            )
        except ValueError:
            # OCR noise that does not form a real date.
            continue
        if parsed < now and get_date_difference(parsed, now) < 25:
            return parsed.date().isoformat()

    return None

### Combination of all the information extracted

In [20]:
# Combine all of the methods above to extract each data field from the text read from the document

def combine_extracted_info(text):
    """Run every field extractor on ``text`` and collect the results.

    ``text`` is split on newlines and empty lines are dropped; the resulting
    list is handed to the line-based extractors together with the full text.

    Returns
    -------
    dict
        Keys, in order: citizenship_no, lisence_no, phone_no, blood_group,
        first_name, middle_name, last_name, lisence_office, address,
        category, date_of_birth, date_of_issue, date_of_expiry.
    """
    lines = [line for line in text.split('\n') if line != '']

    info = {
        'citizenship_no': extract_citizenship_no(text, lines),
        'lisence_no': extract_lic_no(text),
        'phone_no': extract_ph_no(text),
        'blood_group': extract_bg(text, lines),
    }

    first, middle, last = extract_name(text, lines)
    info['first_name'] = first
    info['middle_name'] = middle
    info['last_name'] = last

    info['lisence_office'] = extract_lisence_office(text, lines)
    info['address'] = extract_address(text, lines)
    info['category'] = extract_category(text, lines)
    info['date_of_birth'] = extract_dob(text, lines)
    info['date_of_issue'] = extract_doi(text, lines)
    info['date_of_expiry'] = extract_doe(text, lines)
    return info

## Main method to collect all the result

In [21]:
# This is the main method, which incorporates all of the methods above.
# It takes the image of the document as input and outputs the extracted information in a systematic way.

def main(image_path):
    """Extract the document fields from the image stored at ``image_path``.

    Chains the whole pipeline: load the image, preprocess it, run the text
    extraction and turn the text into the dictionary of fields.
    """
    preprocessed = combine_preprocessing(load_image(image_path))
    return combine_extracted_info(extract_text(preprocessed))


In [22]:
# Final result: run the whole pipeline (load, preprocess, text extraction,
# field extraction) on the sample image and keep the extracted fields.
result = main('taskA-sample-data/main-sample.jpg')

## Result

### Exact Data

In [23]:
# Ground-truth data exactly as it appears in the document

# Reference values, keyed exactly like the dictionary returned by main().
exact_data = dict(
    citizenship_no='251059/6599',
    lisence_no='03-06-00354234',
    phone_no='9869061498',
    blood_group='AB+',
    first_name='KIRAN',
    middle_name='',
    last_name='LAMA',
    lisence_office='Thulobharyang',
    address='Kakani - 08, Nuwakot, Bagmati,Nepal',
    category='A',
    date_of_birth='10-11-1993',
    date_of_issue='31-12-2017',
    date_of_expiry='30-12-2022',
)


### Extracted Data

In [24]:
# Data extracted from the document: display the dictionary built by main()
result

{'citizenship_no': '251059/6599',
 'lisence_no': '03-06-00354234',
 'phone_no': '9869061498',
 'blood_group': 'AB+',
 'first_name': 'KIRAN',
 'middle_name': '',
 'last_name': 'LAMA',
 'lisence_office': 'te Thulobharyang',
 'address': 'Kakani 08, Nuwakot, Bagmati',
 'category': None,
 'date_of_birth': None,
 'date_of_issue': '2017-12-31',
 'date_of_expiry': '39.42-2022'}

## Evaluation

### Compute Similarity between two text data

In [25]:
def compute_similarity(text1, text2):
    """Return the spaCy similarity of two strings, scaled by 100.

    Parameters
    ----------
    text1, text2 : str or None
        The two values to compare.

    Returns
    -------
    float
        ``0.0`` when either value is ``None``; otherwise
        ``Doc.similarity`` of the two parsed texts multiplied by 100.
    """
    # A missing value scores zero; decide this before touching the model so
    # that no model load is needed for it.
    if text1 is None or text2 is None:
        return 0.0

    # Load the spaCy model once and keep it on the function object: loading
    # it on every call repeats the same expensive work for every field.
    nlp = getattr(compute_similarity, '_nlp', None)
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
        compute_similarity._nlp = nlp

    return nlp(text1).similarity(nlp(text2)) * 100
    

### Evaluate Each data field and calculate their confidence

In [26]:
def evaluate_result(exact_data, result):
    """Score each field of ``result`` against ``exact_data``.

    Returns a dict that maps every key of ``exact_data`` to
    ``compute_similarity(exact_data[key], result[key])``.
    """
    return {
        field: compute_similarity(expected, result[field])
        for field, expected in exact_data.items()
    }
        

### Evaluation Report

In [27]:
# Compare every extracted field with its ground-truth value and display the
# per-field similarity (scaled by 100).
evaluate_result(exact_data, result)

  similarity = doc1.similarity(doc2)


{'citizenship_no': 100.0,
 'lisence_no': 100.0,
 'phone_no': 100.0,
 'blood_group': 100.0,
 'first_name': 100.0,
 'middle_name': 100.0,
 'last_name': 100.0,
 'lisence_office': 74.9599814786962,
 'address': 91.20217842351352,
 'category': 0.0,
 'date_of_birth': 0.0,
 'date_of_issue': 97.64284563983327,
 'date_of_expiry': 92.06335584414785}