# Importing Libraries

In [1]:
import cv2
import numpy as np
import ftfy
import pytesseract
from PIL import Image
import regex
from fuzzywuzzy import fuzz

# OCR

In [2]:
def OCR(imgpath, lang = 'eng+hin', tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'):
    """Run Tesseract OCR on an image file and return the cleaned-up text.

    Parameters
    ----------
    imgpath : str
        Path of the image, read with ``cv2.imread``.
    lang : str, optional
        Language string handed to ``pytesseract.image_to_string``.
        Defaults to ``'eng+hin'``.
    tesseract_cmd : str or None, optional
        Location of the Tesseract executable. The default is a Windows
        install path; pass another path on other machines, or ``None`` to
        leave pytesseract's current setting untouched.

    Returns
    -------
    str
        The recognised text after ``ftfy.fix_text`` and ``ftfy.fix_encoding``.

    Raises
    ------
    OSError
        If ``cv2.imread`` returns ``None`` for ``imgpath``.
    """
    img = cv2.imread(imgpath)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than with an opaque cvtColor error.
    if img is None:
        raise OSError(f"cv2.imread could not read an image from {imgpath!r}")
    # img = cv2.resize(img, None, fx=2, fy=2,interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    if tesseract_cmd is not None:
        # Note: this sets a module-level pytesseract option, so it also
        # affects later pytesseract calls made outside this function.
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
    text = pytesseract.image_to_string(img, lang = lang)
    text = ftfy.fix_text(text)
    text = ftfy.fix_encoding(text)
    return text

# Fuzzy substring match

In [3]:
def fuzzMatch( major, minor, errs = 4, threshold = 65 ):
    """Report whether `minor` occurs, approximately, as a substring of `major`.

    Both strings are lower-cased first. `minor` is searched for inside
    `major` using the `regex` module's fuzzy matching, with an edit budget
    that grows from 0 up to `errs`; the search stops at the first budget that
    produces a match. The matched substring is then scored against `minor`
    with `fuzz.token_set_ratio`, and the match is accepted only when the
    score is strictly greater than `threshold`.

    Parameters
    ----------
    major : str
        Text to search in.
    minor : str
        Keyword to look for; it is treated as literal text, not as a regex.
    errs : int, optional
        Maximum number of edits allowed in the fuzzy search.
    threshold : int, optional
        Score that `fuzz.token_set_ratio` must exceed.

    Returns
    -------
    int
        1 if an acceptable match was found, otherwise 0.
    """
    major = major.lower()
    minor = minor.lower()
    if not minor:
        # An empty keyword would "match" everywhere; treat it as no match.
        return 0

    # Escape the keyword so characters such as '.', '(' or '+' are matched
    # literally instead of being interpreted as regex syntax.
    literal = regex.escape(minor)

    found = None
    # Budgets 0..errs inclusive (a negative errs still tries the exact search).
    for budget in range(max(errs, 0) + 1):
        found = regex.search(f"({literal}){{e<={budget}}}", major)
        if found is not None:
            break
    if found is None:
        return 0

    # Score the matched *text*. Passing the Match object itself makes the
    # scorer compare against the object's repr, which only succeeds by
    # accident for exact hits. force_ascii=False keeps non-ASCII keywords
    # (e.g. Devanagari) from being stripped before scoring.
    score = fuzz.token_set_ratio(minor, found.group(0), force_ascii=False)
    return 1 if score > threshold else 0

# Defining identification words

In [4]:
# Keywords whose presence in the OCR text identifies each document type.
# Key order matters: classify() returns the first type with a matching keyword.
idKeyWords = {
    "PAN": ["income", "permanent", "account"],
    "DL": ["driving", "drive", "dl no", "transport"],
    "Passport": ["republic", "passport"],
    "Aadhaar": ["aadhaar", "आधार", "uidai"],
}

# Classification

In [5]:
def classify(text):
    """Return the first document type in `idKeyWords` with a keyword found in `text`.

    Document types are tried in the dictionary's order and each keyword is
    tested with `fuzzMatch(text, keyword)`. If no keyword of any type
    matches, the string "other" is returned.
    """
    candidates = (
        label
        for label, keywords in idKeyWords.items()
        if any(fuzzMatch(text, keyword) for keyword in keywords)
    )
    return next(candidates, "other")
    

In [None]:
# def isAadhar(ocrTxt):
#     res = ocrTxt.split()
#     for i in range(len(res) - 3):
#         check = 1
#         for j in range(3):
#             check = check and len(res[i+j]) == 4 and res[i+j].isdigit()
#         if(check):
#             return 1

# Main function

In [6]:
def _hasAadhaarNumber(tokens):
    """Return True if `tokens` contains three consecutive 4-digit tokens."""
    # len(tokens) - 2 start positions, so the final three tokens are checked too.
    return any(
        all(len(tok) == 4 and tok.isdigit() for tok in tokens[start:start + 3])
        for start in range(len(tokens) - 2)
    )


def segregate(filepaths):
    """Group image paths by the document type detected in their OCR text.

    Each path is run through `OCR`, the lower-cased text is labelled with
    `classify`, and the path is appended to that label's list. A text that
    `classify` labels "other" is re-labelled "Aadhaar" when it contains three
    consecutive whitespace-separated tokens of exactly four digits each.

    Parameters
    ----------
    filepaths : iterable of str
        Image paths to process.

    Returns
    -------
    dict
        Maps "PAN", "DL", "Passport", "Aadhaar" and "other" to lists of
        paths, in input order. Any other label returned by `classify` gets
        its own list.
    """
    classifiedDict = {"PAN":[],"DL":[],"Passport":[],"Aadhaar":[],"other":[]}
    for path in filepaths:
        ocrTxt = OCR(path).lower()
        doctype = classify(ocrTxt)
        # Fallback for cards whose keywords were not recognised: look for
        # the printed 4-4-4 digit number instead.
        if doctype == "other" and _hasAadhaarNumber(ocrTxt.split()):
            doctype = "Aadhaar"
        # setdefault avoids a KeyError if classify() returns a label that is
        # not one of the preset buckets.
        classifiedDict.setdefault(doctype, []).append(path)
    return classifiedDict

# Example

In [7]:
# Sample images to classify, given as relative paths.
filepaths = [
    "aadhar.png",
    "aadharlk.png",
    "aadharlkb.png",
    "image.jpg",
]

In [8]:
# Run the full pipeline: OCR each file, then bucket its path by detected document type.
classifieddict = segregate(filepaths)

In [9]:
# Show the resulting mapping of document type -> list of image paths.
classifieddict

{'PAN': ['image.jpg'],
 'DL': [],
 'Passport': [],
 'Aadhaar': ['aadhar.png', 'aadharlk.png', 'aadharlkb.png'],
 'other': []}

In [10]:
# OCR a single sample and lower-case it, to inspect the text that gets classified.
txt = OCR("aadharlkb.png").lower()

In [11]:
# Keyword-based label for the OCR text of that sample.
classify(txt)

'Aadhaar'

In [12]:
# Whitespace-separated tokens of the OCR text.
res = txt.split()


In [13]:
# Print the first token of every run of three consecutive 4-digit tokens in `res`.
# The range stops at len(res) - 2 so that the last three tokens are checked as well.
for i in range(len(res) - 2):
    if all(len(tok) == 4 and tok.isdigit() for tok in res[i:i + 3]):
        print(res[i])

5249


In [14]:
# Exact-token membership test: True only if "male" or "female" is itself one of the tokens in `res`.
("male" in res or "female" in res)

False

In [15]:
# Display the full token list.
# NOTE(review): the rendered output contains a personal name, address and ID digits;
# clear or redact this cell's output before sharing the notebook.
res

['भारतीय',
 'विशिष्ट',
 'पहचान',
 'प्राधिकरण',
 'unique',
 '1ठशञाशीदयांगा',
 '#प019',
 'छ',
 'का',
 'saba',
 'खरे,',
 'हाउस',
 'न.',
 '-26,',
 'गंगा',
 'विहार,',
 'करोली',
 'माता',
 'io,',
 'मंदिर',
 'के',
 'पास,',
 'महलगाँव,',
 'गिर्द,',
 'ग्वालियर,',
 'मध्य',
 'प्रदेश',
 '-',
 '474002',
 'address:',
 'umashankar',
 'khare,',
 'house',
 'no.',
 '-26,',
 'ganga',
 'vihar,',
 'near',
 'karoli',
 'mata',
 'mandir,',
 'mahalgaon,',
 'gird,',
 'gwalior,',
 'madhya',
 'pradesh',
 '-',
 '474002',
 '5249',
 '1788',
 '9093',
 'pci',
 '::',
 '9189',
 '8672.',
 '4995.',
 '5674',
 'gbi007_',
 '|',
 'a',
 'helpe@uidai.gov.iin',
 '|',
 'gd',
 'www.uidai.gov.in']

In [16]:
# Display the raw lower-cased OCR text.
# NOTE(review): the rendered output contains personal data (name, address, ID digits);
# clear or redact this cell's output before sharing the notebook.
txt

'भारतीय विशिष्ट पहचान प्राधिकरण\nunique 1ठशञाशीदयांगा #प019 छ का\n\nsaba खरे, हाउस न. -26, गंगा विहार, करोली माता io,\nमंदिर के पास, महलगाँव, गिर्द, ग्वालियर,\nमध्य प्रदेश - 474002\n\naddress:\n\numashankar khare, house no. -26,\nganga vihar, near karoli mata mandir,\nmahalgaon, gird, gwalior,\n\nmadhya pradesh - 474002\n\n5249 1788 9093\n\npci :: 9189 8672. 4995. 5674\n\ngbi007_ | a helpe@uidai.gov.iin | gd www.uidai.gov.in\n\n'