# Importing Libraries

In [1]:
import cv2
import numpy as np
import ftfy
import pytesseract
from PIL import Image
import regex
from fuzzywuzzy import fuzz

# OCR

In [20]:
def OCR(imgpath, lang='eng+hin', tesseract_cmd=None):
    """Run Tesseract OCR on an image file and return the cleaned-up text.

    The image is loaded with OpenCV, converted to grayscale, passed to
    ``pytesseract.image_to_string`` and the result is normalised with ``ftfy``.

    Parameters
    ----------
    imgpath : str
        Path of the image to read.
    lang : str, optional
        Tesseract language specification. Defaults to ``'eng+hin'``.
    tesseract_cmd : str, optional
        Path of the Tesseract executable. When omitted, the standard Windows
        install location is used if that file exists; otherwise pytesseract's
        current ``tesseract_cmd`` setting is left untouched.

    Returns
    -------
    str
        The recognised text after ``ftfy`` clean-up.

    Raises
    ------
    FileNotFoundError
        If ``imgpath`` does not point to an existing file.
    ValueError
        If the file exists but OpenCV could not decode it as an image.
    """
    import os  # local import keeps this cell independent of the import cell

    img = cv2.imread(imgpath)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface later as an opaque error inside cvtColor.
    if img is None:
        if not os.path.isfile(imgpath):
            raise FileNotFoundError(f"Image not found: {imgpath!r}")
        raise ValueError(f"Could not decode image: {imgpath!r}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Only point pytesseract at an explicit executable when one was requested
    # or the historical default location is actually present on this machine.
    default_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
    if tesseract_cmd is None and os.path.isfile(default_cmd):
        tesseract_cmd = default_cmd
    if tesseract_cmd is not None:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    text = pytesseract.image_to_string(img, lang=lang)
    text = ftfy.fix_text(text)
    text = ftfy.fix_encoding(text)
    return text

# Fuzzy substring match

In [3]:
def fuzzMatch(major, minor, errs=4, threshold=65):
    """Report whether ``minor`` approximately occurs inside ``major``.

    The keyword is searched with the ``regex`` module's fuzzy matching,
    starting with zero allowed errors and relaxing one error at a time up to
    ``errs``. The first matched substring is then scored against the keyword
    with ``fuzz.token_set_ratio``.

    Parameters
    ----------
    major : str
        Text to search in (e.g. OCR output).
    minor : str
        Keyword to look for.
    errs : int, optional
        Maximum number of errors allowed in the fuzzy search.
    threshold : int, optional
        The score must be strictly greater than this value to count as a match.

    Returns
    -------
    int
        ``1`` if a sufficiently similar substring was found, otherwise ``0``.
    """
    major = major.lower()
    minor = minor.lower()
    if not minor:
        # An empty keyword carries no information; never report a match.
        return 0

    # Escape the keyword so characters such as '.', '(' or '+' are matched
    # literally instead of being interpreted as regex syntax.
    literal = regex.escape(minor)

    found = None
    for allowed in range(max(errs, 0) + 1):
        found = regex.search(f"({literal}){{e<={allowed}}}", major)
        if found is not None:
            break
    if found is None:
        return 0

    # Score the matched *text*; passing the Match object itself would make the
    # scorer compare against the object's string representation.
    # NOTE(review): `threshold` was presumably tuned on the earlier behaviour;
    # re-validate it on sample documents.
    score = fuzz.token_set_ratio(minor, found.group())
    return 1 if score > threshold else 0

# Defining identification words

In [10]:
# Keywords that identify each document type when found in the OCR text.
# Insertion order matters: classify() walks the types in this order and
# returns the first one with a matching keyword.
idKeyWords = {
    "PAN": ["income", "permanent", "account"],
    "DL": ["driving", "drive", "dl no", "transport"],
    "Passport": ["republic", "passport"],
    "Aadhaar": ["aadhaar", "आधार"],
}

# Classification

In [5]:
def classify(text):
    """Return the first document type whose keyword fuzzy-matches ``text``.

    Document types are tried in the iteration order of ``idKeyWords`` and each
    type's keywords are tried in list order via ``fuzzMatch``. If no keyword
    matches, the string ``"other"`` is returned.
    """
    for doc_type, keywords in idKeyWords.items():
        if any(fuzzMatch(text, keyword) for keyword in keywords):
            return doc_type
    return "other"
    

# Main function: segregate images by document type

In [6]:
def _looks_like_aadhaar(text):
    """Heuristic fallback: a gender word plus three consecutive 4-digit tokens."""
    tokens = text.split()
    # Compare case-insensitively so "Male"/"FEMALE" etc. are recognised too.
    gender_words = {"male", "female", "transgender"}
    if not any(token.lower() in gender_words for token in tokens):
        return False
    is_group = [len(token) == 4 and token.isdigit() for token in tokens]
    # Stop two tokens before the end so every window holds exactly three tokens.
    return any(all(is_group[i:i + 3]) for i in range(len(tokens) - 2))


def segregate(filepaths):
    """Group image paths by the document type detected in their OCR text.

    Each path is passed through ``OCR`` and ``classify``. Images that
    ``classify`` labels ``"other"`` get a second chance: if the text contains a
    gender word and three consecutive 4-digit tokens, the image is filed under
    ``"Aadhaar"``.

    Parameters
    ----------
    filepaths : iterable of str
        Paths of the images to classify.

    Returns
    -------
    dict
        Maps ``"PAN"``, ``"DL"``, ``"Passport"``, ``"Aadhaar"`` and ``"other"``
        to the list of paths assigned to that type, in input order.
    """
    classifiedDict = {"PAN": [], "DL": [], "Passport": [], "Aadhaar": [], "other": []}
    for path in filepaths:
        ocrTxt = OCR(path)
        doctype = classify(ocrTxt)
        if doctype == "other" and _looks_like_aadhaar(ocrTxt):
            doctype = "Aadhaar"
        # setdefault guards against a label that is missing from the dict above.
        classifiedDict.setdefault(doctype, []).append(path)
    return classifiedDict

# Example

In [21]:
# Sample images to classify; relative paths are resolved against the
# notebook's working directory.
filepaths = [
    "aadhar.png",
    "aadharlk.png",
    "aadharlkb.png",
    "image.jpg",
]

In [22]:
# Run OCR and classification on every sample image.
classifieddict = segregate(filepaths)

In [23]:
# Display the {document type: [paths]} mapping produced by segregate().
classifieddict

{'PAN': ['image.jpg'],
 'DL': [],
 'Passport': [],
 'Aadhaar': ['aadhar.png', 'aadharlk.png', 'aadharlkb.png'],
 'other': []}

In [18]:
# OCR a single image so its raw text can be inspected in the cells below.
txt = OCR("counteredge.jpg")

In [11]:
# Check whether the Hindi keyword "आधार" fuzzy-matches the OCR text.
# NOTE(review): this cell's execution count (11) is lower than that of the cell
# assigning `txt` (18), so the saved result may come from an earlier `txt`;
# re-run the notebook top to bottom to confirm.
fuzzMatch(txt,"आधार")

1

In [19]:
# Display the raw OCR text.
# NOTE(review): the saved output of this cell contains personal data (a name,
# a date of birth and a 12-digit ID number) — clear or redact the output before
# sharing or committing the notebook.
txt

'a. ख़रे\nLyric Khare\nजन्म लिथे/008: 17/03/2002\nघुरुष/ MALE\n\n5249 1788 9093 (ae\nFraT आध्यार, म्नेरी पहलच्यान्त\n\n'