<a href="https://colab.research.google.com/github/parthasai2512/NOLA-AI-Internship-Programming-Challenge/blob/main/NOLA_AI_Programming_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NOLA-AI Internship Programming Challenge

Required libraries to be installed before running the code:

In [1]:
!pip install easyocr torchvision keras-ocr paddleocr paddlepaddle



In [2]:
# Necessary libraries to be imported:

import re, cv2, os
from datetime import datetime
import matplotlib.pyplot as plt
from paddleocr import PaddleOCR
# Purpose of the libraries imported above:
#   re - regular-expression operations, used to find text patterns.
#   cv2 - image processing: reading images, converting them to grayscale, reducing noise,
#         thresholding to binarize the image, and saving the processed images.
#   os - interacting with the operating system, e.g. file-path operations.
#   datetime - date and time operations.
#   matplotlib.pyplot - displaying the uploaded images (for validation purposes only;
#         completely optional).
#   PaddleOCR - the required package for optical character recognition, used to extract
#         the text from the uploaded images.

# The code below has a single class ("DateProcessor") with several methods ("_clean_text", "_find_dates", etc.) plus helper
# functions that process an image and extract text from it; all of them are driven from one "main" function:
class DateProcessor:
    """Pull an expiration date and a holder name out of OCR'd license text.

    The instance keeps the raw recognized text plus two parallel tables:
    ``date_patterns`` (regexes used to locate date-looking substrings) and
    ``date_formats`` (``strptime`` formats tried, in order, on each hit).
    """

    def __init__(self, text):
        """Store the recognized ``text`` and set up the date tables."""
        self.text = text
        # Regexes for the date shapes that are searched for in the text:
        self.date_patterns = [
            r"\b\d{2}/\d{2}/\d{4}\b",
            r"\b\d{2}-\d{2}-\d{4}\b",
            r"\b\d{4}/\d{2}/\d{2}\b",
            r"\b\d{4}-\d{2}-\d{2}\b",
            r"\b\d{2}/\d{2}/\d{2}\b",
            r"\b\d{2}-\d{2}-\d{2}\b",
            r"\b\d{2}[/-]\d{2}[/-]\d{2,4}\b",
        ]
        # strptime formats tried in this order; the first that parses wins:
        self.date_formats = [
            "%m/%d/%Y",
            "%m-%d-%Y",
            "%Y/%m/%d",
            "%Y-%m-%d",
            "%m/%d/%y",
            "%m-%d-%y",
        ]

    def _clean_text(self):
        """Return the text with every character except digits, '/' and '-' blanked out."""
        return re.sub(r'[^0-9/-]', ' ', self.text)

    def _find_dates(self):
        """Return all substrings of the cleaned text that match any date pattern."""
        combined_pattern = '|'.join(self.date_patterns)
        return re.findall(combined_pattern, self._clean_text())

    def _parse_dates(self, dates):
        """Convert date strings to ``datetime`` objects.

        Each string is tried against ``date_formats`` in order; strings that
        match no format are dropped silently.
        """
        date_objects = []
        for date in dates:
            for fmt in self.date_formats:
                try:
                    parsed_date = datetime.strptime(date, fmt)
                except ValueError:
                    continue
                # "%y" is already expanded to a four-digit year by strptime,
                # so this only triggers for "%Y" text with a year below 100
                # (e.g. "01/01/0016").
                if parsed_date.year < 100:
                    parsed_date = self._handle_two_digit_year(parsed_date)
                date_objects.append(parsed_date)
                break
        return date_objects

    def _handle_two_digit_year(self, date):
        """Move a year below 100 into the 2000s or 1900s.

        Years up to and including the current two-digit year go to 20xx,
        later ones to 19xx.
        """
        current_year = datetime.now().year % 100
        century = 2000 if date.year <= current_year else 1900
        return date.replace(year=date.year + century)

    def process(self):
        """Find the latest date in the text and classify it.

        Returns:
            ``(None, "No valid dates found")`` when nothing parses; otherwise
            ``(date formatted as MM/DD/YYYY, status)`` where status is
            ``"Warning: Expired"`` or ``"Accepted"``. The latest date found is
            treated as the expiration date.
        """
        date_objects = self._parse_dates(self._find_dates())
        if not date_objects:
            return None, "No valid dates found"

        latest_date = max(date_objects)
        formatted = latest_date.strftime("%m/%d/%Y")

        # Compare calendar dates, not timestamps: the parsed value sits at
        # midnight, so comparing it with "now" would flag a license that
        # expires today as already expired.
        if latest_date.date() < datetime.now().date():
            return formatted, "Warning: Expired"
        return formatted, "Accepted"

    def extract_names_by_patterns(self):
        """Return a dict with 'first_name' and 'last_name' taken from the text.

        The word following "1<whitespace>" is used as the first name and the
        word following "2<whitespace>" as the last name; a missing match is
        reported as 'Not Found'.
        """
        name_patterns = {
            'first_name': r'1\s(\w+)',
            'last_name': r'2\s(\w+)'
        }

        names = {}
        for key, pattern in name_patterns.items():
            match = re.search(pattern, self.text)
            names[key] = match.group(1) if match else 'Not Found'
        return names

    def extract_names_by_special_cases(self):
        """Return the holder's name as a single title-cased string.

        Several layouts are tried in order and the first that matches wins;
        when none matches the result is the placeholder 'License Sample'.
        """
        # Layout "1.<first> 2.<last>" with both fields separated by whitespace.
        match_10 = re.search(r'1\.(\w+)\s+2\.(\w+)', self.text)
        if match_10:
            first_name = match_10.group(1).title()
            last_name = match_10.group(2).title()
            return f"{first_name} {last_name}"

        # Layout "1.<last>" on one line with the first name on the next line.
        match_7689 = re.search(r'1\.(\S+)\n(\S+)', self.text)
        if match_7689:
            last_name = match_7689.group(1).title()
            first_name = match_7689.group(2).title()
            return f"{first_name} {last_name}"

        # Layout "<last>,<first>" made of letters only around a comma.
        special_case_10389 = re.search(r'\b([A-Za-z]+)\b,\b([A-Za-z]+)\b', self.text)
        if special_case_10389:
            last_name = special_case_10389.group(1).title()
            first_name = special_case_10389.group(2).title()
            return f"{first_name} {last_name}"

        # Layout with a line starting "1 <last>" followed by "2 <first>".
        # Only the first occurrence is used.
        labelled = re.search(r"^1\s*([A-Za-z\s]+)\s*2\s*([A-Za-z\s]+)", self.text, re.MULTILINE)
        if labelled:
            last_name = labelled.group(1).strip().title()
            first_name = labelled.group(2).strip().title()
            return f"{first_name} {last_name}"

        # None of the known layouts matched: fall back to the placeholder.
        return 'License Sample'


def preprocess_image(img_path, output_dir="/content/drive/MyDrive/NolaImages/processed_images"):
    """Binarize an image with OpenCV and save the result for text recognition.

    The image is converted to grayscale, smoothed with a 5x5 Gaussian blur
    and thresholded at 150 before being written as
    ``processed_<original stem>.jpeg`` inside ``output_dir``.

    Args:
        img_path: Path of the image to read.
        output_dir: Directory for the processed image. Defaults to the Colab
            Drive folder used by this notebook; change it to suit your
            environment. It is created if it does not exist.

    Returns:
        The path of the saved, preprocessed image.

    Raises:
        FileNotFoundError: If the image cannot be read.
        OSError: If the processed image cannot be written.
    """
    img = cv2.imread(img_path)
    # cv2.imread signals a missing/unreadable file by returning None rather
    # than raising, which would otherwise surface as an obscure cvtColor error.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")

    # image to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Gaussian blur to reduce noise; a (5, 5) kernel was found to work well.
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # thresholding
    _, thresh = cv2.threshold(blurred, 150, 255, cv2.THRESH_BINARY)

    os.makedirs(output_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(img_path))[0]
    preprocessed_img_path = os.path.join(output_dir, "processed_" + stem + ".jpeg")

    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(preprocessed_img_path, thresh):
        raise OSError(f"Could not write preprocessed image: {preprocessed_img_path}")

    return preprocessed_img_path

def recognize_text_from_image(img_path):
    """Run PaddleOCR on an image and return the recognized text.

    Args:
        img_path: Path of the image to recognize.

    Returns:
        The recognized text lines joined with newlines, or an empty string
        when no text was detected.
    """
    # Building the PaddleOCR model is expensive, so it is created on the
    # first call and reused for every later image.
    ocr = getattr(recognize_text_from_image, "_ocr", None)
    if ocr is None:
        ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
        recognize_text_from_image._ocr = ocr

    result = ocr.ocr(img_path)
    # The per-image entry can be empty/None when nothing is detected;
    # treat that as "no text" instead of failing while iterating.
    if not result or not result[0]:
        return ""

    # Each detected line is (box, (text, confidence)); keep only the text.
    return "\n".join(line[1][0] for line in result[0])

# the main function:
def main(image_paths):
    """Print the holder name, expiration date and status for each image.

    For every path the text is recognized, the latest date is extracted and
    classified, and the holder name is looked up. The "Expires" line is only
    printed when a date was found; the status line is always printed.
    """
    for img_path in image_paths:
        print(f"Run with: {img_path}")

        processor = DateProcessor(recognize_text_from_image(img_path))
        expiry, verdict = processor.process()
        holder = processor.extract_names_by_special_cases()

        print(f"\t{holder}")
        if expiry:
            print(f"\tExpires: {expiry}")
        # The status is reported whether or not a date was found.
        print(f"\t{verdict}\n")

    # The below code of lines is for displaying the images which are uploaded for validation purpose, the NOLA AI users can
    # uncomment this lines to check the validation in the same screen:
        # Display the image
        # img = cv2.imread(img_path)
        # img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # plt.imshow(img_rgb)
        # plt.axis('off')  # Hide axis
        # plt.show()

# Script entry point: run the challenge over the sample license images.
if __name__ == "__main__":
    # Paths point at a mounted Google Drive folder in Colab; adjust them to
    # match your own environment.
    image_paths = [
        "/content/drive/MyDrive/NolaImages/img1.png",
        "/content/drive/MyDrive/NolaImages/img2.jpg",
        "/content/drive/MyDrive/NolaImages/dl_12.jpeg",
        "/content/drive/MyDrive/NolaImages/dl_21.jpg",
        "/content/drive/MyDrive/NolaImages/dl_234.png",
        "/content/drive/MyDrive/NolaImages/dl_993.png",
        "/content/drive/MyDrive/NolaImages/dl_13.jpeg",
        # Add more image paths here to test further scenarios of the
        # NOLA-AI Internship Programming Challenge.
    ]

    main(image_paths)


Run with: /content/drive/MyDrive/NolaImages/img1.png
	John Sample
	Expires: 02/15/2027
	Accepted

Run with: /content/drive/MyDrive/NolaImages/img2.jpg
	License Sample
	Expires: 10/01/2016

Run with: /content/drive/MyDrive/NolaImages/dl_12.jpeg
	License Sample
	Expires: 12/30/2016

Run with: /content/drive/MyDrive/NolaImages/dl_21.jpg
	Marie Michelle
	Expires: 10/31/2029
	Accepted

Run with: /content/drive/MyDrive/NolaImages/dl_234.png
	Jane Q Public
	Expires: 11/14/2023

Run with: /content/drive/MyDrive/NolaImages/dl_993.png
	John Lewis Sample
	Expires: 10/27/2028
	Accepted

Run with: /content/drive/MyDrive/NolaImages/dl_13.jpeg
	License Sample
	Expires: 10/17/2026
	Accepted

