In [27]:
from paddleocr import PaddleOCR, PPStructure, draw_ocr, save_structure_res
from PIL import Image
import numpy as np
import os
import regex
import fitz
import cv2

In [77]:
# Sample A-level result set used to try out calculate_rank_points.
# Each subject carries its name, its H-level (1 = H1, 2 = H2) and a lowercase letter grade.
subjects = [
    {'name': subject_name, 'level': h_level, 'grade': letter}
    for subject_name, h_level, letter in (
        ('general paper', 1, 'a'),
        ('economics', 1, 'a'),
        ('chemistry', 2, 'a'),
        ('physics', 2, 'a'),
        ('mathematics', 2, 'a'),
        ('project work', 1, 'a'),
    )
]
# Rank points awarded for each letter grade of an H1 subject.
# calculate_rank_points doubles these values for H2 subjects.
grade_to_h1rp = {
    'a': 10,
    'b': 8.75,
    'c': 7.5,
    'd': 6.25,
    'e': 5,
    's': 2.5,
    'u': 0
}


def calculate_rank_points(subjects):
    """Compute the total rank points for a set of A-level subjects.

    Every H1 subject contributes the ``grade_to_h1rp`` value of its grade and
    every H2 subject contributes twice that value. When exactly four H2
    subjects are present (the "4H2 2H1" combination), the weakest H2 is
    scored as an H1 instead.

    Args:
        subjects: Iterable of dicts, each with a ``'level'`` key (1 for H1,
            2 for H2; any other level is ignored) and a ``'grade'`` key
            holding a letter found in ``grade_to_h1rp``. Grade letters are
            matched case-insensitively and surrounding whitespace is ignored.
            The dicts are only read, never modified.

    Returns:
        The total rank points as a float.

    Raises:
        KeyError: If a grade is not a key of ``grade_to_h1rp``.
    """
    # Work on the extracted grade letters only, so the caller's subject dicts
    # are never touched (the downgrade of the weakest H2 stays local).
    h1_grades = []
    h2_grades = []
    for subj in subjects:
        grade = subj['grade'].strip().lower()
        if subj['level'] == 1:
            h1_grades.append(grade)
        elif subj['level'] == 2:
            h2_grades.append(grade)

    if len(h2_grades) == 4:  # 4H2 2H1 case, treat weakest H2 as a H1
        # "Weakest" is decided by rank points rather than by the alphabetical
        # order of the grade letters.
        weakest = min(h2_grades, key=grade_to_h1rp.__getitem__)
        h2_grades.remove(weakest)
        h1_grades.append(weakest)

    h1_points = sum(grade_to_h1rp[grade] for grade in h1_grades)
    h2_points = sum(grade_to_h1rp[grade] * 2 for grade in h2_grades)
    return float(h1_points + h2_points)
        
# Sanity check: three H1 and three H2 subjects, all graded 'a', give 3*10 + 3*(10*2) = 90 rank points.
calculate_rank_points(subjects)


90.0

In [12]:
# Directory that is listed to pick the input document; the path is relative
# to the notebook's working directory.
GLOBAL_INPUT_DIR = '../data/input/'
# Shared English PaddleOCR engine, reused by churn() for every OCR call.
# NOTE(review): use_angle_cls=True presumably enables the text-orientation
# classifier and show_log=False presumably silences PaddleOCR's logging --
# confirm against the PaddleOCR documentation.
GLOBAL_OCR = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
# Shared English PPStructure engine used for the certificate image.
# NOTE(review): layout=False presumably skips layout analysis so the whole
# image goes through table recognition -- confirm against the PP-Structure docs.
GLOBAL_PPSTRUCTURE = PPStructure(layout=False, lang='en')



In [58]:
# Run PPStructure on the sample A-level certificate and pull out the text that
# sits between the "grade authority" column header and the "director-general"
# signature line, i.e. the subject/grade rows.
result = GLOBAL_PPSTRUCTURE('../data/by_type/6_alvl/alvl/img/alvlcert.jpeg', return_ocr_result_in_table=True)
if not result:
    raise ValueError('PPStructure returned no regions for the certificate image.')

# The first element of every rec_res entry is joined as the recognised text;
# everything is lowercased so the pattern below can stay lowercase.
full_str = ' '.join([x[0] for x in result[0]['res']['rec_res']]).lower()

# Fuzzy match ({e<=7}: up to seven OCR errors over the whole expression);
# group 2 is the text between the two anchors.
subj_match = regex.search(r'(grade\s*authority\s*(.*)\s*director\-general){e<=7}', full_str)
if subj_match is None:
    # Without this guard a failed match surfaces as an opaque
    # "'NoneType' object has no attribute 'group'" error.
    raise ValueError(
        "Could not locate the subject list between 'grade authority' and "
        "'director-general' in the OCR text: " + repr(full_str)
    )
subj_str = subj_match.group(2)
# TODO: split subj_str on the 'cambridge' separator to get one entry per subject, e.g.
#   regex.split(r'\s*cambridge\s*', subj_str, maxsplit=0)
subj_str

'general paper h1 a cambridge chemistry h2 a cambridge biology h2 economics a cambridge h2 mathematics a cambridge "proteomics h2 a cambridge h3 dist cambridge '

In [79]:
# Pick the input document to classify. os.listdir() returns entries in
# arbitrary order, so sort them to make "the first file" reproducible across
# runs and machines, and fail with a clear message when the directory is empty.
_input_files = sorted(os.listdir(GLOBAL_INPUT_DIR))
if not _input_files:
    raise FileNotFoundError('No input files found in ' + GLOBAL_INPUT_DIR)
input_path = os.path.join(GLOBAL_INPUT_DIR, _input_files[0])

def _fuzzy_count(pattern, text):
    """Count the matches of `pattern` in `text`, tolerating one error per match ({e<=1})."""
    return len(regex.findall('(' + pattern + '){e<=1}', text))


def _best_match(patterns, text):
    """Return the key of `patterns` with the most fuzzy matches in `text`, or None if nothing matches.

    Ties are resolved in favour of the key that appears first in `patterns`.
    """
    counts = {key: _fuzzy_count(pattern, text) for key, pattern in patterns.items()}
    best = max(counts, key=counts.get)
    # max() over all-zero counts would silently return the first key, so a
    # document matching nothing must be reported as "no match" explicitly.
    return best if counts[best] > 0 else None


def churn(input_file):
    """OCR a PDF and print the type of education certificate it appears to be.

    Only the first page is considered. The lowercased OCR text is matched
    against one fuzzy pattern per document category and the category with the
    most matches wins. The printed label is one of: 'nlvl', 'olvl', 'nitec',
    'higher_nitec', 'rp', 'np', 'nyp', 'tp', 'sp', 'poly' (a polytechnic whose
    name could not be identified), 'ted', 'alvl', 'ib', 'nush', or 'unknown'
    when no category pattern matches at all.

    Args:
        input_file: Path to the PDF to classify. The extension check is
            case-insensitive.

    Returns:
        None. The label is printed.

    Raises:
        ValueError: If `input_file` does not have a '.pdf' extension.
    """
    # Reject input file if it is not a PDF
    if os.path.splitext(input_file)[-1].lower() != '.pdf':
        raise ValueError('Input file is not a PDF.')

    # OCR the entire PDF
    result = GLOBAL_OCR.ocr(input_file, cls=True)

    # Consider the first page of the PDF to classify the general type of document.
    # The general classification will be used to do a type-specific analysis which will give the desired output.
    # A missing or empty first-page result is treated as "no text".
    first_page = result[0] if result else None
    first_page_texts = [line[1][0] for line in (first_page or [])]
    merged_text = ' '.join(first_page_texts).lower()

    categories = {
        'nlvl' : r'normal\s*\(\w+\)\s*level', # for n-level
        'olvl' : r'ordinary\s*level', # for o-level
        'ite' : r'national\s*ite\s*certificate\sin', # for NITEC or Higher NITEC
        'poly' : r'polytechnic', # for poly
        'ted' : r'technical\s*engineer\s*diploma', # for TED
        'alvl' : r'advanced\s*level', # for a-level
        'ib' : r'baccalaureate', # for IB
        'nush' : r'nus\s*high' # for NUS High
    }
    category = _best_match(categories, merged_text)

    if category is None:
        print('unknown')
    elif category == 'ite':
        # Further differentiate between NITEC and H.NITEC
        if _fuzzy_count(r'higher\s*national\s*ite\s*certificate\sin', merged_text):
            # its a Higher NITEC
            print('higher_nitec')
        else:
            print('nitec')
    elif category == 'poly':
        # Further differentiate between the 5 local polys.
        # 'sp' is listed last so that it only wins when 'singapore' matches
        # more often than every other poly's name.
        polys = {
            'rp': r'republic',
            'np': r'ngee\s*ann',
            'nyp': r'nanyang',
            'tp': r'temasek',
            'sp': r'singapore'
        }
        print(_best_match(polys, merged_text) or 'poly')
    else:
        # Every remaining category is reported under its own key.
        print(category)

# Classify the selected input document; churn() prints the detected document type.
churn(input_path)


olvl
