# 0. Import Libraries

In [1]:
import re
from PyPDF2 import PdfReader
from spacy import displacy
import spacy

# 1. Load spacy

In [2]:
nlp = spacy.load('en_core_web_md')

## Entity ruler

In [3]:
ruler = nlp.add_pipe('entity_ruler', before='ner')

## Skills/Certificate/Degrees

In [4]:
import jsonlines

degree_path = 'app/data/degrees.jsonl'
skill_path = 'app/data/skills.jsonl'
certificate_path = 'app/data/certificate.jsonl'

# Load patterns from degrees.jsonl
patterns = []
# Read the JSONL file and load patterns
with jsonlines.open(degree_path) as reader:
    for line in reader:
        patterns.append(line)

# Read the JSONL file and load patterns
with jsonlines.open(skill_path) as reader:
    for line in reader:
        patterns.append(line)

# Read the JSONL file and load patterns
with jsonlines.open(certificate_path) as reader:
    for line in reader:
        patterns.append(line)

# Add combined patterns to the EntityRuler
ruler.add_patterns(patterns)


## Names

In [5]:
# Define name patterns
patterns = [{"label": 'NAME', 
            "pattern": [{'POS': 'PROPN'}, {'POS': 'PROPN'}]},  # First name and Last name
            {"label": 'NAME', 
            "pattern":[{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]},  # First name, Middle name, and Last name
            {"label": 'NAME', 
            "pattern":[{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]}  # First name, Middle name, Middle name, and Last name
]

ruler.add_patterns(patterns)

## Email

In [6]:
# Emails contain one or more letters/symbols : [a-zA-Z0-9._%+-]+
# before the '@', must include the '@' : @
# the domain name: [a-zA-Z0-9.-]+
# the mandatory ' . ' : \\.
# and then two or letters indicating the extension (eg. .com, .eu) : [a-zA-Z]{2,}

patterns = [{"label": 'EMAIL', 
            "pattern": [{"TEXT": {"REGEX": "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"}}]}]

ruler.add_patterns(patterns)

## Websites

In [7]:
patterns = [{"label": 'WEBSITE', 
            "pattern": [{"TEXT": {"REGEX": "^(http://)|(https://)"}}]}]

ruler.add_patterns(patterns)

## Mobile numbers

In [8]:
patterns = [{"label": "MOBILE", 
            "pattern": [{"TEXT": {"REGEX": "((\d){7})"}}]}]

ruler.add_patterns(patterns)

## Company

In [9]:
# Match any that contains company, corporation, inc, or ltd
patterns = [{"label": "COMPANY", "pattern": [
        {"IS_ALPHA": True, "OP": "*"},  
        {"LOWER": {"IN": ["company", "corporation", "inc", "ltd"]}},  
        {"IS_ALPHA": True, "OP": "?"},  
    ]}]

ruler.add_patterns(patterns)

## 3. Extract functions

In [10]:
class ResumeInfo():
    def __init__(self):
        self.name = None
        self.email = None
        self.mobile = None
        self.websites = []
        self.education = []
        self.degree = []
        self.experience = []
        self.skills = []

In [11]:
#clean our data
from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):
    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                clean_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(clean_tokens)

In [12]:
def extract_info(filepath, display=False):
    reader = PdfReader(filepath)
    page = reader.pages[0]
    text = page.extract_text()
    doc = nlp(preprocessing(text))
    flag1 = True
    flag2 = True
    flag3 = True
    education_re = r"(education|university|institute|college)\b" # Find educational organizations
    person =  ResumeInfo()

    for i, ent in enumerate(doc.ents):
        match ent.label_:
            case 'NAME':
                if flag1:
                    person.name = ent.text
                    flag1 = False
            case 'EMAIL':
                if flag2:
                    person.email = ent.text
            case 'ORG':
                if re.search(education_re,ent.text):
                # print(ent.text)
                    edu = {"institute" : ' '.join(ent.text.strip().split())}
                    if doc.ents[i+1].label_ == 'GPE':
                        edu["location"] = doc.ents[i+1].text
                    person.education.append(edu)
            case 'SKILL':
                person.skills.append(ent.text)
            case 'WEBSITE':
                person.websites.append(ent.text)
            case 'MOBILE':
                if flag3:
                    person.mobile = ent.text
                    flag3 = False
            case 'COMPANY':
                person.experience.append(ent.text)
            case 'DEGREE':
                person.degree.append(ent.text)
    if display:
        colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
        options = {"colors": colors}

        displacy.render(doc, style='ent', options=options)
    return person

In [13]:
def extract_attributes(person):
    for key, value in person.__dict__.items():
        print(f"{key}: {value}")

# 2. Testing

In [14]:
person1 = extract_info("app/data/sample1.pdf", True)

In [15]:
person2 = extract_info("app/data/chaklam_resume.pdf", True)

In [16]:
extract_attributes(person2)

name: chaklam silpasuwanchai
email: chaklam@ait.asia
mobile: https://doi.org/10.1145/3359593
websites: ['http://chaklam.com', 'http://github.com/chaklam-silpasuwanchai']
education: [{'institute': '•asian institute'}, {'institute': '•asian institute'}]
degree: []
experience: []
skills: ['email', 'mobile', 'aws', 'azure', 'natural language processing', 'deep learning', 'software engineering', 'architecture', 'human computer interaction', 'projects', 'ai', 'legal', 'ai', 'eeg', 'blood glucose', 'raman spectroscopy', 'raman spectroscopy', 'blood glucose', 'deep learning', 'image analysis', 'working experience', 'engineering', 'technology', 'it', 'business technology', '2017', 'statistics', 'technology', '2017', 'technology', 'engineering', 'computer science', '2017', 'engineering', 'computer science', '2009', '2011', 'international', 'computer science', '2004', '5', 'publications', '11', '1', 'masking', 'social media', 'computational linguistics', '2', 'equation', 'uncertainty quantificati

In [17]:
person3 = extract_info('app/data/sample2.pdf')
extract_attributes(person3)

name: dfas europe
email: None
mobile: None
websites: []
education: []
degree: []
experience: ['m duplicate obligation experience company name', 'accounting adjustment ensure deams fiscal year end requirement complete these action recognize critical successful review report issue air force operational test evaluation center afotec result air force receive authority continue deployment deams company name']
skills: ['financial planning', 'reporting', 'analysis', 'defense', 'account reconciliation', 'financial reporting', 'critical thinking', 'accounting', 'analysis', 'system', 'erp', 'enterprise resource planning', 'software', 'resolve', 'general ledger', 'accounting', 'collaboration', '5', '2011', 'enterprise resource planning', 'office', 'defense', 'enterprise', 'accounting management', 'system', 'resolve', 'general ledger', 'teammate', 'pay', 'cash', 'budget', 'resolve', 'customer', 'system', 'testing', 'system', 'functional management', 'office', 'system', 'addition', 'design', 'resol

In [18]:
person4 = extract_info('app/data/sample3.pdf')
extract_attributes(person4)

name: hayden smith
email: haydensmith@email.com
mobile: None
websites: []
education: []
degree: []
experience: []
skills: ['501', '1', 'tip', 'email', 'live', 'r', 'ear', '11', 'service work', 'retail', 'soccer', 'contribute', 'communication', 'tip', 'availability', '8', 'tip', 'availability', 'skills', 'customer service', 'soccer', 'numeracy', 'cash handling', 'communication', 'supervisor', 'soccer', 'coaching', 'tip', '5', 'bullet', 'word', 'volunteering', 'sporting', '11', 'math', 'recreation', 'tip', 'professional development', 'training', 'form', 'accreditation', 'training']
