### Resume Parsing

In [811]:
# !python -m spacy download en_core_web_lg


In [812]:
# !pip install PyPDF2

### Load skill data

In [859]:
import re
import fitz
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher


##### Text Extraction 
Extracts text from a PDF resume file using PyMuPDF

In [860]:
text = ''
doc = fitz.open('data/sample.pdf') # open a document
for page in doc: # iterate the document pages
  text += ' '+page.get_text() # get plain text encoded as UTF-8



In [861]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

##### Preprocessing

Performs text preprocessing on the spaCy Doc object, including lemmatization, lowercasing and removal of stopwords, punctuation, symbols, and spaces.

In [876]:
def preprocessing(doc):
    stopwords    = list(STOP_WORDS)
    clean_tokens = []
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                processed_text = re.sub(
                    '[^a-zA-Z0-9@( )\s-]',
                    " ",
                token.lemma_)
                processed_text = processed_text.lower().strip()
                clean_tokens.append(processed_text)
    return " ".join(clean_tokens)
        
        

In [877]:
processed_text = preprocessing(doc)
print(processed_text)

copyright @template net strong experience work web design software include dreamweaver indesign flash etc familiarity health system health care organization familiarity quarkxpress indesign photoshop illustrator flash powerpoint word excel outlook outstanding knowledge graphic design computer software exceptional knowledge adobe acrobat illustrator photoshop indesign program mac platform profound knowledge website design animation experience web base advertising sound knowledge social networking channel proficient understanding branding work witgraphicstandards style sheet develop right product client extreme ability plan design create electronic artwork variety project job concept completion kathy j hughe graphic design artist contact professional skill career objective 3929 worley avenue lynchburg va 24501 434)-426 9987 kathy@jklmail com kathy com strong experience work web design software include dreamweaver indesign flash etc exceptional knowledge adobe acrobat illustrator photosho

##### Email Extraction

Extracts the email address from the spaCy token with the help of **like_email** method of spacy. 

In [878]:
def extract_email(doc):
    for token in doc:
        if token.like_email:
            return token.text
        else:
            email = None
    return email

extract_email(doc)

##### Phone Number Extraction

Extracts the all types of phone number from the provided text using a regular expression.

In [879]:
def extract_phone_number(text):
    phone_number_pattern = re.compile(
        r'(?:(?:\+\d{1,2}\s?)?[\(\[\{]?\d{3}[\)\]\}]?[\s\-]?\d{3}[\s\-]?\d{4})|'
        r'(?:\d{3}[\s\-]?\d{3}[\s\-]?\d{4})|'
        r'(?:\(\d{3}\)\s?\d{3}[\s\-]?\d{4})'
    )
    matches = re.findall(phone_number_pattern, text)
    if matches:
        return matches[0]
    else:
        return None

In [880]:
extract_phone_number(processed_text)

'434)-426 9987'

##### Skill Extraction

Extracts skills from the text using an spacy's entity ruler. With the help of **skills.jsonl** file ruler is added to spacy object
Also preprocessing is performed.


In [881]:
def extract_skills(text):
    skill_path = 'skills.jsonl'
    try:
        ruler = nlp.add_pipe("entity_ruler")
        ruler.from_disk(skill_path)
    except ValueError:
        pass
    try:
        doc = nlp(text)
        skills = list(doc.ents)
        gpe_entities = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
        gpe_entities = [gpe.lower() for gpe in gpe_entities]
        skills = [sk.text for sk in skills if sk.text.lower() not in gpe_entities]
        skills = list(set(skills))
        if skills:
            pattern2 = "(\d{1})"
            skills =  [sk for sk in skills if len(re.findall(pattern2, sk)) < 2]
            return skills
        else:
            return []
    except:
        return []
 

In [882]:
extract_skills(processed_text)



['lynchburg va', 'mac platform familiarity']

##### Address Extraction

Extracts the address from the text with the help of spacy ner **GPE**.


In [869]:
def extract_address(text):
    doc = nlp(text)
    address = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    if address:
        return ' '.join(list(set(address[:3])))
    else:
        return None
    
extract_address(processed_text)



##### Certification Extraction

Extracts certifications and their details from the provided text using a Matcher. Also date filter is applied to get the certification data and also to slice the title text.

In [883]:
def extract_certifications(text):
    matcher = Matcher(nlp.vocab)
    #pattern
    certification_pattern = [
        {"LOWER": {"in": ["certification", "certificate", "course", "certifications"]}}, 
        {"ENT_TYPE": {"in": ["ORG", "PRODUCT"]}, "OP": "?"}, 
        {"ENT_TYPE": "DATE", "OP": "?"},
    ]
    matcher.add("CertificationPattern", [certification_pattern])
    doc = nlp(text)
    matches = matcher(doc)
    matched_spans = [doc[start:] for match_id, start, end in matches]
    print(matched_spans)
    certificates = []
    if matched_spans:
        for span in matched_spans:
            # print(span.text)
            cert_details = {}
            matched_cert = span.text
            doc = nlp(matched_cert)
            for ent in doc.ents:
                if ent.label_ == 'DATE':
                    date_span = span.text.find(ent.text)
                    cert_details['title'] = span.text[9:date_span]
                    cert_details['date'] = ent.text
                    certificates.append(cert_details)
                    break
                else:
                    cert_details['title'] = span.text[:5]
                    cert_details['date'] = None
        return certificates
    else:
        return certificates
    
extract_certifications(processed_text)

[]




[]

In [884]:
from spacy import displacy
colors = {"DATE": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"colors": colors}
doc = nlp(processed_text)
displacy.render(doc, style='ent', options=options)



### Putting it all together

In [885]:
class ResumeParser:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")

    def extract_pdf(self):
        resume_text = ''
        doc = fitz.open('data/sample.pdf')
        for page in doc:
            resume_text += ' '+page.get_text()
        return resume_text
    
    def preprocessing(self, doc):
        stopwords    = list(STOP_WORDS)
        clean_tokens = []
        for token in doc:
            if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
                token.pos_ != 'SPACE':
                    processed_text = re.sub(
                        '[^a-zA-Z0-9@( )\s-]',
                        " ",
                    token.lemma_)
                    processed_text = processed_text.lower().strip()
                    clean_tokens.append(processed_text)
        return " ".join(clean_tokens)
    
    def extract_email(self, doc):
        for token in doc:
            if token.like_email:
                return token.text
            else:
                email = None
        return email
    
    def extract_phone_number(self, text):
        phone_number_pattern = re.compile(
            r'(?:(?:\+\d{1,2}\s?)?[\(\[\{]?\d{3}[\)\]\}]?[\s\-]?\d{3}[\s\-]?\d{4})|'
            r'(?:\d{3}[\s\-]?\d{3}[\s\-]?\d{4})|'
            r'(?:\(\d{3}\)\s?\d{3}[\s\-]?\d{4})'
        )
        matches = re.findall(phone_number_pattern, text)
        if matches:
            return matches[0]
        else:
            return None
    
    def extract_address(self, doc):
        # doc = self.nlp(text)
        address = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
        if address:
            return ' '.join(list(set(address[:3])))
        else:
            return None
    
    def extract_skills(self, text):
        skill_path = 'skills.jsonl'
        try:
            ruler = self.nlp.add_pipe("entity_ruler")
            ruler.from_disk(skill_path)
        except ValueError:
            pass
        try:
            doc = self.nlp(text)
            skills = list(doc.ents)
            gpe_entities = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
            gpe_entities = [gpe.lower() for gpe in gpe_entities]
            skills = [sk.text for sk in skills if sk.text.lower() not in gpe_entities]
            skills = list(set(skills))
            if skills:
                pattern2 = "(\d{1})"
                skills =  [sk for sk in skills if len(re.findall(pattern2, sk)) < 2]
                return [sk for sk in skills if len(sk) > 2]
                # print(n_skills)
                # return [sk for sk in skills if len([sk in string.digits]) > 1 == True]
            else:
                return []
        except:
            return []
        
    def extract_certifications(self, text):
        matcher = Matcher(self.nlp.vocab)
        #pattern
        certification_pattern = [
            {"LOWER": {"in": ["certification", "certificate", "course", "certifications"]}}, 
            {"ENT_TYPE": {"in": ["ORG", "PRODUCT"]}, "OP": "?"}, 
            {"ENT_TYPE": "DATE", "OP": "?"},
        ]
        matcher.add("CertificationPattern", [certification_pattern])
        doc = nlp(text)
        matches = matcher(doc)
        matched_spans = [doc[start:] for match_id, start, end in matches]
        certificates = []
        if matched_spans:
            for span in matched_spans:
                # print(span.text)
                cert_details = {}
                matched_cert = span.text
                doc = nlp(matched_cert)
                for ent in doc.ents:
                    if ent.label_ == 'DATE':
                        date_span = span.text.find(ent.text)
                        cert_details['title'] = span.text[:date_span]
                        cert_details['date'] = ent.text
                        certificates.append(cert_details)
                        break
                    else:
                        cert_details['title'] = span.text[:5]
                        cert_details['date'] = None
            return certificates
        else:
            return certificates

    def parse(self):
        resume_details = {}
        resume_text = self.extract_pdf()
        doc = self.nlp(resume_text)
        processed_text = self.preprocessing(doc)
        resume_details['email'] = self.extract_email(doc) 
        resume_details['phone_number'] = self.extract_phone_number(processed_text)
        resume_details['address'] = self.extract_address(doc)
        resume_details['skills'] = self.extract_skills(processed_text)
        resume_details['certifications'] = self.extract_certifications(processed_text)
        return  resume_details
        


In [886]:
parser = ResumeParser()
print(parser.parse())

{'email': 'kathy@jklmail.com', 'phone_number': '434)-426 9987', 'address': 'Profound Lynchburg', 'skills': ['lynchburg va', 'mac platform', 'mac platform familiarity'], 'certifications': []}


