## 2 Feb - Resume Parser

**This notebook is a scratchpad for trying out the code for my resume parser app.**

In [1]:
# Import necessary libraries
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from PyPDF2 import PdfReader 

In [2]:
# Load the spaCy pipeline 'en_core_web_md'; `nlp` is read as a global by the
# cells and functions further down
nlp = spacy.load('en_core_web_md')

In [3]:
# Add an entity ruler to the pipeline, positioned before the built-in "ner"
# component so the rule-based matches are applied first
ruler = nlp.add_pipe("entity_ruler", before="ner")

In [4]:
# Load the skill and education label patterns into the ruler from a JSONL file
# (from_disk returns the ruler itself, which is why the cell echoes its repr)
ruler.from_disk('skills_and_education.jsonl')

<spacy.pipeline.entityruler.EntityRuler at 0x25a36b44380>

In [5]:
# Open the sample resume PDF (the path is relative to the working directory)
reader = PdfReader("resume_3.pdf")

In [6]:
# Extract text
# Pull the text of every page and glue the pages together with one space
# between consecutive pages (no leading or trailing separator).
text = ' '.join(pdf_page.extract_text() for pdf_page in reader.pages)

In [7]:
# Preprocess the text
def preprocessing(sentence):
    """Normalise raw text into a lower-cased, space-separated token string.

    The text is tokenised with the global ``nlp`` pipeline. Tokens tagged as
    punctuation, whitespace or symbols are dropped, as are stop words; the
    remaining tokens are lower-cased, stripped and joined with single spaces.

    Parameters
    ----------
    sentence : str
        Raw text, e.g. the text extracted from a resume.

    Returns
    -------
    str
        The cleaned text, or an empty string when no token survives.
    """
    # Part-of-speech tags that are discarded outright.
    ignored_pos = {'PUNCT', 'SPACE', 'SYM'}
    cleaned_tokens = []

    for token in nlp(sentence):
        if token.pos_ in ignored_pos:
            continue

        lowered = token.text.lower()
        # spaCy's English stop list is lower-case, so the lower-cased form is
        # compared; checking the raw text let capitalised stop words such as
        # "The" or "Of" through, which were then lower-cased into the output.
        # STOP_WORDS is already a set, so no per-call copy is needed.
        if lowered in STOP_WORDS:
            continue

        word = lowered.strip()
        # Skip tokens that are empty after stripping so the joined result
        # never contains doubled spaces.
        if word:
            cleaned_tokens.append(word)

    return " ".join(cleaned_tokens)

# Caution: `text` is overwritten with its cleaned form, so running this cell
# again feeds the already-cleaned text back through preprocessing().
text = preprocessing(text)

In [8]:
# Extract skills and education from the text
def get_skills_education(text):
    """Collect skill and education mentions from ``text``.

    The text is run through the global ``nlp`` pipeline and its entities are
    scanned in order:

    * ``SKILL`` entities are recorded as skills.
    * ``EDUCATION_PRO`` entities are recorded as education entries as-is.
    * ``EDUCATION_OF`` followed by a ``SKILL`` becomes ``"<degree> of <skill>"``;
      if the entity after that is also a ``SKILL`` the entry is extended to
      ``"<degree> of <skill> in <skill>"``.
    * ``EDUCATION_IN`` followed by a ``SKILL`` becomes ``"<degree> in <skill>"``.

    "Followed by" means next in entity order; the entities are not checked
    for being adjacent in the text. The connecting words are re-inserted here
    (presumably because the input has had its stop words removed - confirm
    against the preprocessing step).

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(skills, education)``, each de-duplicated and kept in order of
        first appearance.
    """
    # Read the entity tuple once instead of rebuilding it on every access.
    ents = nlp(text).ents
    total = len(ents)

    def label_at(pos):
        # Label of the entity at ``pos``, or None past the end. Looking ahead
        # from the last entities must not raise IndexError.
        return ents[pos].label_ if pos < total else None

    skills = []
    education = []

    for i, ent in enumerate(ents):
        label = ent.label_

        if label == "SKILL":
            skills.append(ent.text)
        elif label == "EDUCATION_PRO":
            education.append(ent.text)
        elif label == "EDUCATION_OF" and label_at(i + 1) == "SKILL":
            entry = ent.text + ' of ' + ents[i + 1].text
            if label_at(i + 2) == "SKILL":
                entry += ' in ' + ents[i + 2].text
            education.append(entry)
        elif label == "EDUCATION_IN" and label_at(i + 1) == "SKILL":
            education.append(ent.text + ' in ' + ents[i + 1].text)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is stable across runs (a set would give an arbitrary order).
    return list(dict.fromkeys(skills)), list(dict.fromkeys(education))

skills, education = get_skills_education(text)


# Print each result list beneath a heading underlined with asterisks
for heading, found in (('Skills', skills), ('Education', education)):
    print(heading)
    print('*' * len(heading))
    print(found)

Skills
******
['business administration', 'database', 'electrical engineering', 'finance', 'data model', 'testing', 'aeronautics', 'software', 'business', 'support', 'engineering', 'python']
Education
*********
['master of business administration', 'bachelor of engineering in electrical engineering']
