## 2 Feb - Resume Parser

**This notebook runs trial versions of my code for the resume parser app.**

In [1]:
# Import necessary libraries
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from PyPDF2 import PdfReader 

In [2]:
# Load the spaCy English pipeline 'en_core_web_md'; later cells share this
# `nlp` object. The model package has to be installed beforehand
# (e.g. `python -m spacy download en_core_web_md`).
nlp = spacy.load('en_core_web_md')

In [3]:
# Add an entity ruler to the pipeline, placed before the "ner" component.
# Guarded so this cell can be re-run on a live kernel: add_pipe raises when a
# component named "entity_ruler" is already in the pipeline, so in that case
# the existing component is reused instead.
if nlp.has_pipe("entity_ruler"):
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler", before="ner")

In [16]:
# Load the skills and education labels into the ruler from a JSONL file.
# The call is the cell's last expression, so its return value (the
# EntityRuler) is echoed as the cell output.
ruler.from_disk('skills_and_education.jsonl')

<spacy.pipeline.entityruler.EntityRuler at 0x1eae1f05c40>

In [34]:
# Open the sample resume PDF. The path is relative, so the file is looked up
# in the kernel's current working directory.
reader = PdfReader("resume_5.pdf")

In [35]:
# Extract text: collect the text of every page, then join the pages with a
# single space between consecutive pages (empty string when there are no pages)
page_texts = [pdf_page.extract_text() for pdf_page in reader.pages]
text = ' '.join(page_texts)

In [36]:
# Preprocess the text
def preprocessing(sentence):
    """Return a cleaned, lower-cased version of ``sentence``.

    The text is tokenised with the notebook-level ``nlp`` pipeline. Tokens
    tagged as punctuation, whitespace or symbols are dropped, as are stop
    words. The remaining tokens are lower-cased, stripped and joined with
    single spaces.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        Space-separated cleaned tokens; an empty string if nothing survives.
    """
    skipped_pos = {'PUNCT', 'SPACE', 'SYM'}
    doc = nlp(sentence)
    cleaned_tokens = []

    for token in doc:
        if token.pos_ in skipped_pos:
            continue

        word = token.text.lower().strip()

        # Compare the lower-cased form: the stop-word list is lower-case, so
        # checking the raw token text would let "The" / "And" slip through.
        # Also skip tokens that are empty once stripped.
        if word and word not in STOP_WORDS:
            cleaned_tokens.append(word)

    return " ".join(cleaned_tokens)

# Clean the extracted text. NOTE: `text` is rebound to the cleaned string, so
# re-running this cell on its own preprocesses the already-cleaned text again.
text = preprocessing(text)

In [37]:
# Extract skills and education from the text
def get_skills_education(text):
    """Collect skill and education mentions from ``text``.

    The text is run through the notebook-level ``nlp`` pipeline and its
    entities are scanned in order:

    * ``SKILL`` entities are collected as skills.
    * ``EDUCATION_PRO`` entities are collected as education entries as-is.
    * An ``EDUCATION_OF`` / ``EDUCATION_IN`` entity immediately followed (in
      entity order) by a ``SKILL`` entity yields ``"<degree> of <skill>"`` /
      ``"<degree> in <skill>"``.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(skills, education)``. ``skills`` holds each distinct skill once, in
        order of first appearance; ``education`` keeps every match in order.
    """
    doc = nlp(text)
    ents = list(doc.ents)
    skills = []
    education = []

    # Label of a degree prefix -> connector placed between degree and field.
    connectors = {"EDUCATION_OF": " of ", "EDUCATION_IN": " in "}

    for i, ent in enumerate(ents):
        # The last entity has no successor. Use None rather than leaving the
        # name unbound (single-entity documents used to crash here) or stale.
        next_ent = ents[i + 1] if i + 1 < len(ents) else None

        if ent.label_ == "SKILL":
            skills.append(ent.text)
        elif ent.label_ == "EDUCATION_PRO":
            education.append(ent.text)
        elif (
            ent.label_ in connectors
            and next_ent is not None
            and next_ent.label_ == "SKILL"
        ):
            education.append(ent.text + connectors[ent.label_] + next_ent.text)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic from run to run (a set gives arbitrary order).
    return list(dict.fromkeys(skills)), education

skills, education = get_skills_education(text)

# Print each result list under its heading, underlined with one asterisk per
# heading character
for heading, found in (('Skills', skills), ('Education', education)):
    print(heading)
    print('*' * len(heading))
    print(found)

Skills
******
['testing', 'computability', 'sorting', 'engineering', 'communications', 'theoretical computer science', 'cryptography', 'kong', 'inference', 'mobile', 'correctness', 'oracle', 'recursion', 'compass', 'electrical engineering', 'languages', 'algorithms', 'computer science', 'security', 'distributed computing', 'software', 'computation', 'computational complexity theory', 'business', 'computer security', 'markov chain', 'ai']
Education
*********
['b.s in electrical engineering', 'm.s in electrical engineering']


In [38]:
# Render the entities found in the preprocessed text using displaCy's 'ent' style.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from spacy import displacy
doc = nlp(text)  # parse the preprocessed text again to get a Doc to render
displacy.render(doc, style='ent')