In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from PyPDF2 import PdfReader
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

**Define the resume parser (loads the spaCy model)**

In [None]:
class ResumeParser(object):
    """Small helper around a spaCy pipeline for exploring resume text.

    It offers text clean-up, named-entity inspection (visualise, print, count)
    and a simple keyword search for a configurable list of skills.

    Args:
        model: Name of the spaCy pipeline passed to ``spacy.load``.
        personal_skills: Iterable of skill names/phrases to look for. When
            omitted, ``DEFAULT_PERSONAL_SKILLS`` is used.
    """

    # Skills searched for when the caller does not supply a list.
    DEFAULT_PERSONAL_SKILLS = (
        "communication", "teamwork", "leadership", "problem-solving",
        "attention to detail", "time management", "accountant", "erp",
        "critical thinking",
    )

    def __init__(self, model='en_core_web_md', personal_skills=None):
        self.nlp = spacy.load(model)
        if personal_skills is None:
            personal_skills = self.DEFAULT_PERSONAL_SKILLS
        # Stored as a fresh list so callers can mutate it without touching the defaults.
        self.personal_skills = list(personal_skills)

    @staticmethod
    def _normalise(skill):
        """Return a case/whitespace-insensitive key for a skill (non-strings pass through)."""
        return skill.strip().lower() if isinstance(skill, str) else skill

    def preprocessing(self, sentence):
        """Return ``sentence`` as a space-joined string of lower-cased lemmas.

        Stop words, punctuation, symbols and whitespace tokens are dropped.
        The stop-word check is done on the lower-cased token so capitalised
        stop words (e.g. at the start of a sentence) are removed as well.
        """
        skipped_pos = {'PUNCT', 'SYM', 'SPACE'}
        clean_tokens = []

        for token in self.nlp(sentence):
            if token.lower_ in STOP_WORDS or token.pos_ in skipped_pos:
                continue
            lemma = token.lemma_.lower().strip()
            if lemma:  # skip lemmas that are empty after stripping
                clean_tokens.append(lemma)

        return " ".join(clean_tokens)

    def NER_Visualization(self, text):
        """Run the pipeline on ``text`` and hand the entities to displaCy for notebook rendering."""
        doc = self.nlp(text)
        displacy.render(doc, style="ent", jupyter=True)

    def entity_count(self, text):
        """Return a dict mapping each entity label found in ``text`` to its number of occurrences."""
        doc = self.nlp(text)
        entity_counts = {}

        for ent in doc.ents:
            entity_counts[ent.label_] = entity_counts.get(ent.label_, 0) + 1

        return entity_counts

    def extract_relevant_entity(self, doc):
        """Print every entity found in the text ``doc`` as ``<text> <label>``, one per line.

        Whitespace inside an entity is collapsed to single spaces so that line
        breaks carried over from PDF extraction do not split one entity across
        several output lines.
        """
        parsed = self.nlp(doc)
        for ent in parsed.ents:
            print(" ".join(ent.text.split()), ent.label_)

    def identify_personal_skills(self, text):
        """Return the set of configured skills mentioned in ``text`` (lower-cased).

        Matching is case-insensitive and phrase-based, so multi-word skills
        such as "attention to detail" are found. Words of a skill may be
        separated by any run of whitespace or hyphens ("problem solving"
        matches "problem-solving"), and a match must not be embedded in a
        longer word. The search works on the raw text and does not need the
        spaCy pipeline.
        """
        found_skills = set()
        if not text:
            return found_skills

        for skill in self.personal_skills:
            parts = [part for part in re.split(r'[\s\-]+', skill.strip().lower()) if part]
            if not parts:
                continue
            pattern = r'(?<!\w)' + r'[\s\-]+'.join(re.escape(part) for part in parts) + r'(?!\w)'
            if re.search(pattern, text, flags=re.IGNORECASE):
                found_skills.add(skill.strip().lower())

        return found_skills

    def skill_coverage(self, skillSet):
        """Return the percentage (0-100) of configured skills present in ``skillSet``.

        Duplicates in ``skillSet`` are counted once, comparison ignores case
        and surrounding whitespace, and ``0.0`` is returned when no skills are
        configured (instead of dividing by zero).
        """
        known = {self._normalise(skill) for skill in self.personal_skills}
        if not known:
            return 0.0
        matched = {self._normalise(item) for item in skillSet} & known
        return len(matched) / len(known) * 100

    def evaluate(self, skillSet):
        """Print the skill coverage of ``skillSet`` as ``'<percentage> %'``; returns ``None``."""
        print(f'{self.skill_coverage(skillSet)} %')


**Uploading PDF File**

An Example of a Resume

In [None]:
# Root folder of the resume dataset; change this single constant when the data lives elsewhere.
DATA_DIR = '/content/drive/MyDrive/NLP_A4/Resume/data/data'
path = f'{DATA_DIR}/ACCOUNTANT/10554236.pdf'

Load the resume and extract the text of its first page

In [None]:

# Open the PDF at `path` and keep only the text of its first page;
# later pages are not read by this notebook.
reader = PdfReader(path)
page = reader.pages[0]
text = page.extract_text()

In [None]:
# Create the parser; its constructor loads the en_core_web_md spaCy pipeline.
resume = ResumeParser()

**NER Visualization**

In [None]:
# Render the entities found in the first-page text with displaCy.
resume.NER_Visualization(text)

**Extract Relevant Information**

In [None]:
# Print each detected entity followed by its label.
resume.extract_relevant_entity(text)

the Department of Defense ORG
ERP ORG
Enterprise Resource Planning ORG
General Ledger ORG
DEAMS GPE
360B MONEY
first ORDINAL
fiscal year-end DATE
2012 DATE
DFAS Europe ORG
HQ USAFE ORG
July 2011 DATE
November 2012 DATE
State
Enterprise Resource Planning Office ORG
ERO ORG
the Defense Enterprise Accounting and Management System ORG
DEAMS GPE
ERO ORG
the DEAMS General Ledger ORG
daily DATE
DEAMS GPE
the DEAMS Functional
Management Office ORG
the DEAMS Program Management Office ORG
Ledger PERSON
360B MONEY
first ORDINAL
fiscal year-end DATE
2012 DATE
fiscal year 2010 DATE
2011 DATE
DEAMS GPE
fiscal year-end DATE
the Air Force Operational Test and Evaluation Center
(AFOTEC ORG
the Air Force ORG
DEAMS GPE
April 2010 DATE
June 2011 DATE
1st ORDINAL
Air Communications Operation Squadron ORG
1ACOS CARDINAL
$4.6M MONEY
four CARDINAL
USAFE Directorate of Intelligence ORG
USAFE ORG
USAFE Directorate of Air and Space Operations ORG
USAFE ORG
USAFE Directorate of Communications ORG
USAFE ORG
the 43

**Counting entities**

In [None]:
# Tally how many entities of each label occur in the first-page text.
resume.entity_count(text)

{'ORG': 33,
 'GPE': 5,
 'MONEY': 3,
 'ORDINAL': 3,
 'DATE': 15,
 'PERSON': 2,
 'CARDINAL': 5}

**Identify Personal Skills**

In [None]:
# Collect the entries of resume.personal_skills that appear in the text.
skillset = resume.identify_personal_skills(text)

In [None]:
# Show the set of skills that were found.
skillset

{'accountant', 'erp'}

**Evaluate skill performance**

In [None]:
# Print the percentage of the configured skills that were found in this resume.
resume.evaluate(skillset)

22.22222222222222 %
