In [38]:
# !pip install spacy or pip install -U 'spacy[cuda-autodetect]'

# !python -m spacy download en_core_web_sm #trained using cnn

# !python -m spacy download en_core_web_md #has word embedding (gloVe); trained using cnn

# !python -m spacy download en_core_web_trf #everything is trained using transformer

## 1. Loading Data

In [2]:
import pandas as pd
import numpy as np

# Load the resume dataset into a DataFrame.
# NOTE(review): the absolute /content/ path looks Colab-specific — adjust when running elsewhere.
df_resume = pd.read_csv("/content/resume.csv")

In [3]:
# Shuffle the rows so later positional picks (.iloc[...]) are not tied to the
# file order. A fixed seed keeps the shuffle — and therefore every sample shown
# further down — reproducible across "Restart & Run All".
SHUFFLE_SEED = 42
df_resume = df_resume.sample(frac=1, random_state=SHUFFLE_SEED)  # returns a new, shuffled frame
# df_resume = df_resume.iloc[:1000, ]  # optional if your computer is fast, no need
df_resume.shape

(2484, 4)

## 2. Loading Skills and Education Data

In [6]:
import spacy

# Load the spaCy English pipeline used throughout the notebook.
nlp = spacy.load('en_core_web_md')
# JSONL file that is later loaded into the entity ruler.
# NOTE(review): absolute /content/ path looks Colab-specific — adjust when running elsewhere.
skill_path = "/content/education_skill.jsonl"

In [7]:
# Attach the rule-based entity matcher and load its patterns from skill_path.
# Guarded so the cell can be re-run: adding a component whose name is already
# in the pipeline raises, and re-loading would register the patterns twice.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")
    ruler.from_disk(skill_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [8]:
# Smoke test: run one sentence through the pipeline and show the entities it finds.
doc = nlp("Chaky loves deep learning.")
doc.ents

(Chaky, deep learning)

## 3. Preprocessing the Resume Text

In [9]:
# Peek at the first rows of the (shuffled) frame before any cleaning.
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
167,15471999,DESIGNER STYLIST Summary Hi...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER
359,18001081,TEACHER Summary Kind a...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER
2057,12545844,PUBLIC RELATIONS ACCOUNT COORDINATOR ...,"<div class=""fontsize fontface vmargins hmargin...",PUBLIC-RELATIONS
859,29165698,GENERAL MANAGER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [10]:
from spacy.lang.en.stop_words import STOP_WORDS

# before that, let's clean our resume.csv dataframe
def preprocessing(sentence):
    """Normalise raw text into a space-separated string of lowercase lemmas.

    The text is run through the module-level ``nlp`` pipeline. Stop words
    (matched case-insensitively against ``STOP_WORDS``) and tokens tagged as
    punctuation, whitespace or symbols are dropped; every remaining token
    contributes its lemma, lower-cased and stripped.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if nothing survives).
    """
    skipped_pos = {'PUNCT', 'SPACE', 'SYM'}
    cleaned_tokens = []

    for token in nlp(sentence):
        # Compare the lowercased text: the output is lowercased anyway, and a
        # case-sensitive check lets capitalised stop words ("The", "AND")
        # slip through. STOP_WORDS is used directly instead of rebuilding a
        # list on every call.
        if token.text.lower() in STOP_WORDS or token.pos_ in skipped_pos:
            continue
        lemma = token.lemma_.lower().strip()
        if lemma:  # skip empty lemmas, which would leave double spaces in the result
            cleaned_tokens.append(lemma)

    return " ".join(cleaned_tokens)

In [11]:
# Take one resume by position as a worked example; which resume this is depends
# on the row shuffle performed earlier in the notebook.
random_resume = df_resume.Resume_str.iloc[5]
# Show only the first 300 characters to keep the output short.
random_resume[:300]

'         CONSULTANT           Professional Overview    Masters of Social Work Performed medical social work services for over 10 years, in multiple environments, providing patients and their families with comprehensive psychosocial assessment, crisis counseling, grief counseling, end of life counsel'

In [12]:
# Clean the same 300-character excerpt to compare against the raw text above.
preprocessing(random_resume[:300])

'consultant professional overview masters social work perform medical social work service 10 year multiple environment provide patient family comprehensive psychosocial assessment crisis counseling grief counseling end life counsel'

In [13]:
# let's apply to the whole dataframe
# Series.apply builds the new column in one pass, instead of iterating rows
# with iterrows() and writing cell-by-cell into the frame being iterated.
df_resume['Clean_resume'] = df_resume.Resume_str.apply(preprocessing)

In [14]:
# Confirm the new Clean_resume column was added.
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume
167,15471999,DESIGNER STYLIST Summary Hi...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER,designer stylist summary high achieve sales as...
359,18001081,TEACHER Summary Kind a...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER,teacher summary kind compassionate elementary ...
2057,12545844,PUBLIC RELATIONS ACCOUNT COORDINATOR ...,"<div class=""fontsize fontface vmargins hmargin...",PUBLIC-RELATIONS,public relation account coordinator summary pu...
859,29165698,GENERAL MANAGER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS,general manager summary dedicate enthusiast ye...
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr administrator marketing associate hr admini...


## 4. Let's really Extract Skills and Education!!

In [15]:
def skills_and_education(text):
    """Extract the distinct skill and education mentions found in ``text``.

    The text is passed through the module-level ``nlp`` pipeline and the
    resulting entities are split by label.

    Parameters
    ----------
    text : str
        Text to analyse (the notebook passes the cleaned resume text).

    Returns
    -------
    tuple[list[str], list[str]]
        ``(skills_list, education_list)``: the texts of entities labelled
        ``"SKILL"`` and ``"EDUCATION"`` respectively, de-duplicated and
        ordered by first appearance in ``text``.
    """
    doc = nlp(text)  # note that this nlp already know skills

    # Dict keys act as an insertion-ordered set: duplicates collapse while the
    # order of first appearance is kept. The earlier list(set(...)) produced an
    # arbitrary order that could differ between kernel sessions, so reversing
    # it could not put the education entries in any meaningful order.
    skills = {}
    education = {}
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            skills[ent.text] = None
        elif ent.label_ == "EDUCATION":
            education[ent.text] = None

    return list(skills), list(education)

# def unique_skills(x):
    # return list(set(x))

In [16]:
# Show a single row as a reminder of the columns available before extraction.
df_resume.head(1)

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume
167,15471999,DESIGNER STYLIST Summary Hi...,"<div class=""fontsize fontface vmargins hmargin...",DESIGNER,designer stylist summary high achieve sales as...


In [17]:
# Each cell of Skills_Education holds the (skills_list, education_list) tuple
# returned by skills_and_education for that row's cleaned text.
df_resume['Skills_Education'] = df_resume.Clean_resume.apply(skills_and_education)
# df_resume['Skills'] = df_resume.Skills.apply(unique_skills)

In [18]:
# Inspect the extracted (skills, education) tuple for the first row.
df_resume.Skills_Education.iloc[0]

(['schedule'], ['diploma'])

## 5. Visualization

In [19]:
# Distinct job categories present in the dataset.
set(df_resume.Category)

{'ACCOUNTANT',
 'ADVOCATE',
 'AGRICULTURE',
 'APPAREL',
 'ARTS',
 'AUTOMOBILE',
 'AVIATION',
 'BANKING',
 'BPO',
 'BUSINESS-DEVELOPMENT',
 'CHEF',
 'CONSTRUCTION',
 'CONSULTANT',
 'DESIGNER',
 'DIGITAL-MEDIA',
 'ENGINEERING',
 'FINANCE',
 'FITNESS',
 'HEALTHCARE',
 'HR',
 'INFORMATION-TECHNOLOGY',
 'PUBLIC-RELATIONS',
 'SALES',
 'TEACHER'}

In [20]:
# Keep only the resumes belonging to one job category.
category = 'INFORMATION-TECHNOLOGY'
cond = df_resume['Category'].eq(category)  # boolean mask, one entry per row

df_resume_it = df_resume.loc[cond]
df_resume_it.shape

(120, 6)

In [21]:
# skills_education = np.concatenate(df_resume_it.Skills_Education.values)

In [22]:
# counting
# from collections import Counter, OrderedDict

# counting = Counter(skills_education)
# counting = OrderedDict(counting.most_common(10))

In [23]:
# counting

In [24]:
# counting.shape

In [25]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 3))
# plt.xticks(rotation =45)

# plt.bar(counting.keys(), counting.values())

## 6. Named Entity Recognition

In [26]:
from spacy import displacy

# Arbitrary positional pick (0-based position 43 of the filtered subset) used as
# the example to visualise.
text = df_resume_it.Clean_resume.iloc[43]

In [27]:
# Run the selected resume through the pipeline so its entities can be rendered.
doc = nlp(text)

In [28]:
# Show the pipeline's component names (entity_ruler should be among them).
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [29]:
# Per-label CSS backgrounds passed to displacy through the "colors" option.
# NOTE(review): the EDUCATION value ends with a stray ';' that the SKILL value
# does not have — confirm it is intentional.
colors = {"SKILL": "linear-gradient(0.25turn, #3f87a6, #ebf8e1, #f69d3c)", 
          "EDUCATION": "linear-gradient(#e66465, #9198e5);"}
options = {"colors": colors}

# Render the document's entities inline in the notebook.
displacy.render(doc, style="ent", options=options, jupyter=True)

## 7. Let's Load the PDF (adding some realism)

In [31]:
# ! pip install PyPDF2

In [32]:
from PyPDF2 import PdfReader
# pip install PyPDF2

# Read a CV from a PDF file and pull the text of its first page.
# NOTE(review): absolute /content/ path looks Colab-specific — adjust when running elsewhere.
reader = PdfReader("/content/someone_cv.pdf")
page = reader.pages[0] #first page just for demo
text = page.extract_text() 

In [33]:
# Apply the same cleaning used for the CSV resumes to the PDF text
# (overwrites `text` with the cleaned version).
text = preprocessing(text)

In [34]:
# Run the cleaned PDF text through the pipeline to obtain its entities.
doc = nlp(text)

In [35]:
# Per-label CSS backgrounds passed to displacy through the "colors" option.
# NOTE(review): the EDUCATION value ends with a stray ';' that the SKILL value
# does not have — confirm it is intentional.
colors = {"SKILL": "linear-gradient(0.25turn, #3f87a6, #ebf8e1, #f69d3c)", 
          "EDUCATION": "linear-gradient(#e66465, #9198e5);"}
options = {"colors": colors}

# Render the entities found in the PDF text inline in the notebook.
displacy.render(doc, style="ent", options=options, jupyter=True)

In [37]:
# Gather the text of every SKILL and EDUCATION entity in `doc` into two lists
# (document order, duplicates kept).
skills = [ent.text for ent in doc.ents if ent.label_ == 'SKILL']
education = [ent.text for ent in doc.ents if ent.label_ == 'EDUCATION']

# print(set(skills))
# print(set(education))