In [1]:
#spacy
import spacy
import en_core_web_lg
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
#import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import jsonlines
import os

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

#warning
import warnings 
warnings.filterwarnings('ignore')

2021-12-02 10:44:07.256168: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-02 10:44:07.256240: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lewagonlew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lewagonlew/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Project root: two directory levels above the current working directory.
PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
PATH

'/home/lewagonlew/code/marcorodrigues/skills_matcher'

In [3]:
#skills dictionary for entity recognition/training the model
# Read from data/dictionaries/all_skills.csv under PATH; the preview below shows its first rows.

skills = pd.read_csv(PATH + "/data/dictionaries/all_skills.csv")
skills.head()

Unnamed: 0,Label,Skill
0,SKILL,Access software
1,SKILL,Cisco AnyConnect
2,SKILL,Calendar and scheduling software
3,SKILL,Meeting scheduling software
4,SKILL,Cloud-based data access and sharing software


In [59]:
#job description data
# Load the CSV, drop the saved index column, then remove incomplete and duplicated rows.

data = (
    pd.read_csv(PATH + '/data/cleaned_data.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
    .drop_duplicates()
)
data.head()

Unnamed: 0,ISCO,major_job,job,position,location,description
0,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Accelerator Physicist id54315,"Villigen PSI, Aargau",[' You have an academic degree in physics or e...
1,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Applied Physicist (Computing) (EP-LBC-2021-125...,Geneva,[' Be in charge of the development of applicat...
2,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Accelerator Physicist (BE-ABP-LNO-2021-122-LD)...,Geneva,[' Contribute to the maintenance and developme...
3,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Medical Devices Physicist,"Newton, Cambridgeshire",[' Agency: Newton Colmore Consulting Reference...
4,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Fluidics Physicist,Cambridge,[' Agency: Newton Colmore Consulting Reference...


In [5]:
#resume data from kaggle - for tryout
# `df` must stay a DataFrame: later cells call df.reindex(...) and index df[...] on it.

df = pd.read_csv(PATH + "/data/Resume/Resume.csv")
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [6]:
# Drop the ISCO, major_job and location columns.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
data = data.drop(columns = ["ISCO", "major_job", "location"], errors="ignore")

In [60]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0,ISCO,major_job,job,position,location,description
0,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Accelerator Physicist id54315,"Villigen PSI, Aargau",[' You have an academic degree in physics or e...
1,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Applied Physicist (Computing) (EP-LBC-2021-125...,Geneva,[' Be in charge of the development of applicat...
2,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Accelerator Physicist (BE-ABP-LNO-2021-122-LD)...,Geneva,[' Contribute to the maintenance and developme...
3,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Medical Devices Physicist,"Newton, Cambridgeshire",[' Agency: Newton Colmore Consulting Reference...
4,21,SCIENCE AND ENGINEERING PROFESSIONALS,physicist,Fluidics Physicist,Cambridge,[' Agency: Newton Colmore Consulting Reference...


In [6]:
#small spacy english model (en_core_web_sm) is loaded for faster computation - switch to the large one (en_core_web_lg) if preferred

nlp = spacy.load("en_core_web_sm")
# JSONL file with the entity-ruler patterns; it is loaded into the pipeline's entity ruler in the next cell.
skill_pattern_path = PATH + "/data/skill_patterns.jsonl"

In [7]:
# Attach the rule-based entity matcher and load the skill patterns into it.
# Reuse the component when it is already in the pipeline so that re-running
# this cell does not fail on a duplicate "entity_ruler" name.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [8]:
def get_skills(text):
    """Return the text of every entity labelled ``SKILL`` found in ``text``.

    The text is processed with the notebook-level ``nlp`` pipeline. Matches
    are returned in the order ``doc.ents`` yields them and duplicates are
    kept (see ``unique_skills`` for de-duplication).

    Parameters
    ----------
    text : str
        Raw text to run through the pipeline.

    Returns
    -------
    list of str
        Surface text of each ``SKILL`` entity; empty when none is found.
    """
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "SKILL"]


def unique_skills(x):
    """Return the distinct items of ``x`` as a list, in first-occurrence order.

    ``dict.fromkeys`` is used instead of ``set`` so the result order is
    deterministic across runs. Items must be hashable.
    """
    return list(dict.fromkeys(x))

In [9]:
#cleaning data function - pass True for the steps that should run: lowercasing / stopword removal / punctuation removal / lemmatizing
def clean_data(data, lower = False, stopw=False, punctuation=False, lemmatize=False):
    """Return a cleaned copy of ``data`` with a normalised ``description`` column.

    Rows containing any missing value are dropped and digit runs are always
    removed from ``description``. The remaining steps are opt-in and run in
    this order: lowercasing, punctuation removal, stopword removal,
    lemmatizing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named ``description``.
    lower : bool
        Lowercase every whitespace-separated token (also collapses runs of
        whitespace to single spaces).
    stopw : bool
        Remove NLTK English stopwords (case-sensitive match).
    punctuation : bool
        Replace every character that is neither a word character nor
        whitespace with a single space.
    lemmatize : bool
        Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    """
    # Explicit copy: the column assignments below operate on an independent frame.
    data = data.dropna().copy()

    # digits - regex=True is required, otherwise the pattern is matched literally
    # on pandas versions where str.replace defaults to regex=False
    data['description'] = data['description'].str.replace(r'\d+', '', regex=True)

    ## Lower case
    if lower:
        data['description'] = data['description'].apply(
            lambda text: " ".join(word.lower() for word in text.split())
        )

    ## remove tabulation and punctuation
    if punctuation:
        data['description'] = data['description'].str.replace(r'[^\w\s]', ' ', regex=True)

    ## stopwords
    if stopw:
        # Build the set once: membership tests are O(1) instead of scanning a list per word.
        stop = set(stopwords.words('english'))
        data['description'] = data['description'].apply(
            lambda text: " ".join(word for word in text.split() if word not in stop)
        )

    ## lemmatizing
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        data['description'] = data['description'].apply(
            lambda text: " ".join(lemmatizer.lemmatize(word) for word in text.split())
        )

    ##depending on data frame, drop more columns, re-index
    ##clean_data.drop(columns = "position")

    return data

In [61]:
# Run every cleaning step (lower, stopw, punctuation, lemmatize all True).
# NOTE(review): this rebinds the name `clean_data` from the function to the frame it returns,
# so the cell cannot be re-run without first re-running the function definition - consider a distinct result name.
clean_data = clean_data(data, True, True, True, True)

In [65]:
# (rows, columns) of the cleaned frame.
clean_data.shape

(4104, 6)

In [66]:
#randomly choose 200 rows (or all rows if fewer are available)
# random_state pins the draw so a restart-and-run-all selects the same rows.

clean_data = clean_data.sample(n=min(200, len(clean_data)), random_state=42)
clean_data.head()

Unnamed: 0,ISCO,major_job,job,position,location,description
551,21,SCIENCE AND ENGINEERING PROFESSIONALS,ecologist,Ecologist Public Sector,Greater London,experienced terrestrial ecologist good underst...
1349,23,TEACHING PROFESSIONALS,art teacher,Secondary Art Teacher wanted in Moston,Manchester,looking experience art department enthusiastic...
1710,22,HEALTH PROFESSIONALS,psychiatrist,General Adult Specialty Doctor Psychiatrist,West Midlands,dedicated platinum awarded compliance team ind...
2001,22,HEALTH PROFESSIONALS,pharmacist,Pharmacist,Birmingham,gphc registered previous experience month uk b...
3522,25,INFORMATION AND COMMUNICATIONS TECHNOLOGY PROF...,data analyst,Data analyst,Barcelona,analyze detail operation logistics complexity ...


In [67]:
# (rows, columns) after the 200-row random selection.
clean_data.shape

(200, 6)

In [78]:
# Extract the SKILL entities from each lower-cased description (duplicates kept).
clean_data["skills"] = clean_data["description"].str.lower().map(get_skills)
#clean_data["skills"] = clean_data["skills"].apply(unique_skills)
clean_data.head()

Unnamed: 0,ISCO,major_job,job,position,location,description,skills
551,21,SCIENCE AND ENGINEERING PROFESSIONALS,ecologist,Ecologist Public Sector,Greater London,experienced terrestrial ecologist good underst...,[]
1349,23,TEACHING PROFESSIONALS,art teacher,Secondary Art Teacher wanted in Moston,Manchester,looking experience art department enthusiastic...,[]
1710,22,HEALTH PROFESSIONALS,psychiatrist,General Adult Specialty Doctor Psychiatrist,West Midlands,dedicated platinum awarded compliance team ind...,[]
2001,22,HEALTH PROFESSIONALS,pharmacist,Pharmacist,Birmingham,gphc registered previous experience month uk b...,[]
3522,25,INFORMATION AND COMMUNICATIONS TECHNOLOGY PROF...,data analyst,Data analyst,Barcelona,analyze detail operation logistics complexity ...,[]


In [90]:
# Run the pipeline on the description at position 80 and render its entities inline with displaCy.
sent = nlp(clean_data["description"].iloc[80])
displacy.render(sent, style="ent", jupyter=True)

# Same approach with different data (Resume.csv from Kaggle)

In [30]:
# Randomly keep 200 resumes (or all of them if fewer are available).
# random_state pins the draw so a restart-and-run-all selects the same rows.
df = df.sample(n=min(200, len(df)), random_state=42)
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_Resume,skills
748,17963031,SYSTEM ADMINISTRATOR/ASSISTANT SUPERV...,"<div class=""fontsize fontface vmargins hmargin...",HEALTHCARE,system administrator assistant supervisor prof...,[]
1226,16509761,DIGITAL MEDIA INTERN Summary ...,"<div class=""fontsize fontface vmargins hmargin...",DIGITAL-MEDIA,digital medium intern summary competent reliab...,[]
852,45656814,FLOATER/TEACHER Summary To ...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS,floater teacher summary obtain position teachi...,[]
2462,22168194,CUSTOMER SERVICE AGENT Prof...,"<div class=""MPR skn-cbg1 fontsize fontface vma...",AVIATION,customer service agent professional summary cu...,[]
2405,14589288,ASSEMBLY MECHANIC Summary A...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,assembly mechanic summary aircraft mechanic 16...,[]


In [31]:
#different cleaning function

# Spans blanked out before tokenising: @mentions, any character that is not
# alphanumeric/space/tab, scheme://... URLs, a leading "rt", and http...<double quote> runs.
noise_pattern = re.compile(
    r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"'
)
# Built once up front: the stopword set and the lemmatizer do not change between rows or words.
stop_words = set(stopwords.words("english"))
lm = WordNetLemmatizer()

clean = []
for resume_text in df["Resume_str"]:
    review = noise_pattern.sub(" ", resume_text).lower().split()
    review = [lm.lemmatize(word) for word in review if word not in stop_words]
    review = " ".join(review)
    clean.append(review)

In [32]:
# Attach the cleaned text, then extract and de-duplicate the SKILL entities per resume.
df["Clean_Resume"] = clean
df["skills"] = (
    df["Clean_Resume"]
    .str.lower()
    .apply(get_skills)
    .apply(unique_skills)
)
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_Resume,skills
748,17963031,SYSTEM ADMINISTRATOR/ASSISTANT SUPERV...,"<div class=""fontsize fontface vmargins hmargin...",HEALTHCARE,system administrator assistant supervisor prof...,[]
1226,16509761,DIGITAL MEDIA INTERN Summary ...,"<div class=""fontsize fontface vmargins hmargin...",DIGITAL-MEDIA,digital medium intern summary competent reliab...,[]
852,45656814,FLOATER/TEACHER Summary To ...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS,floater teacher summary obtain position teachi...,[]
2462,22168194,CUSTOMER SERVICE AGENT Prof...,"<div class=""MPR skn-cbg1 fontsize fontface vma...",AVIATION,customer service agent professional summary cu...,[]
2405,14589288,ASSEMBLY MECHANIC Summary A...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,assembly mechanic summary aircraft mechanic 16...,[]


In [33]:
# Inspect the extracted skill list of each resume.
df["skills"]

748     []
1226    []
852     []
2462    []
2405    []
        ..
1442    []
994     []
2234    []
1831    []
1981    []
Name: skills, Length: 200, dtype: object

In [48]:
# Run the pipeline on the raw (uncleaned) resume at position 40 and render its entities inline.
sent = nlp(df["Resume_str"].iloc[40])
displacy.render(sent, style="ent", jupyter=True)

## Adding job category as an entity

In [45]:
# Register every distinct resume category as a "Job-Category" pattern on the entity ruler.
patterns = df.Category.unique()
ruler.add_patterns(
    [{"label": "Job-Category", "pattern": category} for category in patterns]
)

In [87]:
# Highlight colour (CSS colour or gradient) for each entity label to display.
colors = {
    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
    "ORG": "#ffd966",
    "PERSON": "#e06666",
    "GPE": "#9fc5e8",
    "DATE": "#c27ba0",
    "ORDINAL": "#674ea7",
    "PRODUCT": "#f9cb9c",
}
# Render exactly the labels that have a colour, in the order they are declared above.
options = {"ents": list(colors), "colors": colors}
# Run the pipeline on the raw resume at position 50 and render only the labels/colours set in `options`.
sent = nlp(df["Resume_str"].iloc[50])
displacy.render(sent, style="ent", jupyter=True, options=options)