In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk

In [None]:
import en_core_web_sm
import spacy
from spacy.pipeline import EntityRuler
from spacy import displacy
from spacy.tokens import Span
import jsonlines

In [None]:
# Read the job data, remove the 'Unnamed: 0' column (presumably an index
# written out by an earlier save -- confirm) and discard every row that
# contains a missing value.
df = (
    pd.read_csv('../data/cleaned_data.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

In [None]:
# Preview the first rows of the loaded data frame.
df.head()

In [None]:
# Column holding the job description texts; fed to the spaCy pipeline later on.
raw_text= df.description

In [None]:
# The `job` column of the data frame (presumably the job titles -- confirm).
raw_names = df.job

In [None]:
# Project root = parent of the current working directory, kept with a
# trailing slash so relative paths can simply be appended to it.
PROJECT_DIR = f"{os.path.dirname(os.getcwd())}/"

# Location of the JSON-lines file with the skill patterns.
skill_pattern_path = f"{PROJECT_DIR}data/skill_patterns.jsonl"

In [None]:
# Skill list for entity labeling: the upper-cased 'label' of every record
# in the skill pattern file, in file order.
with jsonlines.open(skill_pattern_path) as f:
    created_entities = []
    for line in f.iter():
        created_entities.append(line['label'].upper())

# Cell output: number of labels that were read.
len(created_entities)


In [None]:
# Inspect a sample of the labels (positions 1000 up to, not including, 1200).
created_entities[1000:1200]

In [None]:
# Load the pre-trained English model shipped as the en_core_web_sm package.
nlp = en_core_web_sm.load()

# Create an "ner" component object.
# NOTE(review): no cell visible in this notebook adds `ner` to `nlp`
# (there is no add_pipe call), and the EntityRuler import / skill pattern
# file are not attached to the pipeline here either -- confirm the pipeline
# really produces the skill labels that later cells filter on.
ner = nlp.create_pipe("ner")
# Cell output: names of the components currently in the pipeline.
nlp.pipe_names

In [None]:
def create_tokenized_texts_list(job_descriptions, model=None):
    '''Run every job description through a spaCy pipeline.

    Args:
        job_descriptions: iterable of description strings.
        model: pipeline object exposing ``pipe(texts)``. When omitted, the
            notebook-level ``nlp`` pipeline is used, as before.

    Returns:
        A list with one processed document per description, in input order.
    '''
    pipeline = nlp if model is None else model
    # pipe() streams the texts through the pipeline in batches and yields the
    # documents in input order, which avoids one full pipeline call per text.
    return list(pipeline.pipe(job_descriptions))

# Process all job descriptions once; later cells index into this list.
jd_tokenized = create_tokenized_texts_list(raw_text)

In [None]:
# Sanity check: create_tokenized_texts_list returns a plain Python list.
type(jd_tokenized)

In [None]:
def visualize_entity_ruler(entity_list, doc):
    '''Show the entity highlights of one job description inline.

    Args:
        entity_list: entity labels to highlight; handed to displaCy as the
            "ents" option.
        doc: processed spaCy document to visualize.

    Returns:
        None. The visualization is displayed as notebook output.
    '''
    options = {"ents": entity_list}
    # displacy.serve() starts a web server and blocks until it is stopped, so
    # calling it in a loop never gets past the first document. render() with
    # jupyter=True displays the markup in the cell output and returns at once.
    displacy.render(doc, style='ent', jupyter=True, options=options)
    
    
# Visualize twenty consecutive job descriptions: positions 3757 through 3776.
sample_positions = range(3757, 3777)
for i in sample_positions:
    job_doc = jd_tokenized[i]
    visualize_entity_ruler(created_entities, job_doc)

In [None]:
def create_skill_set(doc):
    '''Collect the distinct skill names found among a document's entities.

    An entity counts as a skill when its label contains "skill" in any
    letter case. The stored name is the upper-cased label with its first six
    characters removed (presumably a "SKILL|"-style prefix -- confirm
    against the pattern file).

    Args:
        doc: object whose ``ents`` are iterable and carry a ``label_`` string.

    Returns:
        A set of upper-cased skill names.
    '''
    skills = set()
    for entity in doc.ents:
        label = entity.label_
        if 'skill' in label.lower():
            skills.add(label.upper()[6:])
    return skills

# Skill set of the job description at index 1 ("vacature" is Dutch for job vacancy).
vacature_skillset = create_skill_set(jd_tokenized[1])

In [None]:
# Cell output: how many distinct skills were extracted for that job description.
len(vacature_skillset)

In [None]:
def create_skillset_dict(resume_names, resume_texts):
    '''Map each name to the skill set extracted from its matching text.

    Args:
        resume_names: iterable of names, used as the dictionary keys.
        resume_texts: iterable of processed documents in the same order as
            resume_names; each one is passed to create_skill_set.

    Returns:
        A dict of name -> set of skills. Pairing stops at the shorter of the
        two inputs, and when a name occurs more than once the last one wins.
    '''
    skillsets = [create_skill_set(resume_text) for resume_text in resume_texts]
    # Previously the list was built and then dropped, so the function returned
    # None; pair it with the names as the summary line describes.
    return dict(zip(resume_names, skillsets))

In [None]:
# Create a span for the new entity: tokens 4 up to (not including) 8,
# labelled "GRADUATION", and append it to the document's entities.
# NOTE(review): `doc` is not assigned in any cell visible in this notebook;
# on a fresh kernel this cell fails with NameError -- confirm which document
# is meant.
# NOTE(review): spaCy rejects entity spans that overlap -- confirm tokens 4-7
# are not already part of an entity in `doc`.
fb_ent = Span(doc, 4, 8, label="GRADUATION")
orig_ents = list(doc.ents)
doc.ents = orig_ents + [fb_ent]
ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
print(ents)
# render(..., jupyter=True) shows the result inline; displacy.serve() would
# start a web server and block the notebook until interrupted.
displacy.render(doc, style="ent", jupyter=True)