In [4]:
import pandas as pd
import re
import spacy
import numpy as np
import json
from spacy.tokens import DocBin
from tqdm import tqdm

In [3]:
# Load data
df = pd.read_csv("../data/raw/linkedin_jobs.csv")
df = df.drop_duplicates().reset_index(drop=True)
df.head()

Unnamed: 0,job_title,job_post,job_description,job_function,job_industry
0,Data Scientist (Entry Level),2 weeks ago,"<div class=""show-more-less-html__markup relati...",Information Technology,Staffing and Recruiting
1,Data Scientist Intern ('24),11 hours ago,"<div class=""show-more-less-html__markup relati...",Engineering and Information Technology,Software Development
2,Data Science,5 days ago,"<div class=""show-more-less-html__markup relati...",Engineering and Information Technology,IT Services and IT Consulting
3,Data Scientist,5 hours ago,"<div class=""show-more-less-html__markup relati...",Information Technology,"Technology, Information and Internet"
4,Data Scientist,1 week ago,"<div class=""show-more-less-html__markup relati...","Information Technology, Business Development, ...","Transportation, Logistics, Supply Chain and St..."


In [87]:
# Load spaCy 
nlp = spacy.load("en_core_web_lg")
disabled_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tokenizer', 'tagger', 'attribute_ruler']]
for pipe in disabled_pipes:
    nlp.disable_pipe(pipe)

# Text Preprocessing
# 1. Remove Stop Words and Punctuations
def preprocess_job_desc(text):
    text = re.sub("<[^>]+>", " ", text)  # remove html element tags
    text = re.sub("[ ]+", " ", text)  # remove long spaces
    text = re.sub("[^\u0000-\u007F]+", "", text)  # remove unicode characters/ non ASCII characters
    text = text.lower()  # transform to lower case
    text = text.strip()
    doc = nlp(text)
    return " ".join([word.text for word in doc])

texts = []
for i, row in tqdm(df.iterrows()):
    texts.append(preprocess_job_desc(row['job_description'])+"\n")


with open("../data/interim/jobs.txt", "w") as f:
    f.writelines(texts)

214it [00:04, 43.28it/s]


Prepare Train, Dev and Test Data

In [88]:


# Open the annotations
with open("../data/interim/annotations.json", "r") as f:
    annotations_json = json.load(f)

# Set params for random select data
total_size = len(annotations_json['annotations'])
train_len, dev_len, test_len = int(total_size * 0.8), int(total_size * 0.1), int(total_size * 0.1) 
data = []
for aj in annotations_json['annotations']:
    data.append((aj[0], aj[1]))    
indexes = np.arange(total_size)
np.random.shuffle(indexes)

# Prepare Train Data
train_data = data[:train_len] 
nlp = spacy.blank('en')
db = DocBin()
for text, annotations in train_data:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations['entities']:
        span = doc.char_span(start, end, label=label)
        if type(span) is not type(None):
            ents.append(span)
    doc.ents = ents
    db.add(doc)
db.to_disk("../data/processed/train.spacy")

# Prepare Dev Data
dev_data = data[train_len : train_len + dev_len] 
nlp = spacy.blank('en')
db = DocBin()
for text, annotations in dev_data:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations['entities']:
        span = doc.char_span(start, end, label=label)
        if type(span) is not type(None):
            ents.append(span)
    doc.ents = ents
    db.add(doc)
db.to_disk("../data/processed/dev.spacy")

# Prepare Test Data
test_data = data[train_len + dev_len : train_len + dev_len + test_len] 
nlp = spacy.blank('en')
db = DocBin()
for text, annotations in test_data:
    doc = nlp(text)
    ents = []
    for start, end, label in annotations['entities']:
        span = doc.char_span(start, end, label=label)
        if type(span) is not type(None):
            ents.append(span)
    doc.ents = ents
    db.add(doc)
db.to_disk("../data/processed/test.spacy")

In [104]:
test_data

<spacy.tokens._serialize.DocBin at 0x1b95a598ca0>

Train Model

In [91]:
# !python -m spacy init fill-config ../config/base_config.cfg ../config/config.cfg

In [90]:
# !python -m spacy train ../config/config.cfg --output ../models --paths.train ../data/processed/train.spacy --paths.dev ../data/processed/dev.spacy

# Test Model

In [10]:
# Load spaCy 
nlp = spacy.load("en_core_web_lg")
disabled_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tokenizer', 'tagger', 'attribute_ruler']]
for pipe in disabled_pipes:
    nlp.disable_pipe(pipe)

# Load the Test Data
test_data = DocBin().from_disk("../data/processed/test.spacy")
test_data.__len__()

# Access the Test Data and load Spacy Model
best_model = spacy.load("../models/output/model-best")
i = 0
for doc in test_data.get_docs(nlp.vocab):
    docs = best_model(doc.text)
    for doc in docs.ents:
        print(doc,">>>" , doc.label_)
    print()
    # spacy.displacy.serve(doc, style="ent")

fashion >>> KNOWLEDGE AND CONCEPTS
- thinking >>> KNOWLEDGE AND CONCEPTS
problem solvers >>> SOFTSKILLS
critical thinking >>> SOFTSKILLS
communication abilities >>> SOFTSKILLS
business analysis >>> KNOWLEDGE AND CONCEPTS
big data >>> KNOWLEDGE AND CONCEPTS
big query >>> KNOWLEDGE AND CONCEPTS
sql >>> PROGRAMMING LANGUAGES
data analysis >>> KNOWLEDGE AND CONCEPTS
python >>> PROGRAMMING LANGUAGES
r >>> PROGRAMMING LANGUAGES
statistical modeling >>> KNOWLEDGE AND CONCEPTS
machine learning >>> KNOWLEDGE AND CONCEPTS
religion >>> KNOWLEDGE AND CONCEPTS
disability status >>> KNOWLEDGE AND CONCEPTS
any >>> TOOLS AND TECHNOLOGIES

machine learning >>> KNOWLEDGE AND CONCEPTS

local retail >>> KNOWLEDGE AND CONCEPTS
- learning algorithms >>> KNOWLEDGE AND CONCEPTS
data science >>> KNOWLEDGE AND CONCEPTS
statistical analysis >>> KNOWLEDGE AND CONCEPTS
machine learning >>> KNOWLEDGE AND CONCEPTS
modeling >>> KNOWLEDGE AND CONCEPTS
python >>> PROGRAMMING LANGUAGES
r >>> PROGRAMMING LANGUAGES
sql >>