In [21]:
import pandas as pd
import re
import spacy
from tqdm import tqdm

In [22]:
# Load the raw job postings, drop exact duplicate rows and renumber the index
df = (
    pd.read_csv("../data/raw/linkedin_jobs.csv")
    .drop_duplicates()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,job_title,job_post,job_description,job_function,job_industry
0,AI Research Scientist I,10 hours ago,Company Overview1910 Genetics is the only comp...,Engineering and Information Technology,Biotechnology Research
1,Data Scientist(Remote) - Junior/Entry Level,4 hours ago,The Job Market is Challenging due to more than...,Information Technology,IT Services and IT Consulting
2,Data Scientist (Entry Level),1 week ago,The world's largest Cruise Line is seeking a D...,Information Technology,Staffing and Recruiting
3,data scientist,9 hours ago,Now Brewing – Data Scientist #tobeapartnerFrom...,Management and Strategy/Planning,"Food and Beverage Services, Manufacturing, and..."
4,Junior Data Scientist,6 hours ago,This is a remote position. Junior Data Scienti...,Information Technology,Software Development


In [20]:
# Components that must stay enabled for lemmatization.
# In the spaCy v3 English pipelines the tagger reads its features from the
# shared "tok2vec" component, and the rule-based lemmatizer relies on the POS
# tags produced by tagger + attribute_ruler. Disabling "tok2vec" therefore
# silently degrades the tags and the lemmas, so it is kept here.
# ("tokenizer" is not a pipeline component and never appears in pipe_names.)
KEPT_PIPES = ("tok2vec", "tagger", "attribute_ruler", "lemmatizer")

nlp = spacy.load("en_core_web_sm")
# Everything not needed for lemmatization (e.g. parser, ner) is switched off.
disabled_pipes = [pipe for pipe in nlp.pipe_names if pipe not in KEPT_PIPES]
for pipe in disabled_pipes:
    nlp.disable_pipe(pipe)

# Text Preprocessing
# 1. Lemmatize; drop stop words, punctuation and other non-alphabetic tokens
def preprocess_text(text):
    """Return the space-joined lemmas of the tokens in `text` that are
    alphabetic and not stop words, as produced by the module-level `nlp`."""
    lemmas = []
    for token in nlp(text):
        # Skip stop words and anything that is not purely alphabetic
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Number of postings (taken from the top of `df`) to preprocess.
N_ANNOTATION_SAMPLES = 20

# Iterate over the description column directly instead of df.iterrows():
# no per-row Series is built, the sample size is an explicit constant rather
# than a `break` on the index, and tqdm knows the total up front.
sample_descriptions = df["job_description"].head(N_ANNOTATION_SAMPLES)
texts = [
    preprocess_text(description) + "\n"  # one posting per line
    for description in tqdm(sample_descriptions)
]


# Write the preprocessed postings (each already ends with "\n").
# The encoding is explicit so the file is UTF-8 on every platform instead of
# depending on the locale default, which can fail on non-ASCII letters.
with open("../data/interim/jobs.txt", "w", encoding="utf-8") as f:
    f.writelines(texts)

20it [00:00, 42.34it/s]


In [13]:
# Load the test postings, drop exact duplicate rows and renumber the index
df = (
    pd.read_csv("../data/raw/test.csv")
    .drop_duplicates()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,job_title,job_post,job_description,job_function,job_industry
0,Machine Learning Engineer,3 hours ago,All below skills are must have skills: Experie...,Information Technology,"Information Services and Technology, Informati..."
1,Data Science,11 hours ago,"Hi,Greetings from Conch Technologies IncPositi...",Engineering and Information Technology,IT Services and IT Consulting
2,Data Science,1 day ago,"Hi,Greetings from Conch Technologies IncPositi...",Engineering and Information Technology,IT Services and IT Consulting
3,Junior Data Scientist,17 hours ago,This is a remote position. Junior Data Scienti...,Information Technology,Software Development
4,Data Scientist,41 minutes ago,DATA SCIENTIST 5TECHNOLOGY COMPANY ROLE IS REM...,Consulting,IT Services and IT Consulting


# Create Annotations for the Training Data

In [68]:
import spacy
import json
from spacy.tokens import DocBin

# Read the annotation export. JSON is read as UTF-8 explicitly so the result
# does not depend on the platform's default locale encoding.
with open("../data/interim/annotations.json", "r", encoding="utf-8") as f:
    annotations_json = json.load(f)

# Each entry is indexed as [text, annotations]; keep them as (text, annotations) pairs.
train_data = [(aj[0], aj[1]) for aj in annotations_json['annotations']]


def _build_docbin(pipeline, examples):
    """Convert (text, annotations) pairs into a DocBin.

    Parameters
    ----------
    pipeline : spaCy Language object used to tokenize each text.
    examples : iterable of (text, annotations) pairs, where
        annotations['entities'] is a list of (start, end, label) character
        offsets into the text.

    Returns
    -------
    DocBin holding one Doc per example with `doc.ents` set.

    Entities whose character offsets do not align with token boundaries are
    skipped with a printed message: `Doc.char_span` returns None for them,
    and assigning None to `doc.ents` would raise.
    """
    doc_bin = DocBin()
    for text, annotations in examples:
        doc = pipeline(text)
        ents = []
        for start, end, label in annotations['entities']:
            span = doc.char_span(start, end, label=label)
            if span is None:
                print(
                    f"Skipping entity {label!r} at [{start}, {end}): "
                    "offsets do not align with token boundaries"
                )
                continue
            ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    return doc_bin


# A blank English pipeline is enough here: only the tokenizer is needed.
nlp = spacy.blank('en')

# NOTE(review): training uses everything except the last 7 examples while dev
# uses only the last 3, so the 4 examples in between end up in neither set.
# The original split is preserved; confirm whether this gap is intentional.
db = _build_docbin(nlp, train_data[:-7])
db.to_disk("../data/processed/train.spacy")

db = _build_docbin(nlp, train_data[-3:])
db.to_disk("../data/processed/dev.spacy")

# Test Model

In [77]:
# Load the medium English pipeline and bind it to `nlp`, the name that
# preprocess_job_desc below uses.
nlp = spacy.load("en_core_web_md")
# The pipe-disabling step is left commented out here, so no component is
# disabled after loading.
# disabled_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tokenizer', 'tagger', 'lemmatizer', 'attribute_ruler']]
# for pipe in disabled_pipes:
#     nlp.disable_pipe(pipe)

# Text Preprocessing
# 1. Strip HTML tags, collapse repeated spaces, lowercase, keep alphabetic tokens only
def preprocess_job_desc(text):
    """Normalize a raw job description into space-separated alphabetic tokens.

    Steps: replace HTML element tags with a space, collapse runs of spaces,
    lowercase and strip the text, tokenize it with the module-level `nlp`,
    and keep only tokens whose `is_alpha` flag is set. Stop words are NOT
    removed and tokens are not lemmatized.

    Parameters
    ----------
    text : str
        Raw job description, possibly containing HTML markup.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = re.sub("<[^>]+>", " ", text)  # remove html element tags
    text = re.sub("[ ]+", " ", text)  # remove long spaces
    text = text.lower().strip()  # lower case, no leading/trailing whitespace
    # Only the token text and the lexical `is_alpha` flag are read below, so
    # run the tokenizer alone instead of every pipeline component.
    doc = nlp.make_doc(text)
    return " ".join([word.text for word in doc if word.is_alpha])

# Number of postings (taken from the top of `df`) to preprocess.
N_TEST_SAMPLES = 10

# Iterate over the description column directly instead of df.iterrows():
# no per-row Series is built, the sample size is an explicit constant rather
# than a `break` on the index, and tqdm knows the total up front.
sample_descriptions = df["job_description"].head(N_TEST_SAMPLES)
texts = [
    preprocess_job_desc(description) + "\n"  # one posting per line
    for description in tqdm(sample_descriptions)
]


# with open("../data/interim/jobs.txt", "w") as f:
#     f.writelines(texts)

10it [00:01,  7.90it/s]


In [82]:
# Load the saved model and run it on the last preprocessed description
best_model = spacy.load("../models/output/model-best")
last_description = texts[-1]
doc = best_model(last_description)

In [83]:
# Print each predicted entity next to its label, one per line
for entity in doc.ents:
    surface, label = entity.text, entity.label_
    print(surface, " >>> ", label)

python  >>>  PROGRAMMING LANGUAGES
sql  >>>  PROGRAMMING LANGUAGES
machine learning  >>>  KNOWLEDGE AND CONCEPTS
machine learning  >>>  KNOWLEDGE AND CONCEPTS
cloud platforms  >>>  KNOWLEDGE AND CONCEPTS
machine learning coding skills  >>>  KNOWLEDGE AND CONCEPTS
python  >>>  PROGRAMMING LANGUAGES
machine learning  >>>  KNOWLEDGE AND CONCEPTS
problem solving  >>>  SOFTSKILLS
communication and collaboration skills  >>>  SOFTSKILLS
any  >>>  TOOLS AND TECHNOLOGIES
any  >>>  TOOLS AND TECHNOLOGIES
