# Preprocess, Vectorize (Word2Vec) and Similarity Measure

In [97]:
import pandas as pd
import sqlite3
import spacy
import multiprocessing
from gensim.models import Word2Vec

In [3]:
# Connecto to DB and load Job post into DF
con = sqlite3.connect("../collectors/data.sqlite3")
job_df = pd.read_sql_query("SELECT * from job_post", con)
con.close()

In [4]:
job_df.head()

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist


In [9]:
# Get job titles
job_titles = set(job_df['search_kw'].to_list())

# Get job descriptions
job_descriptions = job_df['description'].to_list()

In [11]:
job_titles, job_descriptions[0][:100]

({'data analyst', 'data engineer', 'data scientist'},
 'Do you want a meaningful role in a company that is making a difference in the world? Do you want to ')

In [24]:
corpus = "".join(job_descriptions)
corpus = corpus.lower()

In [26]:
# Download pretrained english model
try:
    import en_core_web_lg
except:
    !python -m spacy download en_core_web_lg
    import en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 31.8 MB/s 
Installing collected packages: en-core-web-lg
    Running setup.py install for en-core-web-lg ... [?25ldone
[?25hSuccessfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [34]:
job_descriptions = [description.lower() for description in job_descriptions]

In [37]:
nlp = en_core_web_lg.load()
docs = list(nlp.pipe(job_descriptions))

In [84]:
nlp.Defaults.stop_words.add("\n")

# Preprocess the text
def process_text(doc):
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
        if token.lemma_ == '-PRON-':
            continue
        result.append(token.lemma_)
    # return " ".join(result)
    return result

In [82]:
a = process_text(docs[0])
type(docs[0])

spacy.tokens.doc.Doc

In [88]:
# Process every document
for i in range(len(docs)):
    docs[i] = process_text(docs[i])

In [96]:
print("JOB DESCRIPTION AFTER PREPROCESSING:\n")
print(docs[0][:20])

JOB DESCRIPTION AFTER PREPROCESSING:

['want', 'meaningful', 'role', 'company', 'make', 'difference', 'world', 'want', 'involve', 'important', 'environmental', 'resource', 'area', 'today', 'want', 'learn', 'involve', 'develop', 'deploy', 'machine']


# Word2Vec using Gensim

In [127]:
# Dimensionality of the resulting word vectors.
num_features = 10
# Minimum word count threshold.
min_word_count = 1

# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()

In [128]:
model = Word2Vec(
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
)

In [129]:
model.build_vocab(sentences=docs)

In [130]:
print("Word2Vec vocabulary length:", len(model.wv.vocab))

Word2Vec vocabulary length: 14185


In [131]:
model.train(sentences=docs, total_examples=model.corpus_count, epochs=100)

(49533679, 53844200)

In [122]:
model.most_similar("morning")

KeyError: "word 'morning' not in vocabulary"

In [132]:
model['stack']

array([-2.0381882 , -0.53340995, -0.03796148, -4.180321  ,  1.1872729 ,
        7.111414  , -3.855373  , -4.5400825 , -1.9881306 , -3.1982915 ],
      dtype=float32)