In [4]:
from gensim.models import Word2Vec, LdaMulticore, Doc2Vec
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument

from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models.phrases import Phraser, Phrases

import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the job-offer table; the path is relative to the notebook's working directory.
# NOTE(review): the preview shows an `Unnamed: 0` column, which looks like a saved
# index that was written to the CSV — confirm whether it should be kept.
df = pd.read_csv(filepath_or_buffer="data/job_offer.csv")

In [5]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,title,company_name,address,description,seniority_level,employment_type,job_function,industries
0,Machine Learning Engineer,Intellipro Group Inc,"Palo Alto, CA, US","['About The Company', ""W*** is reshaping the f...",Entry level,Full-time,Engineering,Information Technology and Services
1,Deep Learning Applied Researcher - Chicago,Ethosia,"Chicago, IL, US","['תיאור המשרה', 'Deep learning for Computer Vi...",Associate,Full-time,Other,Information Technology and Services
2,Machine Learning Engineer,Motorola Solutions,"Chicago, IL, US","['Company Overview', 'At Motorola Solutions, w...",Entry level,Full-time,Engineering,Information Technology and Services
3,Machine Learning / Data Scientist,Proprius LLC,"San Francisco, CA, US",['Our client is a digital invention agency foc...,Entry level,Full-time,Engineering,Information Technology and Services
4,Cloud Architect,TCS,"Framingham, Massachusetts, United States","['Technical/Functional Skills', ' ', 'Good to ...",Mid-Senior level,Full-time,Engineering,Information Technology and Services


In [6]:
# Tokenise every job title with gensim's simple_preprocess, one token list per row.
title_corpus = df["title"].map(simple_preprocess)

In [10]:
# Inspect the tokens produced for the row labelled 0.
title_corpus.loc[0]

['machine', 'learning', 'engineer']

In [15]:
# Wrap each tokenised title in a TaggedDocument whose single tag is its
# position in the corpus, so document vectors can be looked up by that number.
title_tagged = [
    TaggedDocument(tokens, [doc_id])
    for doc_id, tokens in enumerate(title_corpus)
]

In [30]:
# Build an untrained Doc2Vec model for the job titles and register its vocabulary.
# The context-window keyword is `window`; `window_size` is not a Doc2Vec parameter
# (older gensim releases silently ignore it, newer ones raise a TypeError).
# min_count=1 keeps every token, including ones that occur only once.
title_model = Doc2Vec(vector_size=300, window=5, min_count=1)
title_model.build_vocab(title_tagged)

In [31]:
# Train the title model on the tagged titles for 10 epochs; %time reports the cost.
# total_examples is read from the model's own corpus_count
# (presumably populated by the earlier build_vocab call — confirm).
%time title_model.train(title_tagged, total_examples=title_model.corpus_count, epochs=10)

CPU times: user 21 s, sys: 3.26 s, total: 24.2 s
Wall time: 13.9 s


In [33]:
# Ten documents whose vectors are closest to the document tagged 0.
title_model.docvecs.most_similar(positive=[0], topn=10)

[(7243, 0.8439275622367859),
 (20980, 0.8263928294181824),
 (27487, 0.796842634677887),
 (32357, 0.7925026416778564),
 (26840, 0.7919182777404785),
 (29206, 0.7912325859069824),
 (1245, 0.7801050543785095),
 (366, 0.7791067361831665),
 (34502, 0.7786611914634705),
 (34052, 0.7649329900741577)]

In [34]:
# Title of the query document (row label 0).
df.loc[df.index == 0, "title"]

0    Machine Learning Engineer
Name: title, dtype: object

In [37]:
# Title of the top-ranked neighbour returned by the plain model (row label 7243).
df.loc[df.index == 7243, "title"]

7243    Senior Machine Learning Engineer (Relocate to ...
Name: title, dtype: object

In [39]:
# Title of the last of the ten neighbours returned by the plain model (row label 34052).
df.loc[df.index == 34052, "title"]

34052    Machine Operators and Assemblers
Name: title, dtype: object

## Bigrams: extending title tokens with detected phrases

In [40]:
# Detect bigram phrases over the tokenised titles, then wrap the detector in a
# Phraser for applying it to sentences.
title_bigram = Phraser(
    Phrases(
        sentences=title_corpus,
        min_count=2,
        threshold=1,
    )
)

In [41]:
def prepare_corpus(corpus, bigram):
    """Lazily yield each sentence's phrase-transformed tokens followed by its original tokens.

    Args:
        corpus: iterable of token lists.
        bigram: object indexable with a token list (``bigram[tokens]``) that
            returns a list of tokens.

    Yields:
        ``bigram[tokens] + tokens`` for every token list in ``corpus``, in order.

    NOTE(review): any token that ``bigram`` leaves unchanged appears twice in
    the yielded list (once from each half) — confirm this weighting is intended.
    """
    yield from (bigram[tokens] + tokens for tokens in corpus)

In [42]:
# Materialise the generator so the extended token lists can be iterated more than once.
extended_corpus = [*prepare_corpus(title_corpus, title_bigram)]

In [45]:
# Tag each extended (phrases + original tokens) title with its corpus position,
# then build an untrained Doc2Vec model over that corpus and register its vocabulary.
# The context-window keyword is `window`; `window_size` is not a Doc2Vec parameter
# (older gensim releases silently ignore it, newer ones raise a TypeError).
title_ext_tagged = [TaggedDocument(words=sent, tags=[i]) for i, sent in enumerate(extended_corpus)]
title_model_ext = Doc2Vec(vector_size=300, window=5, min_count=1)
title_model_ext.build_vocab(title_ext_tagged)

In [46]:
# Train the extended-corpus model for 10 epochs; %time reports the cost.
# total_examples is read from the model's own corpus_count
# (presumably populated by the earlier build_vocab call — confirm).
%time title_model_ext.train(title_ext_tagged, total_examples=title_model_ext.corpus_count, epochs=10)

CPU times: user 22 s, sys: 3.15 s, total: 25.1 s
Wall time: 13.3 s


In [48]:
# Title of the query document (row label 0), repeated here for reference.
df.loc[df.index == 0, "title"]

0    Machine Learning Engineer
Name: title, dtype: object

In [47]:
# Ten documents whose vectors are closest to the document tagged 0 in the extended model.
title_model_ext.docvecs.most_similar(positive=[0], topn=10)

[(863, 0.8998875617980957),
 (1315, 0.8756668567657471),
 (1317, 0.872447669506073),
 (1610, 0.8676667809486389),
 (3225, 0.8634226322174072),
 (3018, 0.853679895401001),
 (3389, 0.8476659655570984),
 (1627, 0.84410560131073),
 (1316, 0.8384473323822021),
 (10128, 0.8326234221458435)]

In [51]:
# Title of the top-ranked neighbour returned by the extended model (row label 863).
df.loc[df.index == 863, "title"]

863    Observability - Integrations Engineer (Go)
Name: title, dtype: object

In [52]:
# Title of the second-ranked neighbour returned by the extended model (row label 1315).
df.loc[df.index == 1315, "title"]

1315    Kibana - Visualisations Engineer
Name: title, dtype: object

In [53]:
# Title of the last of the ten neighbours returned by the extended model (row label 10128).
df.loc[df.index == 10128, "title"]

10128    Supplier Engineer, China Integration
Name: title, dtype: object