In [155]:
from gensim.models import Word2Vec, LdaMulticore
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models.phrases import Phraser, Phrases

import numpy as np
import pandas as pd
import seaborn as sns

from ast import literal_eval

import matplotlib.pyplot as plt
%matplotlib inline

In [115]:
ls data

[0m[01;31mjob_ofer.csv.tar.gz[0m  job_offer.csv  reviews_data.txt  [01;31mreviews_data.txt.gz[0m


In [116]:
# Load the job-offer table from the CSV file in the data directory.
df = pd.read_csv('data/job_offer.csv')

In [117]:
# Overview of the frame: number of rows, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36109 entries, 0 to 36108
Data columns (total 8 columns):
title              36109 non-null object
company_name       33925 non-null object
address            36109 non-null object
description        36109 non-null object
seniority_level    36109 non-null object
employment_type    36109 non-null object
job_function       36099 non-null object
industries         36095 non-null object
dtypes: object(8)
memory usage: 2.2+ MB


In [118]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,title,company_name,address,description,seniority_level,employment_type,job_function,industries
0,Machine Learning Engineer,Intellipro Group Inc,"Palo Alto, CA, US","['About The Company', ""W*** is reshaping the f...",Entry level,Full-time,Engineering,Information Technology and Services
1,Deep Learning Applied Researcher - Chicago,Ethosia,"Chicago, IL, US","['תיאור המשרה', 'Deep learning for Computer Vi...",Associate,Full-time,Other,Information Technology and Services
2,Machine Learning Engineer,Motorola Solutions,"Chicago, IL, US","['Company Overview', 'At Motorola Solutions, w...",Entry level,Full-time,Engineering,Information Technology and Services
3,Machine Learning / Data Scientist,Proprius LLC,"San Francisco, CA, US",['Our client is a digital invention agency foc...,Entry level,Full-time,Engineering,Information Technology and Services
4,Cloud Architect,TCS,"Framingham, Massachusetts, United States","['Technical/Functional Skills', ' ', 'Good to ...",Mid-Senior level,Full-time,Engineering,Information Technology and Services


In [119]:
# Number of rows per seniority level, most frequent first.
df['seniority_level'].value_counts()

Entry level         14232
Associate            9929
Mid-Senior level     4162
Not Applicable       3878
Director             2050
Internship           1169
Executive             689
Name: seniority_level, dtype: int64

In [120]:
# Example: fit a phrase (bigram) detector on a tiny hand-made corpus.
# Each string below is one sentence; splitting on spaces gives its tokens.
corpus = [
    sentence.split()
    for sentence in ('a b x', 'a b x', 'a b c w', 'q b e u k')
]
bigram = Phraser(Phrases(sentences=corpus, threshold=1, min_count=2))

In [121]:
# Apply the toy phrase model to a new token list; the output shows which
# adjacent pairs get merged into a single `x_y` token.
bigram[['k', 'a', 'b', 'c', 'b', 'c']]

['k', 'a_b', 'c', 'b', 'c']

## Word2Vec on job titles

In [122]:
# Baseline: train Word2Vec directly on the tokenized job titles (single words
# only) and look at the nearest neighbours of 'machine'.
# The model is stored under its own name rather than `title_bigram`, because
# the notebook later binds `title_bigram` to a phrase detector (Phraser),
# which is a different kind of object.
title_corpus = df.title.map(simple_preprocess)
title_w2v = Word2Vec(title_corpus, size=100, window=2, min_count=1)
title_w2v.wv.most_similar('machine')

[('deep', 0.9618822336196899),
 ('big', 0.85960853099823),
 ('scientists', 0.8447020053863525),
 ('scientist', 0.8414903283119202),
 ('computer', 0.838074803352356),
 ('inference', 0.8344436287879944),
 ('nlp', 0.8316723108291626),
 ('natural', 0.8311825394630432),
 ('three', 0.8306556940078735),
 ('alpharetta', 0.8291459083557129)]

## Title + phrases

In [123]:
# Tokenize the job titles and fit a phrase (bigram) detector on them.
# min_count=1 / threshold=1 are deliberately permissive settings.
title_corpus = df['title'].map(simple_preprocess)
title_bigram = Phraser(Phrases(sentences=title_corpus, threshold=1, min_count=1))

In [124]:
# Sanity-check the title phrase model on a single tokenized title.
title_bigram[simple_preprocess('Excellent Teaching Opportunity In China')]

['excellent_teaching', 'opportunity_in', 'china']

In [125]:
# Replace every tokenized title with its phrase-merged version, then train
# Word2Vec on that phrase-only corpus.
title_corpus_phrase = [title_bigram[tokens] for tokens in title_corpus]
model = Word2Vec(sentences=title_corpus_phrase, min_count=1, window=2, size=100)

In [126]:
# Nearest neighbours of 'machine' in the model trained on phrase-merged titles.
model.wv.most_similar('machine')

[('high_school', 0.9452986121177673),
 ('victoria_airport', 0.9441039562225342),
 ('retail_become', 0.9433714151382446),
 ('retail_fragrance', 0.9433557987213135),
 ('of_our', 0.9430291652679443),
 ('army_sharp', 0.9429834485054016),
 ('positions_sign', 0.9428299069404602),
 ('invest_in', 0.9426500201225281),
 ('or_ts', 0.9424646496772766),
 ('snp_lake', 0.9422569870948792)]

In [129]:
def prepare_corpus(corpus, bigram):
    """Lazily build an extended corpus of phrase tokens plus raw tokens.

    For every sentence in ``corpus`` this yields ``bigram[sentence] + sentence``,
    i.e. the phrase-merged tokens followed by the original tokens, so that both
    the merged phrases and the single words appear in the result.

    Parameters
    ----------
    corpus : iterable
        Tokenized sentences (lists of tokens in this notebook's usage).
    bigram : object supporting ``bigram[sentence]``
        Phrase transformer applied to each sentence (a Phraser in this notebook).

    Yields
    ------
    The concatenation ``bigram[sentence] + sentence`` for each sentence, in order.
    """
    yield from (bigram[tokens] + tokens for tokens in corpus)

In [109]:
# Illustration: tokenize a title fragment (output is lower-cased, punctuation dropped).
simple_preprocess('Deep Learning Applied - ')

['deep', 'learning', 'applied']

In [110]:
# Illustration: run the title phrase model on those tokens.
title_bigram[['deep', 'learning', 'applied']]

['deep_learning', 'applied']

In [113]:
# Illustration: concatenating the raw tokens with the phrase-merged tokens keeps
# both the single words and the phrases in one sentence. prepare_corpus uses the
# same idea, with the phrase-merged tokens placed first.
['deep', 'learning', 'applied'] + ['deep_learning', 'applied']

['deep', 'learning', 'applied', 'deep_learning', 'applied']

In [130]:
# Materialize the extended title corpus (phrase tokens + raw tokens per title)
# and train Word2Vec on it.
extended_corpus = [*prepare_corpus(title_corpus, title_bigram)]
title_model = Word2Vec(sentences=extended_corpus, min_count=1, window=2, size=100)

In [131]:
# Nearest neighbours of 'machine' in the model trained on the extended title corpus.
title_model.wv.most_similar('machine')

[('machine_learning', 0.9450997114181519),
 ('learning_product', 0.9447668790817261),
 ('bmvc', 0.9423291683197021),
 ('deep', 0.939751923084259),
 ('learning', 0.9390493035316467),
 ('big', 0.9347665309906006),
 ('computer_vision', 0.9310387372970581),
 ('rockville', 0.9265317916870117),
 ('and_assemblers', 0.9256027936935425),
 ('nlp', 0.9251259565353394)]

In [133]:
# Nearest neighbours of the phrase token 'machine_learning' in the same model.
title_model.wv.most_similar('machine_learning')

[('rockville', 0.9832637310028076),
 ('computer_vision', 0.9816802144050598),
 ('learning', 0.9789376258850098),
 ('deep_learning', 0.976489245891571),
 ('ml', 0.9757351875305176),
 ('nlp', 0.9742149114608765),
 ('bmvc', 0.9740982055664062),
 ('big', 0.9688085317611694),
 ('nlp_engineer', 0.9677655100822449),
 ('computer', 0.9642936587333679)]

In [150]:
# Disabled scratch check: tokenize one randomly sampled description.
# simple_preprocess(df.sample()['description'].values[0])

## Description

In [151]:
# Tokenize the job descriptions and fit a phrase (bigram) detector on them.
# NOTE(review): df.head() shows each description as the string form of a Python
# list ("['About The Company', ...]"), so that raw string is what gets tokenized
# here; consider parsing it first (literal_eval is imported but unused) — confirm.
desc_corpus = df['description'].map(simple_preprocess)
desc_bigram = Phraser(Phrases(sentences=desc_corpus, threshold=1, min_count=1))

In [153]:
# Materialize the extended description corpus (phrase tokens + raw tokens per
# description) and train Word2Vec on it.
ext_corpus = [*prepare_corpus(desc_corpus, desc_bigram)]
desc_model = Word2Vec(sentences=ext_corpus, min_count=1, window=2, size=100)

In [161]:
# Nearest neighbours of 'keras' in the description model trained on the extended corpus.
desc_model.wv.most_similar('keras')

[('tensorflow', 0.945772647857666),
 ('caffe', 0.9426823854446411),
 ('pytorch', 0.9323517084121704),
 ('scipy', 0.9283360242843628),
 ('numpy', 0.9280869364738464),
 ('theano', 0.9229416847229004),
 ('jupyter', 0.908257782459259),
 ('spacy', 0.8976615071296692),
 ('pandas', 0.8933025002479553),
 ('mxnet', 0.8901236653327942)]

In [157]:
# Same 'machine' query against title_model as run earlier in the notebook
# (repeated here; the output is identical).
title_model.wv.most_similar('machine')

[('machine_learning', 0.9450997114181519),
 ('learning_product', 0.9447668790817261),
 ('bmvc', 0.9423291683197021),
 ('deep', 0.939751923084259),
 ('learning', 0.9390493035316467),
 ('big', 0.9347665309906006),
 ('computer_vision', 0.9310387372970581),
 ('rockville', 0.9265317916870117),
 ('and_assemblers', 0.9256027936935425),
 ('nlp', 0.9251259565353394)]

In [158]:
# Comparison model: Word2Vec trained on the plain tokenized descriptions,
# without any phrase tokens added.
desc_model_no_bigrams = Word2Vec(
    sentences=desc_corpus,
    min_count=1,
    window=2,
    size=100,
)

In [160]:
# Same 'keras' query against the model trained without phrase tokens, for comparison.
desc_model_no_bigrams.wv.most_similar('keras')

[('pytorch', 0.9421138763427734),
 ('caffe', 0.936102032661438),
 ('tensorflow', 0.9353744983673096),
 ('theano', 0.9265309572219849),
 ('scipy', 0.9170170426368713),
 ('numpy', 0.9061530828475952),
 ('mxnet', 0.8872060775756836),
 ('matplotlib', 0.8789718747138977),
 ('sklearn', 0.8684313893318176),
 ('nltk', 0.8644103407859802)]