# vectorizing job offers

## aim
- cluster job offeres by similarity based on a dictionary of skills

## outline
- preprocess doc2vec with full job offers
- train model
- test similarity of job descriptions
- cluster offers (Kmeans, KNN)

## outcome
unicorns in a meadow

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
import joblib
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


%matplotlib inline

In [2]:
df = joblib.load('../../../raw_data/indeed/ip_2021-03-03.joblib')

In [3]:
df.shape

(954, 11)

In [4]:
df.head(3)

Unnamed: 0,job_title,job_text,company,location,job_info,query_text,tag_language,job_info_tokenized,job_text_tokenized,job_text_tokenized_titlecase,job_title_tokenized
0,Data Scientist / Matching Engineer (m/w/d),You are responsible for improvement of Taledo’...,Taledo,Berlin,Data Scientist / Matching Engineer (m/w/d)\nTa...,data engineer,en,"[data, scientist, matching, engineer, mwd, tal...","[you, are, responsible, for, improvement, of, ...","[You, are, responsible, for, improvement, of, ...","[data, scientist, matching, engineer, mwd]"
1,(Junior) Data Engineer (f/m/x),Über die Stelle\nUnser Data Team braucht Unter...,Customlytics GmbH,Berlin,(Junior) Data Engineer (f/m/x)\nCustomlytics G...,data engineer,de,"[junior, data, engineer, fmx, customlytics, gm...","[über, die, stelle, unser, data, team, braucht...","[Über, die, Stelle, Unser, Data, Team, braucht...","[junior, data, engineer, fmx]"
2,Junior Data Scientist (m/w/d) docmetric,Kennziffer:\nreq6004\nStandort:\nBerlin\nJob S...,CompuGroup Medical,Berlin,Junior Data Scientist (m/w/d) docmetric\nCompu...,data engineer,de,"[junior, data, scientist, mwd, docmetric, comp...","[kennziffer, req, standort, berlin, job, segme...","[Kennziffer, req, Standort, Berlin, Job, Segme...","[junior, data, scientist, mwd, docmetric]"


In [5]:
# select english jobs
df_eng = df.copy()
df_eng = df_eng[df_eng['tag_language'] == 'en']
df_eng.reset_index(inplace=True)
df_eng.drop(columns='index', inplace=True)

In [6]:
df_eng.head()

Unnamed: 0,job_title,job_text,company,location,job_info,query_text,tag_language,job_info_tokenized,job_text_tokenized,job_text_tokenized_titlecase,job_title_tokenized
0,Data Scientist / Matching Engineer (m/w/d),You are responsible for improvement of Taledo’...,Taledo,Berlin,Data Scientist / Matching Engineer (m/w/d)\nTa...,data engineer,en,"[data, scientist, matching, engineer, mwd, tal...","[you, are, responsible, for, improvement, of, ...","[You, are, responsible, for, improvement, of, ...","[data, scientist, matching, engineer, mwd]"
1,Senior Software Engineer - Data Platform,We are looking for a Senior Software Engineer ...,Zalando SE,Berlin,Senior Software Engineer - Data Platform\nZala...,data engineer,en,"[senior, software, engineer, data, platform, z...","[we, are, looking, for, a, senior, software, e...","[We, are, looking, for, a, Senior, Software, E...","[senior, software, engineer, data, platform]"
2,Senior Software Engineer - Data Platform,We are looking for a Senior Software Engineer ...,Zalando,Berlin,Senior Software Engineer - Data Platform\nZala...,data engineer,en,"[senior, software, engineer, data, platform, z...","[we, are, looking, for, a, senior, software, e...","[We, are, looking, for, a, Senior, Software, E...","[senior, software, engineer, data, platform]"
3,Senior Data Engineer (m/w/t),"As a member of the Data Engineering Team, you ...",Quandoo GmbH,Berlin,Senior Data Engineer (m/w/t)\nQuandoo GmbH17 B...,data engineer,en,"[senior, data, engineer, mwt, quandoo, gmbh, b...","[as, a, member, of, the, data, engineering, te...","[As, a, member, of, the, Data, Engineering, Te...","[senior, data, engineer, mwt]"
4,Data Engineer (w/m/d),We are digitty.io – an international start-up ...,digitty.io,Berlin,Data Engineer (w/m/d)\ndigitty.io - Berlin,data engineer,en,"[data, engineer, wmd, digittyio, berlin]","[we, are, digittyio, an, international, startu...","[We, are, digittyio, an, international, startu...","[data, engineer, wmd]"


In [7]:
# join strings
def join_strings(text):
    return ' '.join(text)

In [8]:
# lemmatize
def lemmatize_words(word):
    lemmatizer = WordNetLemmatizer()
    lemmatized = lemmatizer.lemmatize(word)

    return lemmatized

In [9]:
# remove stopwords
def remove_stopwords(text):

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    text = [w for w in word_tokens if not w in stop_words] 
  
    return text

#['heute', 'weiter', 'zur', 'bewerbung', 'diesen', 'job', 'melden']

In [10]:
# process text
df_eng['clean'] = df_eng['job_text_tokenized'].apply(join_strings).apply(lemmatize_words).apply(remove_stopwords)

In [11]:
df_eng['clean']

0      [responsible, improvement, taledos, search, ma...
1      [looking, senior, software, engineer, extensiv...
2      [looking, senior, software, engineer, extensiv...
3      [member, data, engineering, team, responsible,...
4      [digittyio, international, startup, headquarte...
                             ...                        
702    [zalando, fulfillment, solutions, team, lookin...
703    [caya, digitizes, small, mediumsized, business...
704    [position, youll, first, point, contact, comes...
705    [position, youll, first, point, contact, comes...
706    [digitize, logistics, industry, bn, market, eu...
Name: clean, Length: 707, dtype: object

## model d2v - whole text

In [12]:
# tag texts

texts = df_eng['clean']
texts_tagged = [TaggedDocument(text, tags=['tag_'+str(tag)]) for tag, text in enumerate(texts)]
texts_tagged[0]

TaggedDocument(words=['responsible', 'improvement', 'taledos', 'search', 'matching', 'engine', 'candidates', 'jobs', 'business', 'drivers', 'data', 'science', 'develop', 'compare', 'different', 'algorithmic', 'approaches', 'andor', 'ml', 'models', 'monitor', 'production', 'performance', 'measure', 'success', 'work', 'update', 'outdated', 'models', 'research', 'discuss', 'algorithmical', 'well', 'model', 'improvements', 'regularly', 'knowledgeable', 'developed', 'ai', 'community', 'propose', 'whats', 'applicable', 'taledo', 'expect', 'curious', 'nature', 'like', 'solve', 'challenging', 'problems', 'proficient', 'python', 'worked', 'relevant', 'libraries', 'know', 'use', 'data', 'handling', 'numpy', 'pandas', 'dask', 'psycopg', 'mldl', 'scikitlearn', 'xgboost', 'keras', 'pytorch', 'spacy', 'visualization', 'seaborn', 'matplotlib', 'experience', 'evaluating', 'different', 'approaches', 'choosing', 'appropriate', 'metric', 'worked', 'search', 'matching', 'nlp', 'using', 'various', 'approac

In [13]:
len(texts_tagged)

707

In [14]:
# build vocabulary with CBOW (dm=0) - ie build model
model_dbow = Doc2Vec(documents=texts_tagged,
                     dm=0,
                     alpha=0.025,
                     vector_size=len(texts_tagged), 
                     min_count=0)

In [15]:
model_dbow

<gensim.models.doc2vec.Doc2Vec at 0x1088540d0>

In [None]:
for epoch in range(10):
    if epoch % 2 == 0:
        print(f'training epoch {epoch}')
    model.tr

In [None]:

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha


In [None]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])