# vectorizing job offers

## aim
- cluster job offeres by similarity based on a dictionary of skills

## outline
- preprocess doc2vec with full job offers
- train model
- test similarity of job descriptions
- cluster offers (Kmeans, KNN)

## outcome
unicorns in a meadow

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
import joblib
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import math
import multiprocessing
import gensim.models.doc2vec
import time

%matplotlib inline

In [14]:
df = joblib.load('../../../raw_data/processed_data.joblib')

In [15]:
df.shape

(7859, 14)

In [16]:
df['tag_language'] = df['tag_language'].fillna(value='en')

In [17]:
df.head(3)

Unnamed: 0,job_title,job_text,company,location,job_info,query_text,source,job_link,tag_language,reviews,job_info_tokenized,job_text_tokenized,job_text_tokenized_titlecase,job_title_tokenized
0,(Junior) Data Engineer (f/m/x),Customlytics ist die führende App Marketing Be...,Customlytics GmbH,Berlin,(Junior) Data Engineer (f/m/x)\nCustomlytics G...,data science,scrape_json,,en,,"[junior, data, engineer, fmx, customlytics, gm...","[customlytics, ist, die, führende, app, market...","[Customlytics, ist, die, führende, App, Market...","[junior, data, engineer, fmx]"
1,,Responsibilities\n\nAs working student (m/f/x)...,Aroundhome,Berlin,Aroundhome6 Bewertungen - Berlin,data science,scrape_json,,en,,"[aroundhome, bewertungen, berlin]","[responsibilities, as, working, student, mfx, ...","[Responsibilities, As, working, student, mfx, ...",[]
2,,Aufgaben\nAls Werkstudent (m/w/d) IT arbeitest...,Aroundhome,Berlin,"Aroundhome6 Bewertungen - Berlin\nTeilzeit, Pr...",data science,scrape_json,,de,,"[aroundhome, bewertungen, berlin, teilzeit, pr...","[aufgaben, als, werkstudent, mwd, it, arbeites...","[Aufgaben, Als, Werkstudent, mwd, IT, arbeites...",[]


In [18]:
# select english jobs
df_eng = df.copy()
df_eng = df_eng[df_eng['tag_language'] == 'en']
df_eng.reset_index(inplace=True)
df_eng.drop(columns='index', inplace=True)

In [19]:
df_eng.head()

Unnamed: 0,job_title,job_text,company,location,job_info,query_text,source,job_link,tag_language,reviews,job_info_tokenized,job_text_tokenized,job_text_tokenized_titlecase,job_title_tokenized
0,(Junior) Data Engineer (f/m/x),Customlytics ist die führende App Marketing Be...,Customlytics GmbH,Berlin,(Junior) Data Engineer (f/m/x)\nCustomlytics G...,data science,scrape_json,,en,,"[junior, data, engineer, fmx, customlytics, gm...","[customlytics, ist, die, führende, app, market...","[Customlytics, ist, die, führende, App, Market...","[junior, data, engineer, fmx]"
1,,Responsibilities\n\nAs working student (m/f/x)...,Aroundhome,Berlin,Aroundhome6 Bewertungen - Berlin,data science,scrape_json,,en,,"[aroundhome, bewertungen, berlin]","[responsibilities, as, working, student, mfx, ...","[Responsibilities, As, working, student, mfx, ...",[]
2,Full Stack Developer (m/f/d),We’re Phiture: a leading mobile growth consult...,Phiture,BerlinKreuzberg,Full Stack Developer (m/f/d)\nPhiture - Berlin...,data science,scrape_json,,en,,"[full, stack, developer, mfd, phiture, berlink...","[were, phiture, a, leading, mobile, growth, co...","[Were, Phiture, a, leading, mobile, growth, co...","[full, stack, developer, mfd]"
3,,"We are 18,000+ employees strong, operating in ...",PRA Health Sciences,Berlin,PRA Health Sciences - Berlin,data science,scrape_json,,en,,"[pra, health, sciences, berlin]","[we, are, employees, strong, operating, in, mo...","[We, are, employees, strong, operating, in, mo...",[]
4,Head of Finance,Head of Finance (m/f/d)\nAt Home our mission i...,Home HT GmbH,Berlin,Head of Finance\nHome HT GmbH2 Bewertungen - B...,data science,scrape_json,,en,,"[head, of, finance, home, ht, gmbh, bewertunge...","[head, of, finance, mfd, at, home, our, missio...","[Head, of, Finance, mfd, At, Home, our, missio...","[head, of, finance]"


In [20]:
df_eng.shape

(7547, 14)

In [21]:
# join strings
def join_strings(text):
    return ' '.join(text)

In [22]:
# lemmatize
def lemmatize_words(word):
    lemmatizer = WordNetLemmatizer()
    lemmatized = lemmatizer.lemmatize(word)

    return lemmatized

In [23]:
# remove stopwords
def remove_stopwords(text):

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    text = [w for w in word_tokens if not w in stop_words] 
  
    return text

#['heute', 'weiter', 'zur', 'bewerbung', 'diesen', 'job', 'melden']

In [24]:
# process text
df_eng['clean'] = df_eng['job_text_tokenized'].apply(join_strings).apply(lemmatize_words)\
    .apply(remove_stopwords)

## model doc2vec 1

Conclusions :)
- ~ 700 offers - 100 epocs
    - model performs ok, but tends to cluster according to company
    - texts with very high similarity (> 0.90) are likely to be duplicated job adds
    - looks like the model first shows offers based on duplicates, then company, then position (probably because of semantics)


- 2500 offers - 150 epocs
    - still clusters by company
    - add more data? or try bigrams

In [25]:
# tag texts
texts = df_eng['clean']
texts_tagged = [TaggedDocument(text, tags=['tag_'+str(tag)]) for tag, text in enumerate(texts)]
texts_tagged[0]

TaggedDocument(words=['customlytics', 'ist', 'die', 'führende', 'app', 'marketing', 'beratungsagentur', 'aus', 'berlin', 'wir', 'bieten', 'consulting', 'und', 'handson', 'support', 'rund', 'um', 'app', 'marketing', 'strategie', 'produktmanagement', 'analytics', 'crm', 'unser', 'team', 'erarbeitet', 'mit', 'unternehmen', 'jeder', 'größe', 'konzepte', 'zur', 'erfolgreichen', 'vermarktung', 'von', 'mobilen', 'apps', 'dabei', 'decken', 'wir', 'nicht', 'nur', 'das', 'gesamte', 'spektrum', 'infrastruktureller', 'marketingthemen', 'ab', 'wir', 'konzipieren', 'planen', 'und', 'steuern', 'sowohl', 'das', 'ui', 'ux', 'design', 'von', 'mobilen', 'apps', 'als', 'auch', 'performance', 'marketing', 'kampagnen', 'für', 'alle', 'app', 'verticals', 'über', 'uns', 'unser', 'data', 'team', 'braucht', 'unterstützung', 'du', 'bist', 'motiviert', 'und', 'von', 'der', 'mobile', 'industry', 'begeistert', 'dann', 'suchen', 'wir', 'dich', 'um', 'die', 'data', 'warehouselösungen', 'für', 'unsere', 'kunden', 'aus

In [26]:
# reduced dataset
texts_tagged_small = texts_tagged[:3000]

In [27]:
data_to_train = texts_tagged_small # texts_tagged_small, texts_tagged

# build vocabulary with CBOW (dm=0)
cores = multiprocessing.cpu_count()
model_dbow = Doc2Vec(documents=data_to_train,
                     dm=0,
                     alpha=0.025,
                     vector_size=len(data_to_train), 
                     min_count=1,
                     workers=cores)

# train the model
model_dbow.train(data_to_train, total_examples=model_dbow.corpus_count, epochs=15)

In [28]:
model_dbow.save('../../../models/doc2vec_3000_15_epochs')
#joblib.dump(model_dbow, filename='../../../models/doc2vec_3000_20_epochs.joblib' )

In [None]:
model_dbow.corpus_count

### test the model by hand

In [36]:
# load model
model_loaded = Doc2Vec.load('../../../models/doc2vec_3000_20_epochs')

In [37]:
def similar_jobs(tokenized_job, offers):
    ''' input: tokenized job offers, number of offers 
        returns tags of top x most similar job offers and similarity probabilities
    '''

    # infer vector from text 
    infer_vector = model_loaded.infer_vector(tokenized_job)
    # find similar offers
    similar_documents = model_loaded.docvecs.most_similar([infer_vector], topn = offers)

    return similar_documents

In [38]:
def print_top_jobs(text_index, offers=5):
    
    """ input: index of text in dataframe and number of offers we want to see
        prints text of the offers
    """
    
    tags = similar_jobs(texts[text_index], offers)
    tags = [list(i) for i in tags]
    
    print(f"{tags}\n")
    print(f"{df_eng['job_title'][text_index], df_eng['company'][text_index], df_eng['job_text'][text_index]} \
        \n-------------END------------\n ")
    
    for tag in tags: 
        num = int(tag[0].strip('tag_'))
        
        print(f"{df_eng['job_title'][num], df_eng['company'][num], df_eng['job_text'][num]} \
        \n-------------END------------\n ") 
    

In [39]:
print_top_jobs(400, 10) # duplicates 3050; 2020; 400
#print_top_jobs(5000) 

[['tag_400', 0.9450560808181763], ['tag_816', 0.7360695600509644], ['tag_385', 0.7156456708908081], ['tag_337', 0.4847617447376251], ['tag_476', 0.4800282120704651], ['tag_66', 0.4686710834503174], ['tag_452', 0.46127212047576904], ['tag_592', 0.45939743518829346], ['tag_149', 0.45668190717697144], ['tag_378', 0.45607903599739075]]

('Senior Data Engineer - Lending Tribe', 'SumUp', "The mission of SumUp's Lending Tribe is to provide our small merchants, who are often turned away by financial institutions, with access to capital to grow their businesses.\nAs a Senior Data Engineer in this green-field tribe, you'll work with a cross-functional team to bring transparent, affordable and seamless lending products to millions of small merchants across our 30+ markets. You'll play a leading role in architecting and developing the lending risk platform to provide access to working capital to all of our small merchants while systematically mitigating business and fraud risks for SumUp's busines

## Improve the model

- try bigrams instead ot unigrams

In [None]:
texts_small = df_eng['clean'][:2500]

In [None]:
texts_small.head()

In [None]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import models

In [None]:
bigram = Phrases(texts_small, min_count=1, threshold=2, delimiter=b' ')

bigram_phraser = Phraser(bigram)


In [None]:
bigram_token = []
for sent in texts_small:
    bigram_token.append(bigram_phraser[sent])
    
bigram_token[:20]