# vectorizing job offers

## aim
- cluster job offeres by similarity based on a dictionary of skills

## outline
- preprocess doc2vec with full job offers
- train model
- test similarity of job descriptions
- cluster offers (Kmeans, KNN)

## outcome
unicorns in a meadow

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
import joblib
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from pprint import pprint
import math
import multiprocessing
import gensim.models.doc2vec
import time

%matplotlib inline

In [4]:
df = joblib.load('../../../raw_data/processed_data.joblib')

In [5]:
df.shape

(7859, 14)

In [6]:
df['tag_language'] = df['tag_language'].fillna(value='en')

In [7]:
df.head(3)

Unnamed: 0,job_title,job_text,company,location,job_info,query_text,source,job_link,tag_language,reviews,job_info_tokenized,job_text_tokenized,job_text_tokenized_titlecase,job_title_tokenized
0,(Junior) Data Engineer (f/m/x),Customlytics ist die führende App Marketing Be...,Customlytics GmbH,Berlin,(Junior) Data Engineer (f/m/x)\nCustomlytics G...,data science,scrape_json,,en,,"[junior, data, engineer, fmx, customlytics, gm...","[customlytics, ist, die, führende, app, market...","[Customlytics, ist, die, führende, App, Market...","[junior, data, engineer, fmx]"
1,,Responsibilities\n\nAs working student (m/f/x)...,Aroundhome,Berlin,Aroundhome6 Bewertungen - Berlin,data science,scrape_json,,en,,"[aroundhome, bewertungen, berlin]","[responsibilities, as, working, student, mfx, ...","[Responsibilities, As, working, student, mfx, ...",[]
2,,Aufgaben\nAls Werkstudent (m/w/d) IT arbeitest...,Aroundhome,Berlin,"Aroundhome6 Bewertungen - Berlin\nTeilzeit, Pr...",data science,scrape_json,,de,,"[aroundhome, bewertungen, berlin, teilzeit, pr...","[aufgaben, als, werkstudent, mwd, it, arbeites...","[Aufgaben, Als, Werkstudent, mwd, IT, arbeites...",[]


In [8]:
# select english jobs
df_eng = df.copy()
df_eng = df_eng[df_eng['tag_language'] == 'en']
df_eng.reset_index(inplace=True)
df_eng.drop(columns='index', inplace=True)

In [9]:
df_eng.head()

Unnamed: 0,job_title,job_text,company,location,job_info,query_text,source,job_link,tag_language,reviews,job_info_tokenized,job_text_tokenized,job_text_tokenized_titlecase,job_title_tokenized
0,(Junior) Data Engineer (f/m/x),Customlytics ist die führende App Marketing Be...,Customlytics GmbH,Berlin,(Junior) Data Engineer (f/m/x)\nCustomlytics G...,data science,scrape_json,,en,,"[junior, data, engineer, fmx, customlytics, gm...","[customlytics, ist, die, führende, app, market...","[Customlytics, ist, die, führende, App, Market...","[junior, data, engineer, fmx]"
1,,Responsibilities\n\nAs working student (m/f/x)...,Aroundhome,Berlin,Aroundhome6 Bewertungen - Berlin,data science,scrape_json,,en,,"[aroundhome, bewertungen, berlin]","[responsibilities, as, working, student, mfx, ...","[Responsibilities, As, working, student, mfx, ...",[]
2,Full Stack Developer (m/f/d),We’re Phiture: a leading mobile growth consult...,Phiture,BerlinKreuzberg,Full Stack Developer (m/f/d)\nPhiture - Berlin...,data science,scrape_json,,en,,"[full, stack, developer, mfd, phiture, berlink...","[were, phiture, a, leading, mobile, growth, co...","[Were, Phiture, a, leading, mobile, growth, co...","[full, stack, developer, mfd]"
3,,"We are 18,000+ employees strong, operating in ...",PRA Health Sciences,Berlin,PRA Health Sciences - Berlin,data science,scrape_json,,en,,"[pra, health, sciences, berlin]","[we, are, employees, strong, operating, in, mo...","[We, are, employees, strong, operating, in, mo...",[]
4,Head of Finance,Head of Finance (m/f/d)\nAt Home our mission i...,Home HT GmbH,Berlin,Head of Finance\nHome HT GmbH2 Bewertungen - B...,data science,scrape_json,,en,,"[head, of, finance, home, ht, gmbh, bewertunge...","[head, of, finance, mfd, at, home, our, missio...","[Head, of, Finance, mfd, At, Home, our, missio...","[head, of, finance]"


In [10]:
df_eng.shape

(7547, 14)

In [11]:
# join strings
def join_strings(text):
    return ' '.join(text)

In [12]:
# lemmatize
def lemmatize_words(word):
    lemmatizer = WordNetLemmatizer()
    lemmatized = lemmatizer.lemmatize(word)

    return lemmatized

In [13]:
# remove stopwords
def remove_stopwords(text):

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    text = [w for w in word_tokens if not w in stop_words] 
  
    return text

#['heute', 'weiter', 'zur', 'bewerbung', 'diesen', 'job', 'melden']

In [14]:
# process text
df_eng['clean'] = df_eng['job_text_tokenized'].apply(join_strings).apply(lemmatize_words)\
    .apply(remove_stopwords)

## model doc2vec 1

Conclusions :)
- ~ 700 offers - 100 epocs
    - model performs ok, but tends to cluster according to company
    - texts with very high similarity (> 0.90) are likely to be duplicated job adds
    - looks like the model first shows offers based on duplicates, then company, then position (probably because of semantics)


- 2500 offers - 150 epocs
    - still clusters by company
    - add more data? or try bigrams

In [17]:
# tag texts
texts = df_eng['clean']
texts_tagged = [TaggedDocument(text, tags=['tag_'+str(tag)]) for tag, text in enumerate(texts)]
texts_tagged[0]

TaggedDocument(words=['customlytics', 'ist', 'die', 'führende', 'app', 'marketing', 'beratungsagentur', 'aus', 'berlin', 'wir', 'bieten', 'consulting', 'und', 'handson', 'support', 'rund', 'um', 'app', 'marketing', 'strategie', 'produktmanagement', 'analytics', 'crm', 'unser', 'team', 'erarbeitet', 'mit', 'unternehmen', 'jeder', 'größe', 'konzepte', 'zur', 'erfolgreichen', 'vermarktung', 'von', 'mobilen', 'apps', 'dabei', 'decken', 'wir', 'nicht', 'nur', 'das', 'gesamte', 'spektrum', 'infrastruktureller', 'marketingthemen', 'ab', 'wir', 'konzipieren', 'planen', 'und', 'steuern', 'sowohl', 'das', 'ui', 'ux', 'design', 'von', 'mobilen', 'apps', 'als', 'auch', 'performance', 'marketing', 'kampagnen', 'für', 'alle', 'app', 'verticals', 'über', 'uns', 'unser', 'data', 'team', 'braucht', 'unterstützung', 'du', 'bist', 'motiviert', 'und', 'von', 'der', 'mobile', 'industry', 'begeistert', 'dann', 'suchen', 'wir', 'dich', 'um', 'die', 'data', 'warehouselösungen', 'für', 'unsere', 'kunden', 'aus

In [26]:
# reduced dataset
texts_tagged_small = texts_tagged[:3000]

In [27]:
data_to_train = texts_tagged_small # texts_tagged_small, texts_tagged

# build vocabulary with CBOW (dm=0)
cores = multiprocessing.cpu_count()
model_dbow = Doc2Vec(documents=data_to_train,
                     dm=0,
                     alpha=0.025,
                     vector_size=len(data_to_train), 
                     min_count=1,
                     workers=cores)

# train the model
model_dbow.train(data_to_train, total_examples=model_dbow.corpus_count, epochs=15)

In [28]:
model_dbow.save('../../../models/doc2vec_3000_15_epochs')
#joblib.dump(model_dbow, filename='../../../models/doc2vec_3000_20_epochs.joblib' )

In [None]:
model_dbow.corpus_count

### test the model by hand

In [None]:
"""About the job
Bonial helps you save time, money and paper.

We are digital advertising partner for offline business - we support retailers in their marketing activities and help them find a new audience.Would you like to join and be a part of digitalization of retail?

At Bonial, the Data Team is one of the hubs between Online Marketing, Product, IT, Operations and Sales. It's a unique position as we can explore data from different aspects and bring valuable insights to the company. Our mission is to transform data into the source of truth and thereby enable Bonial employees to make smarter decisions. We turn data into information then into knowledge, help teams understand their capabilities, support teams in shaping the right questions for their challenges and teach them how to use the tools we provide. Our success would be to transform Bonial decision making process from gut feeling into scientific approach.

We're looking for an experienced Data Scientist with a curious analytical mindset and strong product intuition to join our Data team. You will be able to consult different teams on both technical and non-technical applications of data science.

You Will Be Responsible For
Designing and building production-ready predictive models or machine learning algorithms;
Optimizing our personalization algorithms;
Crunching, analyzing and investigating large amount of data to discover trends and patterns;
Investigating new machine learning techniques and tools, educating others on the possible applications of these techniques;
Visualizing your analysis in a way that everyone gets it at first glance;
Collaborating with different teams/departments to proactively suggest improvement points, using your initiative to develop new topics.

What You Should Bring To The Table
A ‘can-do’ attitude with a passion for analytics and the insight it can provide;
At least 2 years of experience as a data scientist or data analyst;
A degree in Mathematics, Statistic or Computer Science; graduate degree in Data Science or other quantitative field is preferred;
Good knowledge of Python, SQL or R;
Experience in practical application of machine learning or predictive analytics;
Ability to convey complex issues to a non-technical audience with a high level of abstraction;
Experience in data visualization.

It Would Be a Plus If You Have
Experience with PySpark or SparkR;
Experience building APIs;
Experience using Git, Docker, Airflow, AWS.
Keep in mind that this is just an ideal requirements’ list; we look for potential!

What We Can Offer You

Diversity - international, multicultural and inclusive community with colleagues from over 40 different countries where you can grow personally and professionally. We do not only embrace diversity - we cherish it.

Development - strong support for your professional development with both company sponsored activities and a dedicated budget you can decide on, also English and German language courses in house.

Sustainability - the opportunity to engage in projects that promote sustainability, environmental development and the impact that we make on our planet.

Modern Office with interior design handcrafted by our creative team: fully stocked Kids' Room for when your kita bails on you; Zen Rooms where employees can pray, relax or simply have some quiet time; fully equipped gym in the office; and Roof terrace for amazing Friday events.

Social Culture that encourages people to start conversations, build relationships and participate together in the community through regular team events in a lounge perfect for socializing.

If you think you could fit the bill, we'd love to hear from you!

You can check out our Bonial people Instagram or read more about Bonial culture here.

About The Company

Bonial is part of Axel Springer SE and is the leading "Drive to Store" platform in Germany and France. Every day we inspire millions of users by connecting them to their favorite shops and brands via our platforms "kaufDA" and "MeinProspekt" in Germany, and “Bonial” in France.

More than 1500 retailers and brands across all sectors rely on our tailor-made, data driven marketing solutions to promote their offers and reach measurable business results with maximized return on investment.

Our more than 300 employees come from all over the world joining together to strengthen their skills and work towards creating innovative digital marketing products for clients and users. We believe in the power of each individual to own their impact and drive meaningful results every day. Our diverse and open culture is the secret sauce to our success and ensures we truly put people at the center of everything we do. Our data-driven decision making empowers our mission and how we work"""

NameError: name 'texts_tagged' is not defined

In [22]:
# load model
model_loaded = Doc2Vec.load('../../../models/doc2vec_3000_20_epochs')

In [44]:
def similar_jobs(tokenized_job, offers):
    ''' input: tokenized job offers, number of offers 
        returns tags of top x most similar job offers and similarity probabilities
    '''

    # infer vector from text 
    infer_vector = model_loaded.infer_vector(tokenized_job)
    # find similar offers
    similar_documents = model_loaded.docvecs.most_similar([infer_vector], topn = offers)

    return similar_documents

In [62]:
def print_top_jobs(text, offers=5):
    
    """ input: index of text in dataframe and number of offers we want to see
        prints text of the offers
    """
    
    tags = similar_jobs(text, offers)
    tags = [list(i) for i in tags]
    
    print(f"{tags}\n")
    print(f"{df_eng['job_title'][text_index], df_eng['company'][text_index], df_eng['job_text'][text_index]} \
        \n-------------END------------\n ")
    
    for tag in tags: 
        num = int(tag[0].strip('tag_'))
        
        print(f"{df_eng['job_title'][num], df_eng['company'][num], df_eng['job_text'][num]} \
        \n-------------END------------\n ") 
    

In [44]:
# print_top_jobs(400, 10) # duplicates 3050; 2020; 400
# #print_top_jobs(5000) 

## improve the model 1
Steps:
- filter out duplicates
- filter out jobs from same company
- get output and see if it's better

In [48]:
## change case to lower
def to_lower(text):
    return text.lower()

## remove numbers from the corpus
def remove_number(text):
    text = ''.join(word for word in text if not word.isdigit())
    
    return text

## remove special puncutation from text
def remove_punctuation(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    return text

In [105]:
offer = """
Agoda is an online travel booking platform for accommodation, flights, and more. We build and deploy cutting edge technology that connects travelers with more than 2.5 million accommodations globally. Based in Asia and part of Booking Holdings, our 4,000+ talents coming from 90+ different nationalities foster a work environment rich in diversity, creativity, and collaboration. We innovate through a culture of experimentation and ownership, enabling our customers to experience the world.

Get To Know Our Team

The Performance Marketing Team of Agoda is a world leader in online marketing. This department is highly data-driven and focused on developing at-scale marketing programs that improve the lifetime value of Agoda customers through measurable marketing programs and channels. The team is a blend of the best analysts, marketing strategists, and data scientists in the world. The marketing leadership at Agoda have deep experience in data science, product, strategy, and other marketing fields and have built an organization that thrives on data, creative ideas, and technology. The Performance Marketing Team also fosters a great learning environment. You will be able to learn and grow by working closely with experts from a variety of backgrounds from all over the world.

In This Role, You’ll Get To
Search: Experiment with text ads, bidding, and campaign structures on Google, Bing, Baidu, Naver, and other search engines. Adapt to new product features and roll out changes from successful tests.
Display: Test, analyze, and optimize campaigns on Facebook, Twitter, Instagram, and others.
Modeling: Analyze the vast amounts of data generated by experiments, develop models we can use for optimization, and build dashboards for account managers.


What You’ll Need To Succeed
Bachelor’s Degree or higher from top university in a quantitative subject (computer science, mathematics, engineering, statistics or science)
Ability to communicate fluently in English
Good numerical reasoning skills
Proficiency in Excel
Intellectual curiosity


It’s Great If You Have
Exposure to one or more data analysis packages or databases, e.g., SAS, R, SPSS, Python, VBA, SQL
Experience in digital marketing
Academic research experience
#STRA #ANLS #MRKT #3 #hongkong #kualalumpur #mumbai #newdelhi #delhi #bangalore #hochiminh #jakarta #manila #hongkong #beijing #toronto #vancouver #berlin #paris #barcelona #london #madrid #bangkok data representation data analysis SQL data analytics analytics python (programming language) data mining data science r (programming language) tableau analytical skills data visualization databases business analysis business intelligence (bi) microsoft sql server machine learning statistics microsoft power bi java finance shopee traveloka google facebook ctrip trip.com makemytrip grab amazon pandas (software) artificial intelligence (ai) information technology capital one accenture upwork deloitte mckinsey bain microsoft uber lyft gojek lazada alibaba shopify expedia skyscanner

Equal Opportunity Employer

At Agoda, we pride ourselves on being a company represented by people of all different backgrounds and orientations. We prioritize attracting diverse talent and cultivating an inclusive environment that encourages collaboration and innovation. Employment at Agoda is based solely on a person’s merit and qualifications. We are committed to providing equal employment opportunity regardless of sex, age, race, color, national origin, religion, marital status, pregnancy, sexual orientation, gender identity, disability, citizenship, veteran or military status, and other legally protected characteristics.

We will keep your application on file in accordance with our privacy policy so that we can consider you for future vacancies. You can always ask to have your details removed from the file by contacting us at [email protected]

To all recruitment agencies: Agoda does not accept third party resumes. Please do not send resumes to our jobs alias, Agoda employees or any other organization location. Agoda is not responsible for any fees related to unsolicited resumes."""

In [106]:
token_offer = to_lower(offer)
token_offer = remove_number(offer)
token_offer = remove_punctuation(offer)
token_offer = lemmatize_words(offer)
token_offer = remove_stopwords(offer)
#offer
token_offer[:10]

['Agoda',
 'online',
 'travel',
 'booking',
 'platform',
 'accommodation',
 ',',
 'flights',
 ',',
 '.']

In [107]:
infer_vector = model_loaded.infer_vector(token_offer)
infer_vector

array([ 0.05938027,  0.1751192 , -0.09508178, ...,  0.01733588,
        0.04081504, -0.0244547 ], dtype=float32)

In [108]:
similar_documents = model_loaded.docvecs.most_similar([infer_vector], topn = 5)

In [103]:
pprint(offer)

('Bonial helps you save time, money and paper.We are digital advertising '
 'partner for offline business - we support retailers in their marketing '
 'activities and help them find a new audience.Would you like to join and be a '
 'part of digitalization of retail?At Bonial, the Data Team is one of the hubs '
 "between Online Marketing, Product, IT, Operations and Sales. It's a unique "
 'position as we can explore data from different aspects and bring valuable '
 'insights to the company. Our mission is to transform data into the source of '
 'truth and thereby enable Bonial employees to make smarter decisions. We turn '
 'data into information then into knowledge, help teams understand their '
 'capabilities, support teams in shaping the right questions for their '
 'challenges and teach them how to use the tools we provide. Our success would '
 'be to transform Bonial decision making process from gut feeling into '
 "scientific approach.We're looking for an experienced Data Scienti

In [109]:
tags = similar_documents
tags = [list(i) for i in tags]

print(f"{tags}\n")
# print(f"{df_eng['job_title'][text_index], df_eng['company'][text_index], df_eng['job_text'][text_index]} \
#     \n-------------END------------\n ")

for tag in tags: 
    num = int(tag[0].strip('tag_'))

    print(f"{df_eng['job_title'][num], df_eng['company'][num], df_eng['job_text'][num]} \
    \n-------------END------------\n ") 


[['tag_239', 0.47019219398498535], ['tag_2536', 0.4570488929748535], ['tag_259', 0.452234148979187], ['tag_520', 0.447274386882782], ['tag_2524', 0.4398900866508484]]

('', 'Salesforce', 'To get the best candidate experience, please consider applying for a maximum of 3 roles within 12 months to ensure you are not duplicating efforts.\nOur Futureforce University Recruiting program is dedicated to attracting, retaining and cultivating talent. Our interns and new graduates work on real projects that affect how our business runs, giving them the opportunity to make a tangible impact on the future of our company. With offices all over the world, our recruits have the chance to collaborate and connect with fellow employees on a global scale. We offer job shadowing, mentorship programs, talent development courses, and much more.\nJob Category\nIntern\nJob Details\nMuleSoft Summer 2021 Intern - Technical Consultant\n\nStart date: 16th June 2021\nDuration: 10 weeks\nLocation: Amsterdam, Brussel

In [None]:
#def filter_duplicates(text):
    

In [46]:
#similar_jobs(400, 10)

## Improve the model - 2

- try bigrams instead ot unigrams

In [None]:
texts_small = df_eng['clean'][:2500]

In [None]:
texts_small.head()

In [None]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import models

In [None]:
bigram = Phrases(texts_small, min_count=1, threshold=2, delimiter=b' ')

bigram_phraser = Phraser(bigram)


In [None]:
bigram_token = []
for sent in texts_small:
    bigram_token.append(bigram_phraser[sent])
    
bigram_token[:20]