# Job Similarity - How Similar are two jobs?

### Load libraries

In [74]:
import os
import json
import tqdm
import re
import string
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import scipy
import numpy as np
import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

import warnings
warnings.filterwarnings('ignore')

In [2]:
#nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 

### Read data

In [3]:
df_ori = pd.read_csv('../data/data job posts.csv')
df_ori.head()
print(df_ori.shape)
df = df_ori.drop_duplicates(['jobpost','Title'])
print(df.shape)
print("Removed {0} duplicates (based on jobpost + Title)".format(df_ori.shape[0]-df.shape[0]))

(19001, 24)
(18892, 24)
Removed 109 duplicates (based on jobpost + Title)


In [4]:
df.head()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,Location,JobDescription,JobRequirment,RequiredQual,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\nJOB TITL...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,"Yerevan, Armenia",AMERIA Investment Consulting Company is seekin...,- Supervises financial management and administ...,"To perform this job successfully, an\nindividu...",,"To apply for this position, please submit a\nc...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,"IREX Armenia Main Office; Yerevan, Armenia \nD...",,,- Bachelor's Degree; Master's is preferred;\n-...,,Please submit a cover letter and resume to:\nI...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\nJOB...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\nPOSITION,"Yerevan, Armenia",Public outreach and strengthening of a growing...,- Working with the Country Director to provide...,"- Degree in environmentally related field, or ...",,Please send resume or CV toursula.kazarian@......,,20 January 2004\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\nn...,,2004,1,False
3,Manoff Group\nJOB TITLE: BCC Specialist\nPOSI...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,"Manila, Philippines",The LEAD (Local Enhancement and Development fo...,- Identify gaps in knowledge and overseeing in...,"- Advanced degree in public health, social sci...",,Please send cover letter and resume to Amy\nPe...,,23 January 2004\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\nJOB TITLE: Software D...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,"Yerevan, Armenia",,- Rendering technical assistance to Database M...,- University degree; economical background is ...,,Successful candidates should submit\n- CV; \n-...,,"20 January 2004, 18:00",,,,2004,1,True


### Filter selective columns

In [5]:
df_job_related = df[['Title','JobDescription','RequiredQual','JobRequirment']]

In [6]:
df_job_related.head(10)

Unnamed: 0,Title,JobDescription,RequiredQual,JobRequirment
0,Chief Financial Officer,AMERIA Investment Consulting Company is seekin...,"To perform this job successfully, an\nindividu...",- Supervises financial management and administ...
1,Full-time Community Connections Intern (paid i...,,- Bachelor's Degree; Master's is preferred;\n-...,
2,Country Coordinator,Public outreach and strengthening of a growing...,"- Degree in environmentally related field, or ...",- Working with the Country Director to provide...
3,BCC Specialist,The LEAD (Local Enhancement and Development fo...,"- Advanced degree in public health, social sci...",- Identify gaps in knowledge and overseeing in...
4,Software Developer,,- University degree; economical background is ...,- Rendering technical assistance to Database M...
5,Saleswoman,Saleswoman will sell menswear and accessories.,"- Candidates should be female, 20-30 years old...",
6,Chief Accountant/ Finance Assistant,The Armenian Branch Office of the Open Society...,- University degree in finance/ accounting; \n...,
7,Non-paid part or full time Programmatic Intern,,,
8,Assistant to Managing Director,,- University degree;\n- At least 3 years of ex...,- Coordinating the work of subordinate employe...
9,"Program Assistant (INL), FSN-8; FP-6*",The incumbent assists in coordinating INL-fund...,NOTE: All applicants are instructed to\naddres...,


In [75]:
df_job_related['JobDescription'] = df_job_related['JobDescription'].astype(str)
df_job_related['RequiredQual'] = df_job_related['RequiredQual'].astype(str)
df_job_related['JobRequirment'] = df_job_related['JobRequirment'].astype(str)
df_job_related['Title'] = df_job_related['Title'].astype(str)

In [76]:
df_job_related['JobDescription_token'] = df_job_related['JobDescription'].map(word_tokenize)
df_job_related['RequiredQual_token'] = df_job_related['RequiredQual'].map(word_tokenize)
df_job_related['JobRequirment_token'] = df_job_related['JobRequirment'].map(word_tokenize)

#### Remove punctuations and stopwords from the above tokens

In [77]:
# stopwords and punctuation removal
df_job_related['JobDescription_token'] = df_job_related['JobDescription_token'].apply(lambda x: [item for item in x if item not in stop_words])
df_job_related['JobDescription_token'] = df_job_related['JobDescription_token'].apply(lambda x: [item for item in x if item not in string.punctuation])
df_job_related['RequiredQual_token'] = df_job_related['RequiredQual_token'].apply(lambda x: [item for item in x if item not in stop_words])
df_job_related['RequiredQual_token'] = df_job_related['RequiredQual_token'].apply(lambda x: [item for item in x if item not in string.punctuation])
df_job_related['JobRequirment_token'] = df_job_related['JobRequirment_token'].apply(lambda x: [item for item in x if item not in stop_words])
df_job_related['JobRequirment_token'] = df_job_related['JobRequirment_token'].apply(lambda x: [item for item in x if item not in string.punctuation])

In [78]:
# lowercase
df_job_related['JobDescription_token'] = df_job_related['JobDescription_token'].apply(lambda x: [item.lower() for item in x])
df_job_related['RequiredQual_token'] = df_job_related['RequiredQual_token'].apply(lambda x: [item.lower() for item in x])
df_job_related['JobRequirment_token'] = df_job_related['JobRequirment_token'].apply(lambda x: [item.lower() for item in x])

In [79]:
# stem
df_job_related['JobDescription_token'] = df_job_related['JobDescription_token'].apply(lambda x: [PorterStemmer().stem(item) for item in x])
df_job_related['RequiredQual_token'] = df_job_related['RequiredQual_token'].apply(lambda x: [PorterStemmer().stem(item) for item in x])
df_job_related['JobRequirment_token'] = df_job_related['JobRequirment_token'].apply(lambda x: [PorterStemmer().stem(item) for item in x])

In [80]:
# convert list to string
df_job_related['JobDescription_token_str'] = df_job_related['JobDescription_token'].apply(lambda x: " ".join(x))
df_job_related['RequiredQual_token_str'] = df_job_related['RequiredQual_token'].apply(lambda x: " ".join(x))
df_job_related['JobRequirment_token_str'] = df_job_related['JobRequirment_token'].apply(lambda x: " ".join(x))

In [81]:
df_job_related.head(10)

Unnamed: 0,Title,JobDescription,RequiredQual,JobRequirment,JobDescription_token,RequiredQual_token,JobRequirment_token,JobDescription_token_str,RequiredQual_token_str,JobRequirment_token_str,RequiredQual_keywords
0,Chief Financial Officer,AMERIA Investment Consulting Company is seekin...,"To perform this job successfully, an\nindividu...",- Supervises financial management and administ...,"[ameria, invest, consult, compani, seek, chief...","[to, perform, job, success, individu, must, ab...","[supervis, financi, manag, administr, staff, i...",ameria invest consult compani seek chief finan...,to perform job success individu must abl perfo...,supervis financi manag administr staff includ ...,essential duty accepted accounting principles ...
1,Full-time Community Connections Intern (paid i...,,- Bachelor's Degree; Master's is preferred;\n-...,,[nan],"[bachelor, 's, degre, master, 's, prefer, exce...",[nan],,bachelor 's degre master 's prefer excel skill...,,Excellent skills Armenian languages Armenian t...
2,Country Coordinator,Public outreach and strengthening of a growing...,"- Degree in environmentally related field, or ...",- Working with the Country Director to provide...,"[public, outreach, strengthen, grow, network, ...","[degre, environment, relat, field, 5, year, re...","[work, countri, director, provid, environment,...",public outreach strengthen grow network enviro...,degre environment relat field 5 year relev exp...,work countri director provid environment infor...,related field relevant experience written flue...
3,BCC Specialist,The LEAD (Local Enhancement and Development fo...,"- Advanced degree in public health, social sci...",- Identify gaps in knowledge and overseeing in...,"[the, lead, local, enhanc, develop, health, bc...","[advanc, degre, public, health, social, scienc...","[identifi, gap, knowledg, overse, inform, coll...",the lead local enhanc develop health bcc speci...,advanc degre public health social scienc commu...,identifi gap knowledg overse inform collect fi...,Advanced degree public health social science r...
4,Software Developer,,- University degree; economical background is ...,- Rendering technical assistance to Database M...,[nan],"[univers, degre, econom, background, plu, exce...","[render, technic, assist, databas, manag, syst...",,univers degre econom background plu excel know...,render technic assist databas manag system rea...,economical background Excellent knowledge year...
5,Saleswoman,Saleswoman will sell menswear and accessories.,"- Candidates should be female, 20-30 years old...",,"[saleswoman, sell, menswear, accessori]","[candid, femal, 20-30, year, old, nice-look, e...",[nan],saleswoman sell menswear accessori,candid femal 20-30 year old nice-look exterior...,,looking exterior Excellent communication skill...
6,Chief Accountant/ Finance Assistant,The Armenian Branch Office of the Open Society...,- University degree in finance/ accounting; \n...,,"[the, armenian, branch, offic, open, societi, ...","[univers, degre, finance/, account, one, year,...",[nan],the armenian branch offic open societi institu...,univers degre finance/ account one year minimu...,,year minimum experience year minimum experienc...
7,Non-paid part or full time Programmatic Intern,,,,[nan],[nan],[nan],,,,
8,Assistant to Managing Director,,- University degree;\n- At least 3 years of ex...,- Coordinating the work of subordinate employe...,[nan],"[univers, degre, at, least, 3, year, experi, r...","[coordin, work, subordin, employe, maintain, l...",,univers degre at least 3 year experi relev fie...,coordin work subordin employe maintain liaison...,years of experience relevant field office work...
9,"Program Assistant (INL), FSN-8; FP-6*",The incumbent assists in coordinating INL-fund...,NOTE: All applicants are instructed to\naddres...,,"[the, incumb, assist, coordin, inl-fund, law, ...","[note, all, applic, instruct, address, select,...",[nan],the incumb assist coordin inl-fund law enforc ...,note all applic instruct address select criter...,,selection criterion comprehensive information ...


### TF-IDF based similarity

In [60]:
tfidf_model = TfidfVectorizer()
Title_tfidf = tfidf_model.fit_transform(df_job_related['Title'])
JobDescription_tfidf = tfidf_model.fit_transform(df_job_related['JobDescription_token_str'])
RequiredQual_tfidf = tfidf_model.fit_transform(df_job_related['RequiredQual_token_str'])
JobRequirment_tfidf = tfidf_model.fit_transform(df_job_related['JobRequirment_token_str'])

In [62]:
def find_similar_jobs_tfidf(index, top_n = 5):
    Title_tfidf_cosine = cosine_similarity(Title_tfidf[index:index+1], Title_tfidf).flatten()
    JobDescription_tfidf_cosine = cosine_similarity(JobDescription_tfidf[index:index+1], JobDescription_tfidf).flatten()
    RequiredQual_tfidf_cosine = cosine_similarity(RequiredQual_tfidf[index:index+1], RequiredQual_tfidf).flatten()
    JobRequirment_tfidf_cosine = cosine_similarity(JobRequirment_tfidf[index:index+1], JobRequirment_tfidf).flatten()

    average_cosine = (Title_tfidf_cosine + JobDescription_tfidf_cosine + RequiredQual_tfidf_cosine + JobRequirment_tfidf_cosine) / 4.0
    related_docs_indices = [i for i in average_cosine.argsort()[::-1] if i != index]
    return [(index, average_cosine[index]) for index in related_docs_indices][0:top_n]

In [53]:
df_job_related.iloc[0]

Title                                                 Chief Financial Officer
JobDescription              AMERIA Investment Consulting Company is seekin...
RequiredQual                To perform this job successfully, an\nindividu...
JobRequirment               - Supervises financial management and administ...
JobDescription_token        [ameria, invest, consult, compani, seek, chief...
RequiredQual_token          [to, perform, job, success, individu, must, ab...
JobRequirment_token         [supervis, financi, manag, administr, staff, i...
JobDescription_token_str    ameria invest consult compani seek chief finan...
RequiredQual_token_str      to perform job success individu must abl perfo...
JobRequirment_token_str     supervis financi manag administr staff includ ...
Name: 0, dtype: object

In [64]:
for index, score in find_similar_jobs_tfidf(0):
    print(df_job_related.iloc[index])
    print("SIMILARITY SCORE: ", score)

Title                                                 Chief Financial Officer
JobDescription              River Island is seeking a Chief Financial Offi...
RequiredQual                - Master's degree in Management, Finance or Ec...
JobRequirment               - Develop tools and systems to provide critica...
JobDescription_token        [river, island, seek, chief, financi, offic, r...
RequiredQual_token          [master, 's, degre, manag, financ, econom, at,...
JobRequirment_token         [develop, tool, system, provid, critic, financ...
JobDescription_token_str    river island seek chief financi offic respons ...
RequiredQual_token_str      master 's degre manag financ econom at least 5...
JobRequirment_token_str     develop tool system provid critic financi oper...
Name: 11223, dtype: object
SIMILARITY SCORE:  0.4898332490345752
Title                                                 Chief Financial Officer
JobDescription              Armenian Datacom Company (ADC) is seeking a hi...

In [65]:
df_job_related.iloc[1]

Title                       Full-time Community Connections Intern (paid i...
JobDescription                                                            nan
RequiredQual                - Bachelor's Degree; Master's is preferred;\n-...
JobRequirment                                                             nan
JobDescription_token                                                    [nan]
RequiredQual_token          [bachelor, 's, degre, master, 's, prefer, exce...
JobRequirment_token                                                     [nan]
JobDescription_token_str                                                  nan
RequiredQual_token_str      bachelor 's degre master 's prefer excel skill...
JobRequirment_token_str                                                   nan
Name: 1, dtype: object

In [66]:
for index, score in find_similar_jobs_tfidf(1):
    print(df_job_related.iloc[index])
    print("SIMILARITY SCORE: ", score)

Title                                                 IT Teacher  (full time)
JobDescription                                                            nan
RequiredQual                - Degree in Computer Science, Information Tech...
JobRequirment                                                             nan
JobDescription_token                                                    [nan]
RequiredQual_token          [degre, comput, scienc, inform, technolog, rel...
JobRequirment_token                                                     [nan]
JobDescription_token_str                                                  nan
RequiredQual_token_str      degre comput scienc inform technolog relat dis...
JobRequirment_token_str                                                   nan
Name: 114, dtype: object
SIMILARITY SCORE:  0.6631658107959908
Title                       Non-paid part or full time Administrative Intern
JobDescription                                                           nan
Req

In [67]:
df_job_related.iloc[2]

Title                                                     Country Coordinator
JobDescription              Public outreach and strengthening of a growing...
RequiredQual                - Degree in environmentally related field, or ...
JobRequirment               - Working with the Country Director to provide...
JobDescription_token        [public, outreach, strengthen, grow, network, ...
RequiredQual_token          [degre, environment, relat, field, 5, year, re...
JobRequirment_token         [work, countri, director, provid, environment,...
JobDescription_token_str    public outreach strengthen grow network enviro...
RequiredQual_token_str      degre environment relat field 5 year relev exp...
JobRequirment_token_str     work countri director provid environment infor...
Name: 2, dtype: object

In [68]:
for index, score in find_similar_jobs_tfidf(2):
    print(df_job_related.iloc[index])
    print("SIMILARITY SCORE: ", score)

Title                                           Country Coordinator - Armenia
JobDescription              CENN - Caucasus Environmental NGO Network - is...
RequiredQual                1. Education: University education in environm...
JobRequirment               -  Write first hand articles, conduct intervie...
JobDescription_token        [cenn, caucasu, environment, ngo, network, loo...
RequiredQual_token          [1, educ, univers, educ, environment, scienc, ...
JobRequirment_token         [write, first, hand, articl, conduct, intervie...
JobDescription_token_str    cenn caucasu environment ngo network look coun...
RequiredQual_token_str      1 educ univers educ environment scienc relat f...
JobRequirment_token_str     write first hand articl conduct interview coll...
Name: 404, dtype: object
SIMILARITY SCORE:  0.45858797345562224
Title                                                     Country Coordinator
JobDescription              Veya Limited needs an experienced Business Man...


### Word and Doc Vectors

#### 1. Using online pretrained model
https://spacy.io/models/en#en_core_web_lg

In [69]:
# to download model
#!python -m spacy download en_core_web_lg

nlp = spacy.load('en_core_web_lg')

#### Note:
Similarity is determined by comparing word vectors or "word embeddings", multi-dimensional meaning representations of a word. https://spacy.io/usage/spacy-101#vectors-similarity

Similarity in Spacy is computed by:
    
    return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

This looks like its the formula for computing cosine similarity and the vectors seem to be created with SpaCy's .vector which documentation says is trained from GloVe's w2v model. https://spacy.io/api/token#similarity

<u> From the spacy website: </u>

Models that come with built-in word vectors make them available as the Token.vector  attribute. Doc.vector  and Span.vector  will default to an <b>average</b> of their token vectors.

#### Generate a final similarity score between two documents at a time, by comparing against,
- Titles
- Job Description
- Job Requirements
- Required Qualification

#### Note:

The out of the box pretrained word embedding model gives decent results for Titles.

But for other fields where the sentence vector is obtained by simply averaging word vectors alone, the pretrained word embedding model was not able to  distinguish well for RequiredQual, JobDescription and JobRequirment, regardless of the full string or using tokens after punctuation and stopwords removal.

For most of the job comparisons, it gave high similarity score (>0.8)

In [70]:
# e.g. Original text - Title
doc1 = nlp(df_job_related.iloc[0]['Title'])
doc2 = nlp(df_job_related.iloc[2]['Title'])
similarity = doc1.similarity(doc2)
print("DOC1:\n" + doc1.text, "\n\nDOC2:\n", doc2.text, "\n\n", similarity)

DOC1:
Chief Financial Officer 

DOC2:
 Country Coordinator 

 0.537330015139695


In [45]:
# e.g. Original text - RequiredQual
doc1 = nlp(df_job_related.iloc[0]['RequiredQual'])
doc2 = nlp(df_job_related.iloc[2]['RequiredQual'])
similarity = doc1.similarity(doc2)
print("DOC1:\n" + doc1.text, "\n\nDOC2:\n", doc2.text, "\n\n", similarity)

DOC1:
To perform this job successfully, an
individual must be able to perform each essential duty satisfactorily.
The requirements listed below are representative of the knowledge,
skill, and/or ability required.
Knowledge of:
- Generally accepted accounting principles;
- Local accounting standards and legislation;
- State reporting requirements pertaining to accounting;
- Principles and practices of financial management and budgeting;
- Principles and practices of financial systems design and analysis;
- Principles and practices of contract management, records management,
and risk management;
- Principles and practices of management and supervision;
- Principles and practices of information systems management.
Ability to:
- Apply sound fiscal and administrative practices to the company's
activities;
- Plan, organize and supervise the work of subordinate employees,
including training them, assigning and evaluating their work, and
providing job performance feedback;
- Critically analyze

In [46]:
# E.g. removing stopwords and punctuations
doc1 = nlp(SPACE.join(df_job_related.iloc[0]['RequiredQual_token']))
doc2 = nlp(SPACE.join(df_job_related.iloc[2]['RequiredQual_token']))
similarity = doc1.similarity(doc2)
print("DOC1:\n" + doc1.text, "\n\nDOC2:\n", doc2.text, "\n\n", similarity)

DOC1:
To perform job successfully individual must able perform essential duty satisfactorily The requirements listed representative knowledge skill and/or ability required Knowledge Generally accepted accounting principles Local accounting standards legislation State reporting requirements pertaining accounting Principles practices financial management budgeting Principles practices financial systems design analysis Principles practices contract management records management risk management Principles practices management supervision Principles practices information systems management Ability Apply sound fiscal administrative practices company's activities Plan organize supervise work subordinate employees including training assigning evaluating work providing job performance feedback Critically analyze fiscal administrative policies practices procedures systems recommend implement changes needed Gather synthesize financial information variety sources present variety audiences differin

#### Extracting keywords using POS and Regex Matcher

In [82]:
from spacy.matcher import Matcher
from spacy import displacy

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end]  # matched span
    sent = span.sent  # sentence containing matched span
    # append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{'start': span.start_char - sent.start_char,
                   'end': span.end_char - sent.start_char,
                   'label': 'MATCH'}]
    matched_sents.append({'text': sent.text, 'ents': match_ents })

matcher = Matcher(nlp.vocab)
matched_sents = []

pattern = [{'POS':'ADJ'},{'ORTH': '-'}, {'POS':'NOUN', 'OP':'+'}, {'LOWER': 'organization'}]
matcher.add('company has', collect_sents, pattern)  # add pattern
pattern = [{'POS':'NOUN', 'OP':'+'}, {'LOWER': 'organization'}]
matcher.add('company has', collect_sents, pattern)  # add pattern
pattern = [{'POS':'NOUN', 'OP':'+'}, {'LOWER': 'company'}]
matcher.add('company has', collect_sents, pattern)  # add pattern
pattern = [{'POS':'NOUN', 'OP':'+'}, {'LOWER': 'services'}]
matcher.add('company has', collect_sents, pattern)  # add pattern
pattern = [{'LOWER': 'company'}, {'POS':'VERB'}, {'POS':'NOUN', 'OP':'+'}]
matcher.add('company has', collect_sents, pattern)  # add pattern
pattern = [{'LOWER':'provider'}, {'POS':'ADP'}, {'POS':'NOUN',  'OP': '+'}  , {'POS':'CCONJ','OP':'+'}, {'POS':'NOUN','OP':'+'}]
matcher.add('provider of', collect_sents, pattern)  # add pattern
pattern = [{'LOWER':'provider'}, {'POS':'ADP'}, {'POS':'NOUN',  'OP': '+'}]
matcher.add('provider of', collect_sents, pattern)  # add pattern
pattern = [{'POS':'VERB'}, {'POS':'NOUN',  'OP': '+'} ]
matcher.add('verb_noun', collect_sents, pattern)  # add pattern
pattern = [{'POS':'ADJ'}, {'POS':'NOUN',  'OP': '+'} ]
matcher.add('verb_noun', collect_sents, pattern)  # add pattern
pattern = [{'POS':'NOUN'}, {'POS':'ADP'}, {'POS':'NOUN',  'OP': '+'} ]
matcher.add('verb_noun', collect_sents, pattern)  # add pattern
pattern = [{'POS':'NOUN'}, {'POS':'NOUN','OP':'+'} ]
matcher.add('verb_noun', collect_sents, pattern)  # add pattern

In [83]:
SPACE = " "
for job1_idx in range(df_job_related.shape[0]):
    matched_sents = []
    keywords = []
    text = re.sub(r'\s+', ' ', df_job_related.iloc[job1_idx]['RequiredQual'])
    doc = nlp(text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = doc[start:end]  # the matched span
        keywords.append(span.text)
    df_job_related.set_value(job1_idx, 'RequiredQual_keywords', SPACE.join(keywords))

In [84]:
for job1_idx in range(df_job_related.shape[0]):
    matched_sents = []
    keywords = []
    text = re.sub(r'\s+', ' ', df_job_related.iloc[job1_idx]['JobDescription'])
    doc = nlp(text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = doc[start:end]  # the matched span
        keywords.append(span.text)
    df_job_related.set_value(job1_idx, 'JobDescription_keywords', SPACE.join(keywords))

TypeError: expected string or bytes-like object

In [85]:
for job1_idx in range(df_job_related.shape[0]):
    matched_sents = []
    keywords = []
    text = re.sub(r'\s+', ' ', df_job_related.iloc[job1_idx]['JobRequirment'])
    doc = nlp(text)
    matches = matcher(doc)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # get string representation
        span = doc[start:end]  # the matched span
        keywords.append(span.text)
    df_job_related.set_value(job1_idx, 'JobRequirment_keywords', SPACE.join(keywords))

TypeError: expected string or bytes-like object

In [98]:
# matched keywords highlighted - example for job 0
displacy.render(matched_sents, style='ent', manual=True, jupyter=True)

In [99]:
# keywords from the sentence - example for job 0
keywords

['essential duty',
 'accepted accounting principles',
 'accepted accounting principles',
 'accounting principles',
 'Local accounting standards',
 'Local accounting standards',
 'accounting standards',
 'State reporting requirements',
 'State reporting requirements',
 'reporting requirements',
 'financial management',
 'financial systems design',
 'financial systems design',
 'systems design',
 'practices of contract management',
 'practices of contract management',
 'contract management',
 'records management',
 'risk management',
 'practices of management',
 'practices of information systems',
 'practices of information systems management',
 'information systems management',
 'practices of information systems management',
 'information systems management',
 'systems management',
 'administrative practices',
 'subordinate employees',
 'their work',
 'providing job performance',
 'providing job performance feedback',
 'job performance feedback',
 'providing job performance feedback',
 

In [265]:
df_job_related.head(10)

Unnamed: 0,Title,JobDescription,RequiredQual,JobRequirment,JobDescription_token,RequiredQual_token,JobRequirment_token,RequiredQual_keywords,JobDescription_keywords,JobRequirment_keywords
0,Chief Financial Officer,"AMERIA Investment Consulting Company is seeking a\nChief Financial Officer. This position manages the company's fiscal and\nadministrative functions, provides highly responsible and technically\ncomplex staff assistance to the Executive Director. The work performed\nrequires a high level of technical proficiency in financial management\nand investment management, as well as management, supervisory, and\nadministrative skills.","To perform this job successfully, an\nindividual must be able to perform each essential duty satisfactorily.\nThe requirements listed below are representative of the knowledge,\nskill, and/or ability required.\nKnowledge of:\n- Generally accepted accounting principles;\n- Local accounting standards and legislation;\n- State reporting requirements pertaining to accounting;\n- Principles and practices of financial management and budgeting;\n- Principles and practices of financial systems design and analysis;\n- Principles and practices of contract management, records management,\nand risk management;\n- Principles and practices of management and supervision;\n- Principles and practices of information systems management.\nAbility to:\n- Apply sound fiscal and administrative practices to the company's\nactivities;\n- Plan, organize and supervise the work of subordinate employees,\nincluding training them, assigning and evaluating their work, and\nproviding job performance feedback;\n- ...","- Supervises financial management and administrative staff, including\nassigning responsibilities, reviewing employees' work processes and\nproducts, counseling employees, giving performance evaluations, and\nrecommending disciplinary action;\n- Serves as member of management team participating in both strategic\nand operational planning for the company;\n- Directs and oversees the company's financial management activities,\nincluding establishing and monitoring internal controls, managing cash\nand investments, and managing the investment portfolio in collaboration\nwith the Investment team leader. This includes, but is not limited to,\nevaluation of investment risk, concentration risk, fund deployment\nlevels, adequacy of loss and liquidity reserves Assists investment team\nin development of proper documentation and internal systems;\n- Directs and oversees the annual budgeting process, including\ndeveloping projections for financial planning, and preparing budgets;\n- Prepares e...","[AMERIA, Investment, Consulting, Company, seeking, Chief, Financial, Officer, This, position, manages, company, 's, fiscal, administrative, functions, provides, highly, responsible, technically, complex, staff, assistance, Executive, Director, The, work, performed, requires, high, level, technical, proficiency, financial, management, investment, management, well, management, supervisory, administrative, skills]","[To, perform, job, successfully, individual, must, able, perform, essential, duty, satisfactorily, The, requirements, listed, representative, knowledge, skill, and/or, ability, required, Knowledge, Generally, accepted, accounting, principles, Local, accounting, standards, legislation, State, reporting, requirements, pertaining, accounting, Principles, practices, financial, management, budgeting, Principles, practices, financial, systems, design, analysis, Principles, practices, contract, management, records, management, risk, management, Principles, practices, management, supervision, Principles, practices, information, systems, management, Ability, Apply, sound, fiscal, administrative, practices, company's, activities, Plan, organize, supervise, work, subordinate, employees, including, training, assigning, evaluating, work, providing, job, performance, feedback, Critically, analyze, fiscal, administrative, policies, practices, procedures, systems, recommend, implement, changes, ne...","[Supervises, financial, management, administrative, staff, including, assigning, responsibilities, reviewing, employees, work, processes, products, counseling, employees, giving, performance, evaluations, recommending, disciplinary, action, Serves, member, management, team, participating, strategic, operational, planning, company, Directs, oversees, company, 's, financial, management, activities, including, establishing, monitoring, internal, controls, managing, cash, investments, managing, investment, portfolio, collaboration, Investment, team, leader, This, includes, limited, evaluation, investment, risk, concentration, risk, fund, deployment, levels, adequacy, loss, liquidity, reserves, Assists, investment, team, development, proper, documentation, internal, systems, Directs, oversees, annual, budgeting, process, including, developing, projections, financial, planning, preparing, budgets, Prepares, external, internal, financial, management, reports, audited, financial, statement...",essential duty accepted accounting principles accepted accounting principles accounting principles Local accounting standards Local accounting standards accounting standards State reporting requirements State reporting requirements reporting requirements financial management financial systems design financial systems design systems design practices of contract management practices of contract management contract management records management risk management practices of management practices of information systems practices of information systems management information systems management practices of information systems management information systems management systems management administrative practices subordinate employees their work providing job performance providing job performance feedback job performance feedback providing job performance feedback job performance feedback performance feedback administrative policies implement changes financial information variety of sources ...,administrative functions complex staff assistance complex staff assistance staff assistance high level technical proficiency financial management investment management administrative skills,financial management administrative staff assigning responsibilities reviewing employees work processes counseling employees giving performance evaluations giving performance evaluations performance evaluations disciplinary action member of management team member of management team management team operational planning financial management activities financial management activities management activities internal controls managing cash investment portfolio portfolio in collaboration team leader evaluation of investment risk evaluation of investment risk investment risk concentration risk fund deployment levels fund deployment levels deployment levels adequacy of loss liquidity reserves investment team team in development proper documentation internal systems annual budgeting process annual budgeting process budgeting process developing projections financial planning preparing budgets financial management reports financial management reports management reports financial statements tax...
1,Full-time Community Connections Intern (paid internship),,- Bachelor's Degree; Master's is preferred;\n- Excellent skills in spoken and written English and Armenian languages;\n- Past English to Armenian translation and Armenian to English\ntranslation experience;\n- Good communication and public speaking skills;\n- Ability to work independently and as part of a team.\nREMUNERATION: Commensurate with experience.,,[nan],"[Bachelor, 's, Degree, Master, 's, preferred, Excellent, skills, spoken, written, English, Armenian, languages, Past, English, Armenian, translation, Armenian, English, translation, experience, Good, communication, public, speaking, skills, Ability, work, independently, part, team, REMUNERATION, Commensurate, experience]",[nan],Excellent skills Armenian languages Armenian translation English translation experience English translation experience translation experience Good communication public speaking skills public speaking skills speaking skills Commensurate with experience,,
2,Country Coordinator,"Public outreach and strengthening of a growing\nnetwork of environmental NGOs, businesses, international organizations\nand public agencies. Will serve as primary contact between CENN and\npublic. This is a full-time position.","- Degree in environmentally related field, or 5 years relevant\nexperience;\n- Oral and written fluency in Armenian, Russian and English;\n- Knowledge/ experience of working with environmental issues specific to\nArmenia is a plus.\nREMUNERATION: Salary commensurate with experience.","- Working with the Country Director to provide environmental information\nto the general public via regular electronic communications and serving\nas the primary local contact to Armenian NGOs and businesses and the\nArmenian offices of international organizations and agencies;\n- Helping to organize and prepare CENN seminars/ workshops;\n- Participating in defining the strategy and policy of CENN in Armenia,\nthe Caucasus region and abroad.","[Public, outreach, strengthening, growing, network, environmental, NGOs, businesses, international, organizations, public, agencies, Will, serve, primary, contact, CENN, public, This, full-time, position]","[Degree, environmentally, related, field, 5, years, relevant, experience, Oral, written, fluency, Armenian, Russian, English, Knowledge/, experience, working, environmental, issues, specific, Armenia, plus, REMUNERATION, Salary, commensurate, experience]","[Working, Country, Director, provide, environmental, information, general, public, via, regular, electronic, communications, serving, primary, local, contact, Armenian, NGOs, businesses, Armenian, offices, international, organizations, agencies, Helping, organize, prepare, CENN, seminars/, workshops, Participating, defining, strategy, policy, CENN, Armenia, Caucasus, region, abroad]",related field relevant experience written fluency environmental issues Salary commensurate commensurate with experience,Public outreach growing network international organizations public agencies primary contact time position,environmental information general public electronic communications local contact Armenian offices international organizations
3,BCC Specialist,"The LEAD (Local Enhancement and Development for\nHealth) BCC Specialist will apply state-of-the-art approaches in working\nwith LGUs (Local Government Units) and NGOs to help them to identify and\naddress provider-caused barriers to service provision as well as to\nidentify and address supports for good service delivery by developing\ntools that may be adapted to each LGU's needs. S/he will work with LEAD\nstaff across all components to support quality service delivery and will\nalso monitor implementation of improved service delivery in LGUs, and\nwill provide additional assistance to LGUs and NGOs, as needed. S/he\nwill collect all relevant published and grey literature documents,\nidentify gaps in knowledge, and work with NGOs and consultants to fill\nin the gaps. S/he will establish training for NGOs and LGU\nadministration staff pursuing service enhancement and provider behavior\nchange and will oversee the training. S/he will serve as the Project's\nprimary liaison to TSAP an...","- Advanced degree in public health, social science, or communication or\nrelated experience;\n- Familiarity with Behavior-centered Programming SM preferred;\n- At least five years experience in the design, implementation, and\nmonitoring/evaluation of behavior change programs in the developing\nworld, preferably including Philippines;\n- Experience with behavior change in reproductive health and/or family\nplanning programs;\n- Excellent communication, coordination, and facilitation skills;\n- Experience with PC-based software, including word processing and basic\nspreadsheet analysis;\n- Knowledge of a language of the Philippines, or willingness to learn;\n- High level of written and spoken English fluency.","- Identify gaps in knowledge and overseeing information collection to\nfill them;\n- Consolidate partnerships with center of excellence of technical\nsupport, working with local NGOs/COs, LCEs, and LGUs;\n- Work with LEAD team to develop assessment tools for LGUs to identify\nbarriers and supports to quality service provision;\n- Assist LGUs and NGOs to design and implement multifaceted (if\nnecessary), systematic interventions, based on assessment results, to\nenhance quality service provision, especially through provider behavior\nchange. For example, ensure that not only providers, but their\nsupervisors are trained and that monitoring and supervisory systems\npromote provider change, quality services, and sustainability of change;\n- Assist other LEAD specialists and units to enhance the quality of\nservices, especially through BCC and provider behavior change, and\nassure that programs serve community needs;\n- Oversee and, when practical, co-facilitate behavior change trainin...","[The, LEAD, Local, Enhancement, Development, Health, BCC, Specialist, apply, state-of-the-art, approaches, working, LGUs, Local, Government, Units, NGOs, help, identify, address, provider-caused, barriers, service, provision, well, identify, address, supports, good, service, delivery, developing, tools, may, adapted, LGU, 's, needs, S/he, work, LEAD, staff, across, components, support, quality, service, delivery, also, monitor, implementation, improved, service, delivery, LGUs, provide, additional, assistance, LGUs, NGOs, needed, S/he, collect, relevant, published, grey, literature, documents, identify, gaps, knowledge, work, NGOs, consultants, fill, gaps, S/he, establish, training, NGOs, LGU, administration, staff, pursuing, service, enhancement, provider, behavior, change, oversee, training, S/he, serve, Project's, primary, liaison, TSAP, projects, organizations, working, ...]","[Advanced, degree, public, health, social, science, communication, related, experience, Familiarity, Behavior-centered, Programming, SM, preferred, At, least, five, years, experience, design, implementation, monitoring/evaluation, behavior, change, programs, developing, world, preferably, including, Philippines, Experience, behavior, change, reproductive, health, and/or, family, planning, programs, Excellent, communication, coordination, facilitation, skills, Experience, PC-based, software, including, word, processing, basic, spreadsheet, analysis, Knowledge, language, Philippines, willingness, learn, High, level, written, spoken, English, fluency]","[Identify, gaps, knowledge, overseeing, information, collection, fill, Consolidate, partnerships, center, excellence, technical, support, working, local, NGOs/COs, LCEs, LGUs, Work, LEAD, team, develop, assessment, tools, LGUs, identify, barriers, supports, quality, service, provision, Assist, LGUs, NGOs, design, implement, multifaceted, necessary, systematic, interventions, based, assessment, results, enhance, quality, service, provision, especially, provider, behavior, change, For, example, ensure, providers, supervisors, trained, monitoring, supervisory, systems, promote, provider, change, quality, services, sustainability, change, Assist, LEAD, specialists, units, enhance, quality, services, especially, BCC, provider, behavior, change, assure, programs, serve, community, needs, Oversee, practical, co-facilitate, behavior, change, training, activities, proposal, development, coordinate, activities, needed, project, components, counterpart, agencies, ...]",Advanced degree public health social science related experience Familiarity with Behavior years experience evaluation of behavior change evaluation of behavior change programs behavior change programs evaluation of behavior change programs behavior change programs change programs developing world Experience with behavior change Experience with behavior change behavior change reproductive health family planning programs family planning programs planning programs Excellent communication facilitation skills Experience with PC based software including word processing including word processing word processing basic spreadsheet analysis basic spreadsheet analysis spreadsheet analysis High level English fluency,apply state art approaches address provider caused barriers barriers to service provision barriers to service provision service provision address supports good service delivery good service delivery service delivery developing tools LEAD staff support quality service support quality service delivery quality service delivery support quality service delivery quality service delivery service delivery monitor implementation improved service delivery improved service delivery service delivery delivery in LGUs additional assistance assistance to LGUs grey literature documents grey literature documents literature documents identify gaps gaps in knowledge establish training administration staff pursuing service enhancement pursuing service enhancement service enhancement provider behavior change provider behavior change behavior change primary liaison other projects behavior change,Identify gaps gaps in knowledge overseeing information collection overseeing information collection information collection Consolidate partnerships partnerships with center center of excellence technical support LEAD team develop assessment tools develop assessment tools assessment tools tools for LGUs identify barriers supports to quality service supports to quality service provision quality service provision supports to quality service provision quality service provision service provision Assist LGUs systematic interventions assessment results enhance quality service enhance quality service provision quality service provision enhance quality service provision quality service provision service provision provider behavior change provider behavior change behavior change only providers their supervisors supervisory systems promote provider change promote provider change provider change quality services quality services sustainability of change quality of services provider behavior ch...
4,Software Developer,,"- University degree; economical background is a plus;\n- Excellent knowledge of Windows 2000 Server, Networking TCP/ IP\ntechnologies, MS SQL 2000 Server, Visual Basic 6;\n- At least 2 years of experience in database software development;\n- Good knowledge of English.\nREMUNERATION: Will be commensurate with the norms accepted in the\nCompany.",- Rendering technical assistance to Database Management Systems;\n- Realization of SQL servers maintenance activities: back-up and\nreplication;\n- Participation in designing of software development projects.,[nan],"[University, degree, economical, background, plus, Excellent, knowledge, Windows, 2000, Server, Networking, TCP/, IP, technologies, MS, SQL, 2000, Server, Visual, Basic, 6, At, least, 2, years, experience, database, software, development, Good, knowledge, English, REMUNERATION, Will, commensurate, norms, accepted, Company]","[Rendering, technical, assistance, Database, Management, Systems, Realization, SQL, servers, maintenance, activities, back-up, replication, Participation, designing, software, development, projects]",economical background Excellent knowledge years of experience experience in database software experience in database software development database software development experience in database software development database software development software development Good knowledge,,technical assistance servers maintenance activities servers maintenance activities maintenance activities Participation in designing designing of software development designing of software development projects software development projects designing of software development projects software development projects development projects
5,Saleswoman,Saleswoman will sell menswear and accessories.,"- Candidates should be female, 20-30 years old;\n- Nice-looking exterior; \n- Excellent communication skills;\n- Fluency in Armenian and Russian; \n- Good knowledge of English (oral).\nREMUNERATION: Starting salary - 40,000 AMD.",,"[Saleswoman, sell, menswear, accessories]","[Candidates, female, 20-30, years, old, Nice-looking, exterior, Excellent, communication, skills, Fluency, Armenian, Russian, Good, knowledge, English, oral, REMUNERATION, Starting, salary, 40,000, AMD]",[nan],looking exterior Excellent communication skills Excellent communication skills communication skills Good knowledge Starting salary,sell menswear,
6,Chief Accountant/ Finance Assistant,"The Armenian Branch Office of the Open Society\nInstitute Assistance Foundation is seeking applications for the position\nof Chief Accountant/ Finance Assistant. The Chief Accountant/ Finance\nAssistant will be responsible for all transactions, connected with grant\npayments, administrative expenses.","- University degree in finance/ accounting; \n- One year minimum experience in an international organization; \n- Strong organizational skills; \n- Good knowledge of software programs: MS Excel and MS Access; \n- Good knowledge of IAS, Armenian taxation laws, reporting requirements\nand current reforms; \n- Discretion and ability to handle confidential issues; \n- Self-motivation with an ability to set and meet goals; \n- Quick learning skills; \n- Fluency in English, Armenian and Russian.",,"[The, Armenian, Branch, Office, Open, Society, Institute, Assistance, Foundation, seeking, applications, position, Chief, Accountant/, Finance, Assistant, The, Chief, Accountant/, Finance, Assistant, responsible, transactions, connected, grant, payments, administrative, expenses]","[University, degree, finance/, accounting, One, year, minimum, experience, international, organization, Strong, organizational, skills, Good, knowledge, software, programs, MS, Excel, MS, Access, Good, knowledge, IAS, Armenian, taxation, laws, reporting, requirements, current, reforms, Discretion, ability, handle, confidential, issues, Self-motivation, ability, set, meet, goals, Quick, learning, skills, Fluency, English, Armenian, Russian]",[nan],year minimum experience year minimum experience minimum experience international organization organizational skills Good knowledge knowledge of software programs knowledge of software programs software programs Good knowledge Armenian taxation laws Armenian taxation laws taxation laws reporting requirements current reforms confidential issues meet goals Quick learning skills Quick learning skills learning skills,seeking applications grant payments administrative expenses,
7,Non-paid part or full time Programmatic Intern,,,,[nan],[nan],[nan],,,
8,Assistant to Managing Director,,"- University degree;\n- At least 3 years of experience in the relevant field;\n- Mastership of the office work and competency of business ethics;\n- Excellent working knowledge of written and verbal Armenian, Russian\nand English; knowledge of French is a plus;\n- Knowledge of MS Office;\n- Strong problem-solving and organizational skills; outstanding\ninterpersonal skills. \nREMUNERATION: Will be commensurate with the norms accepted in the\nCompany",- Coordinating the work of subordinate employees;\n- Maintaining liaison with the other subdivisions within the Company;\n- Managing correspondence flow;\n- Realizing written and verbal translations;\n- Organizing meetings and conferences; compiling minutes of meetings;\n- Drawing reports;\n- Operating office equipment.,[nan],"[University, degree, At, least, 3, years, experience, relevant, field, Mastership, office, work, competency, business, ethics, Excellent, working, knowledge, written, verbal, Armenian, Russian, English, knowledge, French, plus, Knowledge, MS, Office, Strong, problem-solving, organizational, skills, outstanding, interpersonal, skills, REMUNERATION, Will, commensurate, norms, accepted, Company]","[Coordinating, work, subordinate, employees, Maintaining, liaison, subdivisions, within, Company, Managing, correspondence, flow, Realizing, written, verbal, translations, Organizing, meetings, conferences, compiling, minutes, meetings, Drawing, reports, Operating, office, equipment]",years of experience relevant field office work competency of business ethics competency of business ethics business ethics Excellent working knowledge Excellent working knowledge working knowledge Strong problem organizational skills interpersonal skills,,subordinate employees Maintaining liaison other subdivisions Managing correspondence flow Managing correspondence flow correspondence flow verbal translations Organizing meetings compiling minutes minutes of meetings Drawing reports Operating office equipment Operating office equipment office equipment
9,"Program Assistant (INL), FSN-8; FP-6*","The incumbent assists in coordinating INL-funded law\nenforcement and drug control programs in Armenia (including training\nprograms supported and implemented by US Agencies) to avoid duplication\nof effort with other programs and to ensure that assistance from various\nUSG agencies is complementary. Additionally, assists on coordinating US\nmission activities in the crime and narcotics area with other donors\nlike the United Nations, Non Governmental organizations and other\ninternational donors.\nA copy of the complete position description listing all duties and\nresponsibilities is available in the Human Resources Office. Contact\nnumber: (3741) 52-46-61","NOTE: All applicants are instructed to\naddress each selection criterion detailed below with specific and\ncomprehensive information supporting each criteria.\n- University degree in a relevant field (Law, Law Enforcement, Political\nScience, Public Administration) or its equivalent is required;\n- Two years of progressively responsible experience in program\nadministration and logistical support is required;\n- Level IV proficiency in English and Armenian;\n- Thorough knowledge of Armenian Criminal Justice System, legislative\nprocess, and structure and operations of law enforcement agencies\nrequired;\n- Sound management, analytical and organizational skills. \nSELECTION PROCESS: When equally qualified, Eligible Family Members and\nU.S. Veterans will be given preference. Therefore, it is essential that\nall candidates address the required qualifications above in the\napplication. \nADDITIONAL SELECTION CRITERIA: \n1. Management will consider nepotism/ conflict of interest, budg...",,"[The, incumbent, assists, coordinating, INL-funded, law, enforcement, drug, control, programs, Armenia, including, training, programs, supported, implemented, US, Agencies, avoid, duplication, effort, programs, ensure, assistance, various, USG, agencies, complementary, Additionally, assists, coordinating, US, mission, activities, crime, narcotics, area, donors, like, United, Nations, Non, Governmental, organizations, international, donors, A, copy, complete, position, description, listing, duties, responsibilities, available, Human, Resources, Office, Contact, number, 3741, 52-46-61]","[NOTE, All, applicants, instructed, address, selection, criterion, detailed, specific, comprehensive, information, supporting, criteria, University, degree, relevant, field, Law, Law, Enforcement, Political, Science, Public, Administration, equivalent, required, Two, years, progressively, responsible, experience, program, administration, logistical, support, required, Level, IV, proficiency, English, Armenian, Thorough, knowledge, Armenian, Criminal, Justice, System, legislative, process, structure, operations, law, enforcement, agencies, required, Sound, management, analytical, organizational, skills, SELECTION, PROCESS, When, equally, qualified, Eligible, Family, Members, U.S., Veterans, given, preference, Therefore, essential, candidates, address, required, qualifications, application, ADDITIONAL, SELECTION, CRITERIA, 1, Management, consider, nepotism/, conflict, interest, budget, visa, status, determining, successful, candidacy, 2, Current, employees, serving, probationary, per...",[nan],selection criterion comprehensive information relevant field its equivalent responsible experience experience in program administration experience in program administration program administration logistical support Thorough knowledge legislative process operations of law enforcement operations of law enforcement agencies law enforcement agencies operations of law enforcement agencies law enforcement agencies enforcement agencies Sound management organizational skills SELECTION PROCESS given preference required qualifications SELECTION CRITERIA nepotism/ conflict conflict of interest visa status successful candidacy Current employees probationary period employed AEFMs who employed AEFMs who AEFMs who advertised positions calendar days their employment Position Grade,coordinating INL funded law enforcement funded law enforcement law enforcement drug control programs drug control programs control programs including training programs including training programs training programs avoid duplication duplication of effort other programs mission activities narcotics area other donors Governmental organizations international donors complete position description complete position description position description Contact number,


In [113]:
# optional - saving df to disk for future
df_job_related.to_csv("job_similarity_df_with_keywords.csv", index=False, sep='\t')

#### Create a job-similarity matrix storing the values of similarity(cosine) scores between two jobs 

##### Note:
Computing 1 vs 19K jobs takes 1 hour.

Computing entire matrix will not be feasible.

Hence only 3 jobs were selected for this operation and restricted to first 1000 jobs.

In [86]:
# iterate through each row vs rest of rows to calculate DIS-similarity

job_dissimilarity_df = pd.DataFrame(columns=['index', 'similar-job', 'dissimilar-score'])

for job1_idx in range(3):  # rows
    for job2_idx in range(1000):

        doc1 = nlp(df_job_related.iloc[job1_idx]['Title'])
        doc2 = nlp(df_job_related.iloc[job2_idx]['Title'])
        title_similarity = doc1.similarity(doc2)
        
        doc1 = nlp(df_job_related.iloc[job1_idx]['JobDescription_keywords'])
        doc2 = nlp(df_job_related.iloc[job2_idx]['JobDescription_keywords'])
        jd_similarity = doc1.similarity(doc2)
        
        doc1 = nlp(df_job_related.iloc[job1_idx]['JobRequirment_keywords'])
        doc2 = nlp(df_job_related.iloc[job2_idx]['JobRequirment_keywords'])
        jr_similarity = doc1.similarity(doc2)
        
        doc1 = nlp(df_job_related.iloc[job1_idx]['RequiredQual_keywords'])
        doc2 = nlp(df_job_related.iloc[job2_idx]['RequiredQual_keywords'])
        rq_similarity = doc1.similarity(doc2)
        
        averaged_score = (title_similarity + jd_similarity + jr_similarity + rq_similarity) / 4.0
        job_dissimilarity_df = job_dissimilarity_df.append({'index': job1_idx, 'similar-job': job2_idx, 'dissimilar-score': (1-averaged_score)},ignore_index=True)

In [266]:
job_dissimilarity_df.head(10)

Unnamed: 0,index,similar-job,dissimilar-score
0,0.0,0.0,0.0
1,0.0,1.0,0.691006
2,0.0,2.0,0.248817
3,0.0,3.0,0.275137
4,0.0,4.0,0.481422
5,0.0,5.0,0.708744
6,0.0,6.0,0.358109
7,0.0,7.0,0.874434
8,0.0,8.0,0.379402
9,0.0,9.0,0.468607


### Show top similar jobs

In [136]:
job_dissimilarity_df_sorted = job_dissimilarity_df.sort_values(by=['index', 'dissimilar-score'], ascending=[True, True])

In [267]:
job_dissimilarity_df_sorted.head(10)

Unnamed: 0,index,similar-job,dissimilar-score
0,0.0,0.0,0.0
859,0.0,859.0,0.109565
446,0.0,446.0,0.117994
776,0.0,776.0,0.126975
72,0.0,72.0,0.135553
581,0.0,581.0,0.14203
970,0.0,970.0,0.142128
925,0.0,925.0,0.149169
153,0.0,153.0,0.151827
862,0.0,862.0,0.152852


In [243]:
# Show top 5 similar results to job - Chief Financial Officer
similar_jobs = job_dissimilarity_df_sorted.loc[job_dissimilarity_df_sorted['index'] == 0].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)].replace(r'\n',' ', regex=True))

Title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [143]:
# Show top 5 similar results to job - Country Coordinator
similar_jobs = job_dissimilarity_df_sorted.loc[job_dissimilarity_df_sorted['index'] == 2].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                                                    Country Coordinator
JobDescription             Public outreach and strengthening of a growing...
RequiredQual               - Degree in environmentally related field, or ...
JobRequirment              - Working with the Country Director to provide...
JobDescription_token       [Public, outreach, strengthening, growing, net...
RequiredQual_token         [Degree, environmentally, related, field, 5, y...
JobRequirment_token        [Working, Country, Director, provide, environm...
RequiredQual_keywords      related field relevant experience written flue...
JobDescription_keywords    Public outreach growing network international ...
JobRequirment_keywords     environmental information general public elect...
Name: 2, dtype: object
Title                                                Agriculture Coordinator
JobDescription             Facilitating diversified and market oriented a...
RequiredQual               -\tUniversity degree in Ag

### 2. Using custom Doc2Vec
#### Note: 
This has already been done, so just going to load the model and compute similarity

In [103]:
from gensim.models.doc2vec import Doc2Vec

filename = "../classification/models/doc2vec_model_trained_300"
d2vmodel = Doc2Vec.load(filename)

#### Keeping original columns

In [104]:
# iterate through each row vs rest of rows to calculate DIS-similarity

job_dissimilarity_d2v_df = pd.DataFrame(columns=['index', 'similar-job', 'dissimilar-score'])

for job1_idx in range(3):  # rows
    for job2_idx in range(1000):
        title_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['Title'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['Title'].split()))
        jd_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['JobDescription'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['JobDescription'].split()))
        jr_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['JobRequirment'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['JobRequirment'].split()))
        rq_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['RequiredQual'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['RequiredQual'].split()))

        averaged_score = (title_similarity + jd_similarity + jr_similarity + rq_similarity) / 4.0
        job_dissimilarity_d2v_df = job_dissimilarity_d2v_df.append({'index': job1_idx, 'similar-job': job2_idx, 'dissimilar-score': (1-averaged_score)},ignore_index=True)

In [105]:
job_dissimilarity_d2v_df_sorted = job_dissimilarity_d2v_df.sort_values(by=['index', 'dissimilar-score'], ascending=[True, True])

In [106]:
job_dissimilarity_d2v_df_sorted.head(10)

Unnamed: 0,index,similar-job,dissimilar-score
0,0.0,0.0,0.007872
696,0.0,696.0,0.500684
941,0.0,941.0,0.720614
528,0.0,528.0,0.726391
491,0.0,491.0,0.728418
162,0.0,162.0,0.729573
536,0.0,536.0,0.730215
362,0.0,362.0,0.73244
859,0.0,859.0,0.732727
863,0.0,863.0,0.733301


In [107]:
# Show top 5 similar results to job - Chief Financial Officer
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 0].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                                                 Chief Financial Officer
JobDescription              AMERIA Investment Consulting Company is seekin...
RequiredQual                To perform this job successfully, an\nindividu...
JobRequirment               - Supervises financial management and administ...
JobDescription_token        [ameria, invest, consult, compani, seek, chief...
RequiredQual_token          [to, perform, job, success, individu, must, ab...
JobRequirment_token         [supervis, financi, manag, administr, staff, i...
JobDescription_token_str    ameria invest consult compani seek chief finan...
RequiredQual_token_str      to perform job success individu must abl perfo...
JobRequirment_token_str     supervis financi manag administr staff includ ...
RequiredQual_keywords       essential duty accepted accounting principles ...
JobDescription_keywords     administrative functions complex staff assista...
JobRequirment_keywords      financial management administrative 

In [108]:
# Show top 5 similar results to job - Intern
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 1].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                       Full-time Community Connections Intern (paid i...
JobDescription                                                            nan
RequiredQual                - Bachelor's Degree; Master's is preferred;\n-...
JobRequirment                                                             nan
JobDescription_token                                                    [nan]
RequiredQual_token          [bachelor, 's, degre, master, 's, prefer, exce...
JobRequirment_token                                                     [nan]
JobDescription_token_str                                                  nan
RequiredQual_token_str      bachelor 's degre master 's prefer excel skill...
JobRequirment_token_str                                                   nan
RequiredQual_keywords       Excellent skills Armenian languages Armenian t...
JobDescription_keywords                                                      
JobRequirment_keywords                                          

In [109]:
# Show top 5 similar results to job - Country Coordinator
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 2].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                                                     Country Coordinator
JobDescription              Public outreach and strengthening of a growing...
RequiredQual                - Degree in environmentally related field, or ...
JobRequirment               - Working with the Country Director to provide...
JobDescription_token        [public, outreach, strengthen, grow, network, ...
RequiredQual_token          [degre, environment, relat, field, 5, year, re...
JobRequirment_token         [work, countri, director, provid, environment,...
JobDescription_token_str    public outreach strengthen grow network enviro...
RequiredQual_token_str      degre environment relat field 5 year relev exp...
JobRequirment_token_str     work countri director provid environment infor...
RequiredQual_keywords       related field relevant experience written flue...
JobDescription_keywords     Public outreach growing network international ...
JobRequirment_keywords      environmental information general pu

#### Using only the keywords

In [117]:
# iterate through each row vs rest of rows to calculate DIS-similarity

job_dissimilarity_d2v_df = pd.DataFrame(columns=['index', 'similar-job', 'dissimilar-score'])

for job1_idx in range(3):  # rows
    for job2_idx in range(1000):
        title_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['Title'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['Title'].split()))
        jd_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['JobDescription_keywords'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['JobDescription_keywords'].split()))
        jr_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['JobRequirment_keywords'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['JobRequirment_keywords'].split()))
        rq_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['RequiredQual_keywords'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['RequiredQual_keywords'].split()))

        averaged_score = (title_similarity + jd_similarity + jr_similarity + rq_similarity) / 4.0
        job_dissimilarity_d2v_df = job_dissimilarity_d2v_df.append({'index': job1_idx, 'similar-job': job2_idx, 'dissimilar-score': (1-averaged_score)},ignore_index=True)

In [118]:
job_dissimilarity_d2v_df_sorted = job_dissimilarity_d2v_df.sort_values(by=['index', 'dissimilar-score'], ascending=[True, True])

In [119]:
# Show top 5 similar results to job - Chief Financial Officer
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 0].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                                                 Chief Financial Officer
JobDescription              AMERIA Investment Consulting Company is seekin...
RequiredQual                To perform this job successfully, an\nindividu...
JobRequirment               - Supervises financial management and administ...
JobDescription_token        [ameria, invest, consult, compani, seek, chief...
RequiredQual_token          [to, perform, job, success, individu, must, ab...
JobRequirment_token         [supervis, financi, manag, administr, staff, i...
JobDescription_token_str    ameria invest consult compani seek chief finan...
RequiredQual_token_str      to perform job success individu must abl perfo...
JobRequirment_token_str     supervis financi manag administr staff includ ...
RequiredQual_keywords       essential duty accepted accounting principles ...
JobDescription_keywords     administrative functions complex staff assista...
JobRequirment_keywords      financial management administrative 

In [113]:
# Show top 5 similar results to job - Intern
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 1].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                       Full-time Community Connections Intern (paid i...
JobDescription                                                            nan
RequiredQual                - Bachelor's Degree; Master's is preferred;\n-...
JobRequirment                                                             nan
JobDescription_token                                                    [nan]
RequiredQual_token          [bachelor, 's, degre, master, 's, prefer, exce...
JobRequirment_token                                                     [nan]
JobDescription_token_str                                                  nan
RequiredQual_token_str      bachelor 's degre master 's prefer excel skill...
JobRequirment_token_str                                                   nan
RequiredQual_keywords       Excellent skills Armenian languages Armenian t...
JobDescription_keywords                                                      
JobRequirment_keywords                                          

In [120]:
# Show top 5 similar results to job - Coordinator
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 2].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                                                     Country Coordinator
JobDescription              Public outreach and strengthening of a growing...
RequiredQual                - Degree in environmentally related field, or ...
JobRequirment               - Working with the Country Director to provide...
JobDescription_token        [public, outreach, strengthen, grow, network, ...
RequiredQual_token          [degre, environment, relat, field, 5, year, re...
JobRequirment_token         [work, countri, director, provid, environment,...
JobDescription_token_str    public outreach strengthen grow network enviro...
RequiredQual_token_str      degre environment relat field 5 year relev exp...
JobRequirment_token_str     work countri director provid environment infor...
RequiredQual_keywords       related field relevant experience written flue...
JobDescription_keywords     Public outreach growing network international ...
JobRequirment_keywords      environmental information general pu

In [121]:
# iterate through each row vs rest of rows to calculate DIS-similarity

job_dissimilarity_d2v_df = pd.DataFrame(columns=['index', 'similar-job', 'dissimilar-score'])

for job1_idx in range(3):  # rows
    for job2_idx in range(1000):
        title_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['Title'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['Title'].split()))
        jd_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['JobDescription'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['JobDescription'].split()))
        jr_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['JobRequirment'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['JobRequirment'].split()))
        rq_similarity = 1- scipy.spatial.distance.cosine(d2vmodel.infer_vector(df_job_related.iloc[job1_idx]['RequiredQual'].split()), d2vmodel.infer_vector(df_job_related.iloc[job2_idx]['RequiredQual'].split()))

        averaged_score = (title_similarity*0.9 + jd_similarity*0.3 + jr_similarity*0.3 + rq_similarity*0.3) / 4.0
        job_dissimilarity_d2v_df = job_dissimilarity_d2v_df.append({'index': job1_idx, 'similar-job': job2_idx, 'dissimilar-score': (1-averaged_score)},ignore_index=True)

In [122]:
job_dissimilarity_d2v_df_sorted = job_dissimilarity_d2v_df.sort_values(by=['index', 'dissimilar-score'], ascending=[True, True])

In [123]:
job_dissimilarity_d2v_df_sorted.head(10)

Unnamed: 0,index,similar-job,dissimilar-score
0,0.0,0.0,0.552287
696,0.0,696.0,0.699992
314,0.0,314.0,0.904818
859,0.0,859.0,0.905493
346,0.0,346.0,0.906263
427,0.0,427.0,0.907035
925,0.0,925.0,0.908099
162,0.0,162.0,0.908832
860,0.0,860.0,0.90908
536,0.0,536.0,0.90942


In [124]:
# Show top 5 similar results to job - Chief Financial Officer
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 0].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                                                 Chief Financial Officer
JobDescription              AMERIA Investment Consulting Company is seekin...
RequiredQual                To perform this job successfully, an\nindividu...
JobRequirment               - Supervises financial management and administ...
JobDescription_token        [ameria, invest, consult, compani, seek, chief...
RequiredQual_token          [to, perform, job, success, individu, must, ab...
JobRequirment_token         [supervis, financi, manag, administr, staff, i...
JobDescription_token_str    ameria invest consult compani seek chief finan...
RequiredQual_token_str      to perform job success individu must abl perfo...
JobRequirment_token_str     supervis financi manag administr staff includ ...
RequiredQual_keywords       essential duty accepted accounting principles ...
JobDescription_keywords     administrative functions complex staff assista...
JobRequirment_keywords      financial management administrative 

In [125]:
# Show top 5 similar results to job - Intern
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 1].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                       Full-time Community Connections Intern (paid i...
JobDescription                                                            nan
RequiredQual                - Bachelor's Degree; Master's is preferred;\n-...
JobRequirment                                                             nan
JobDescription_token                                                    [nan]
RequiredQual_token          [bachelor, 's, degre, master, 's, prefer, exce...
JobRequirment_token                                                     [nan]
JobDescription_token_str                                                  nan
RequiredQual_token_str      bachelor 's degre master 's prefer excel skill...
JobRequirment_token_str                                                   nan
RequiredQual_keywords       Excellent skills Armenian languages Armenian t...
JobDescription_keywords                                                      
JobRequirment_keywords                                          

In [126]:
# Show top 5 similar results to job - Country Coordinator
similar_jobs = job_dissimilarity_d2v_df_sorted.loc[job_dissimilarity_d2v_df_sorted['index'] == 2].head(6)['similar-job'].tolist()

for job in similar_jobs:
    print(df_job_related.iloc[int(job)])

Title                                                     Country Coordinator
JobDescription              Public outreach and strengthening of a growing...
RequiredQual                - Degree in environmentally related field, or ...
JobRequirment               - Working with the Country Director to provide...
JobDescription_token        [public, outreach, strengthen, grow, network, ...
RequiredQual_token          [degre, environment, relat, field, 5, year, re...
JobRequirment_token         [work, countri, director, provid, environment,...
JobDescription_token_str    public outreach strengthen grow network enviro...
RequiredQual_token_str      degre environment relat field 5 year relev exp...
JobRequirment_token_str     work countri director provid environment infor...
RequiredQual_keywords       related field relevant experience written flue...
JobDescription_keywords     Public outreach growing network international ...
JobRequirment_keywords      environmental information general pu