In [1]:
import numpy as np
import pandas as pd
import sqlite3
import spacy

In [2]:
# Download pretrained enlgish model
try:
    import en_core_web_sm
except:
    !python -m spacy download en_core_web_sm
    import en_core_web_sm

In [3]:
# Read sqlite query results into a pandas DataFrame.
con = sqlite3.connect("collectors/data.sqlite3")
try:
    job_df = pd.read_sql_query("SELECT * from job_post", con)
finally:
    # Release the database handle even when the query raises.
    con.close()

In [4]:
# Sanity check: preview the first rows returned by the SQL query.
job_df.head(n=5)

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist


In [5]:
# Load the competency table (the first CSV column becomes the row index) and
# preview the rows belonging to a single occupation.
onet_competencies = 'datasets/competencies.csv'
onet_df = pd.read_csv(onet_competencies, index_col=0)
onet_df.loc[onet_df['occupation'] == 'Computer and Information Research Scientists'].head(n=5)

Unnamed: 0,occupation,competency,category,description
0,Computer and Information Research Scientists,Source code management SCM software,Technology Skills,Development environment software
1,Computer and Information Research Scientists,Microsoft Azure,Technology Skills,Development environment software
2,Computer and Information Research Scientists,Visualization,Abilities,The ability to imagine how something will look...
3,Computer and Information Research Scientists,Free-field speakers,Tools Used,Loudspeakers
4,Computer and Information Research Scientists,Data visualization software,Technology Skills,Analytical or scientific software


In [6]:
def process_text(text):
    """Return ``text`` with stop words, punctuation and pronoun lemmas removed.

    The text is tokenised with the module-level ``nlp`` pipeline. A token is
    dropped when its text is in ``nlp.Defaults.stop_words``, when it is flagged
    as punctuation, or when its lemma is the ``'-PRON-'`` placeholder. The
    surviving tokens are joined with single spaces using their original text
    (not their lemmas).
    """
    def keep(token):
        # Same checks, in the same order, as the three skip conditions.
        return not (
            token.text in nlp.Defaults.stop_words
            or token.is_punct
            or token.lemma_ == '-PRON-'
        )

    return " ".join(token.text for token in nlp(text) if keep(token))

In [7]:
# Build the English pipeline and register a bare newline as a stop word so
# that process_text filters newline tokens out.
nlp = en_core_web_sm.load()
nlp.Defaults.stop_words.update({"\n"})

In [8]:
# Pull the competency names and their descriptions out as NumPy arrays.
onet_comp, onet_desc = (
    np.array(onet_df[column]) for column in ('competency', 'description')
)

In [9]:
# Normalise the competency names and descriptions: lower-case them, then strip
# stop words and punctuation with process_text. The text is read from onet_df
# rather than from the arrays being overwritten, so re-running this cell does
# not process already-processed text a second time.
onet_comp = np.array([process_text(text.lower()) for text in onet_df['competency']])
onet_desc = np.array([process_text(text.lower()) for text in onet_df['description']])

In [10]:
# Load results/title_occupation.csv and attach each posting's description,
# aligned with job_df on the row index.
occupations = pd.read_csv('results/title_occupation.csv').assign(
    description=job_df['description']
)
occupations.head(n=5)

Unnamed: 0,id,title_processed,identifier,score_all,name,description
0,1,data scientist,15-1111.00,0.749518,Computer and Information Research Scientists,Do you want a meaningful role in a company tha...
1,2,business intelligence analyst,15-1199.08,0.87363,Business Intelligence Analysts,Brand: Glentel Corporate\nLocation: Burnaby Of...
2,3,human resources data scientist,15-2041.02,0.810885,Clinical Data Managers,2 x newly created Data Scientist opportunities...
3,4,lead human resource data scientist,15-2041.00,0.828754,Statisticians,Newly created data science lead embedded withi...
4,5,machine learning engineer,19-2099.01,0.811024,Remote Sensing Scientists and Technologists,"Who We are\nFounded in 2016, Skycope Technolog..."


In [11]:
# Raw description text for every job, in row order.
job_desc = occupations.loc[:, 'description'].values

In [12]:
# Lower-case every job description.
job_desc = np.array(list(map(lambda text: text.lower(), job_desc)))

In [13]:
first_n = 25

# Split each of the first `first_n` descriptions into sentences and clean every
# sentence with process_text (any newline left in the result is removed).
# The result is kept as a plain list of lists: jobs have different sentence
# counts, and NumPy >= 1.24 raises ValueError when asked to build an array
# from such ragged nested sequences.
job_desc = [
    [process_text(sent.text).replace("\n", '') for sent in nlp(str(desc)).sents]
    for desc in job_desc[:first_n]
]

In [14]:
# ***************************************************************
# BEWARE: EXTREMELY TIME CONSUMING
# TRY FOR SMALL NUMBER OF JOBS
# IMPRACTICAL FOR A LARGE SET OF JOBS SIMULTANEOUSLY
# PRACTICAL IMPLICATION: PERFORM THIS WHENEVER A NEW JOB IS ADDED
# ***************************************************************
#
# For every job, score each sentence of its description against every
# competency of the job's matched occupation and record, per sentence, the
# best similarity and the id of the competency that produced it.

scores = []      # scores[i][j]   -> best similarity for sentence j of job i
comp_ids = []    # comp_ids[i][j] -> competency id that achieved that score

# Parsed (name, description) docs per competency id. Parsing a competency does
# not depend on the sentence being scored, so each one is parsed once and
# reused instead of being re-parsed for every sentence of every job.
comp_docs = {}

for i, desc in enumerate(job_desc):
    title = occupations['name'].iloc[i]
    idss = onet_df.index[onet_df['occupation'] == title].tolist()
    for ids in idss:
        if ids not in comp_docs:
            comp_docs[ids] = (nlp(str(onet_comp[ids])), nlp(str(onet_desc[ids])))

    sent_score = []
    sent_comp_id = []
    print("COMPETENCIES {}, SENTENCES {}".format(len(idss), len(desc)))
    print("------------------------------")
    # With no competencies there is nothing to compare against (np.max would
    # raise on an empty list), so such a job keeps empty result lists.
    for j, sentence in enumerate(desc if idss else []):
        sentence = nlp(str(sentence))
        temp_score = []
        for ids in idss:
            name_doc, desc_doc = comp_docs[ids]
            # A competency is scored through its name or its description,
            # whichever is more similar to the sentence.
            sim1 = sentence.similarity(name_doc)
            sim2 = sentence.similarity(desc_doc)
            temp_score.append(max(sim1, sim2))

        max_score = np.max(temp_score)
        max_comp_id = idss[np.argmax(temp_score)]
        sent_score.append(max_score)
        sent_comp_id.append(max_comp_id)
        print("JOB {}, SENTENCE {} DONE".format(i, j))

    scores.append(sent_score)
    comp_ids.append(sent_comp_id)

TENCE 57 DONE
JOB 3, SENTENCE 58 DONE
JOB 3, SENTENCE 59 DONE
JOB 3, SENTENCE 60 DONE
JOB 3, SENTENCE 61 DONE
JOB 3, SENTENCE 62 DONE
JOB 3, SENTENCE 63 DONE
JOB 3, SENTENCE 64 DONE
JOB 3, SENTENCE 65 DONE
JOB 3, SENTENCE 66 DONE
JOB 3, SENTENCE 67 DONE
JOB 3, SENTENCE 68 DONE
JOB 3, SENTENCE 69 DONE
JOB 3, SENTENCE 70 DONE
JOB 3, SENTENCE 71 DONE
JOB 3, SENTENCE 72 DONE
JOB 3, SENTENCE 73 DONE
JOB 3, SENTENCE 74 DONE
JOB 3, SENTENCE 75 DONE
JOB 3, SENTENCE 76 DONE
JOB 3, SENTENCE 77 DONE
JOB 3, SENTENCE 78 DONE
JOB 3, SENTENCE 79 DONE
JOB 3, SENTENCE 80 DONE
JOB 3, SENTENCE 81 DONE
JOB 3, SENTENCE 82 DONE
JOB 3, SENTENCE 83 DONE
JOB 3, SENTENCE 84 DONE
JOB 3, SENTENCE 85 DONE
JOB 3, SENTENCE 86 DONE
JOB 3, SENTENCE 87 DONE
JOB 3, SENTENCE 88 DONE
JOB 3, SENTENCE 89 DONE
JOB 3, SENTENCE 90 DONE
JOB 3, SENTENCE 91 DONE
JOB 3, SENTENCE 92 DONE
JOB 3, SENTENCE 93 DONE
JOB 3, SENTENCE 94 DONE
JOB 3, SENTENCE 95 DONE
JOB 3, SENTENCE 96 DONE
JOB 3, SENTENCE 97 DONE
JOB 3, SENTENCE 98 DONE
JO

In [21]:
# For each job, rank the matched competencies by sentence score and keep the
# 20 highest-scoring distinct ones, rendered as
# "category | competency | description" strings.
competency = []
for i in range(len(scores)):
    # (score, competency id) pairs, highest first.
    ranked = sorted(zip(scores[i], comp_ids[i]), reverse=True)

    # Keep only the first (best-scoring) occurrence of each competency id.
    seen = set()
    best_ids = []
    for score, comp_id in ranked:
        if comp_id in seen:
            continue
        seen.add(comp_id)
        best_ids.append(comp_id)

    competency.append([
        onet_df['category'].iloc[comp_id] + " | " +
        onet_df['competency'].iloc[comp_id] + " | " +
        onet_df['description'].iloc[comp_id]
        for comp_id in best_ids[:20]
    ])

In [22]:
# Inspect the ranked competency strings produced for the first job.
competency[0]

['Work Values | Working Conditions | Occupations that satisfy this work value offer job security and good working conditions. Corresponding needs are Activity, Compensation, Independence, Security, Variety and Working Conditions.',
 'Work Styles | Self Control | Job requires maintaining composure, keeping emotions in check, controlling anger, and avoiding aggressive behavior, even in very difficult situations.',
 'Work Values | Recognition | Occupations that satisfy this work value offer advancement, potential for leadership, and are often considered prestigious. Corresponding needs are Advancement, Authority, Recognition and Social Status.',
 'Abilities | Fluency of Ideas | The ability to come up with a number of ideas about a topic (the number of ideas is important, not their quality, correctness, or creativity).',
 'Work Activities | Communicating with Persons Outside Organization | Communicating with people outside the organization, representing the organization to customers, the p

In [23]:
# Take an independent copy of the first `first_n` rows so that adding columns
# to result_df neither triggers pandas' SettingWithCopyWarning nor depends on
# whether the slice is a view of `occupations`.
result_df = occupations.iloc[:first_n].copy()

In [24]:
# Store each job's ranked competency list in a new column; this assumes
# `competency` holds exactly one list per row of result_df.
result_df['onet_competency'] = competency

In [25]:
# Preview the jobs together with their matched competencies.
result_df.head(n=5)

Unnamed: 0,id,title_processed,identifier,score_all,name,description,onet_competency
0,1,data scientist,15-1111.00,0.749518,Computer and Information Research Scientists,Do you want a meaningful role in a company tha...,[Work Values | Working Conditions | Occupation...
1,2,business intelligence analyst,15-1199.08,0.87363,Business Intelligence Analysts,Brand: Glentel Corporate\nLocation: Burnaby Of...,[Work Values | Support | Occupations that sati...
2,3,human resources data scientist,15-2041.02,0.810885,Clinical Data Managers,2 x newly created Data Scientist opportunities...,[Task Statements | Design and validate clinica...
3,4,lead human resource data scientist,15-2041.00,0.828754,Statisticians,Newly created data science lead embedded withi...,[Task Statements | Prepare data for processing...
4,5,machine learning engineer,19-2099.01,0.811024,Remote Sensing Scientists and Technologists,"Who We are\nFounded in 2016, Skycope Technolog...",[Work Values | Working Conditions | Occupation...


In [26]:
# Write the jobs and their matched competencies to CSV, omitting the row index.
result_df.to_csv(path_or_buf='results/description_competency.csv', index=False)