In [1]:
import pandas as pd
import sqlite3
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Default size (width, height) for every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (10.0, 6.0)

In [2]:
# Read three tables from the SQLite database into pandas DataFrames:
#   job_post         -> job_df
#   company_review   -> company_review_df
#   job_interview    -> job_interview_df
con = sqlite3.connect("collectors/data.sqlite3")
try:
    job_df = pd.read_sql_query("SELECT * from job_post", con)
    company_review_df = pd.read_sql_query("SELECT * from company_review", con)
    job_interview_df = pd.read_sql_query("SELECT * from job_interview", con)
finally:
    # Always release the database handle, even when a query fails (for
    # example because a table is missing), so re-running the cell never
    # leaves a dangling open connection behind.
    con.close()

In [3]:
# Sanity check: the SQL result landed in the DataFrame -- show its first five rows.
job_df.head(n=5)

Unnamed: 0,id,title,company,location,description,source,search_kw
0,1,Data Scientist,Aquatic Informatics,"Vancouver, BC",Do you want a meaningful role in a company tha...,indeed.com,data scientist
1,2,Business Intelligence Analyst,GLENTEL,"Burnaby, BC",Brand: Glentel Corporate\nLocation: Burnaby Of...,indeed.com,data scientist
2,3,Human Resources Data Scientist,Rio Tinto,Canada,2 x newly created Data Scientist opportunities...,indeed.com,data scientist
3,4,Lead - Human Resource Data Scientist,Rio Tinto,Canada,Newly created data science lead embedded withi...,indeed.com,data scientist
4,5,Machine Learning Engineer,Skycope Technologies Inc,"Vancouver, BC","Who We are\nFounded in 2016, Skycope Technolog...",indeed.com,data scientist


In [4]:
# First five company reviews (five is the default row count for head()).
company_review_df.head()

Unnamed: 0,id,company,title,rating,author,author_status,location,date,description,source
0,1,bmo financial group,Decent,3.0,Credit Analyst (Current Employee),,"Burnaby, BC",11 July 2018,Not a bad job but can get boring easily. Manag...,indeed.com
1,2,bmo financial group,easy place to work,3.0,Project Manager (Former Employee),,"Toronto, ON",7 April 2020,co-workers are friendly but pay is not good at...,indeed.com
2,3,bmo financial group,Dead Industry- They will squeeze you like a le...,1.0,Assistant Manager (Former Employee),,"Cambridge, ON",6 April 2020,They give you the illusion of how great of a j...,indeed.com
3,4,bmo financial group,"In BMO agile team, one developer has to servic...",2.0,Software Specialist (Former Employee),,"Toronto, ON",6 April 2020,"Basically, in BMO agile team, they hire 5+ per...",indeed.com
4,5,bmo financial group,Good,5.0,Relationship Manager (Current Employee),,"Nanaimo, BC",4 April 2020,Work/life balance and customer centric. Traini...,indeed.com


In [5]:
# First five rows of the interview table.
job_interview_df.head(n=5)

Unnamed: 0,id,company,title,question,date,source,search_kw
0,1,Geotab,Data Scientist,Past projects,"Mar. 6, 2019",glassdoor.com,data scientist
1,2,Benbria,"Full-stack Developer, Data Scientist",NLP challenge about finding semantic similarit...,"Nov. 11, 2015",glassdoor.com,data scientist
2,3,Banque Nationale du Canada/National Bank of Ca...,Data Scientist,What measure would you use to assess the perfo...,"Nov. 25, 2019",glassdoor.com,data scientist
3,4,Geotab,Data Scientist,Walk me through your educational background.,"Feb. 1, 2020",glassdoor.com,data scientist
4,5,Capital One,Data Scientist,They provided some scenarios of a business and...,"Oct. 7, 2019",glassdoor.com,data scientist


In [6]:
# Summary statistics for job_df. By default describe() only covers numeric
# columns, which is why the result below contains nothing but `id`.
job_df.describe()

Unnamed: 0,id
count,1450.0
mean,735.258621
std,419.134002
min,1.0
25%,373.25
50%,735.5
75%,1097.75
max,1460.0


In [7]:
# First 50 job titles returned for the 'data scientist' search keyword.
job_df.loc[job_df['search_kw'] == 'data scientist', 'title'].head(50)

0                                      Data Scientist
1                       Business Intelligence Analyst
2                      Human Resources Data Scientist
3                Lead - Human Resource Data Scientist
4                           Machine Learning Engineer
5                                      Data Scientist
6                            Associate Data Scientist
7                                    Data Scientist I
8                            Data Scientist, AI@Unity
9                                      Data Scientist
10                                     Data Scientist
11                                     Data Scientist
12                         Jr. Data Science Developer
13                                     Data Scientist
14                     Data scientist (summer intern)
15    Data Scientist - Price Point – Start-up company
16    Data Scientist - Price Point – Start-up company
17        Manager of Data Science, RACE21 - Vancouver
18                          

In [8]:
# Single-column frame holding the titles of the 'data scientist' postings.
# The explicit copy makes title_df independent of job_df, so columns added
# to it later never touch the original frame.
title_df = job_df.loc[job_df['search_kw'] == 'data scientist', ['title']].copy()

In [9]:
# Quick look at the first five extracted titles.
title_df.head(n=5)

Unnamed: 0,title
0,Data Scientist
1,Business Intelligence Analyst
2,Human Resources Data Scientist
3,Lead - Human Resource Data Scientist
4,Machine Learning Engineer


In [10]:
# Number of characters in each title.
# Uses the vectorised `.str.len()` accessor instead of a row-wise
# `apply(..., axis=1)`: it gives the same counts for string titles, avoids a
# Python-level call per row, works on an empty frame, and yields NaN for a
# missing title rather than raising a TypeError.
title_df['title_length'] = title_df['title'].str.len()

In [11]:
import spacy

# Load the spaCy pipeline named 'en_core_web_sm'; the resulting `nlp` object
# is called on raw text in the cells below to obtain tokens and entities.
# NOTE: the model package has to be installed in the environment beforehand.
nlp = spacy.load('en_core_web_sm')

In [12]:
# Tokenise every title with spaCy and keep the non-punctuation tokens.
# `nlp.pipe` streams the titles through the pipeline in batches and yields
# one Doc per title in input order, which is considerably faster than calling
# `nlp()` once per row. Wrapping the result in an object Series that reuses
# title_df's index keeps each token list on its own row, whatever the list
# lengths are, and also works when the frame is empty.
title_df['tokens'] = pd.Series(
    [
        [token for token in title_doc if not token.is_punct]
        for title_doc in nlp.pipe(title_df['title'])
    ],
    index=title_df.index,
    dtype=object,
)

In [13]:
# Inspect the first 50 titles together with their length and token list.
title_df.iloc[:50]

Unnamed: 0,title,title_length,tokens
0,Data Scientist,14,"[Data, Scientist]"
1,Business Intelligence Analyst,29,"[Business, Intelligence, Analyst]"
2,Human Resources Data Scientist,30,"[Human, Resources, Data, Scientist]"
3,Lead - Human Resource Data Scientist,36,"[Lead, Human, Resource, Data, Scientist]"
4,Machine Learning Engineer,25,"[Machine, Learning, Engineer]"
5,Data Scientist,14,"[Data, Scientist]"
6,Associate Data Scientist,24,"[Associate, Data, Scientist]"
7,Data Scientist I,16,"[Data, Scientist, I]"
8,"Data Scientist, AI@Unity",24,"[Data, Scientist, AI@Unity]"
9,Data Scientist,14,"[Data, Scientist]"


In [14]:
# Full description text of the first job posting.
job_df['description'].iloc[0]

"Do you want a meaningful role in a company that is making a difference in the world? Do you want to be involved in one of the most important environmental resource areas today? Do you want to learn what's involved in developing and deploying machine learning and predictive analytics solutions from colleagues with years of research and development experience? Then join our energetic and growing team and help revolutionize an industry.\nAbout our company\nFounded in 2003, Aquatic Informatics provides software solutions that address critical water data management, analytics and compliance challenges for the rapidly growing water industry. Aquatic Informatics is the trusted provider of water management solutions to over 1,000 municipal, federal, state/provincial, hydropower, mining, academic, and consulting organizations in over 60 countries that collect, manage, and process large volumes of water data.\nAquatic Informatics' platforms include AQUARIUS (http://aquaticinformatics.com/why-aq

In [15]:
# Run the spaCy pipeline over the first job description.
doc = nlp(job_df['description'].iloc[0])
# Print one "<entity text> <entity label>" line per predicted named entity.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

today DATE
years DATE
2003 DATE
Aquatic Informatics ORG
Aquatic Informatics ORG
over 1,000 CARDINAL
over 60 CARDINAL
Aquatic Informatics' ORG
AQUARIUS ORG
WaterTrax ORG
Linko ORG
https://aquaticinformatics.com/products/linko/ ORG
Aquatic Informatics ORG
Vancouver GPE
Canada GPE
US GPE
Australia GPE
one CARDINAL
Canada GPE
EQ ORG
PhD WORK_OF_ART
2+ years DATE
at least one CARDINAL
Python ORG
English LANGUAGE
NumPy LOC
TensorFlow PRODUCT
PyTorch ORG
AWS ORG


In [16]:
# Look up the human-readable meaning of the 'LOC' entity label, which the
# entity listing above assigned to "NumPy".
spacy.explain('LOC')

'Non-GPE locations, mountain ranges, bodies of water'

In [17]:
# Competencies dataset: one row per (occupation, competency) pair.
onet_competencies = 'datasets/competencies.csv'
# The first CSV column carries the stored row labels, so use it as the index.
onet_df = pd.read_csv(onet_competencies, index_col=0)
# Preview the first five rows.
onet_df.head()

Unnamed: 0,occupation,competency,category,description
0,Computer and Information Research Scientists,Source code management SCM software,Technology Skills,Development environment software
1,Computer and Information Research Scientists,Microsoft Azure,Technology Skills,Development environment software
2,Computer and Information Research Scientists,Visualization,Abilities,The ability to imagine how something will look...
3,Computer and Information Research Scientists,Free-field speakers,Tools Used,Loudspeakers
4,Computer and Information Research Scientists,Data visualization software,Technology Skills,Analytical or scientific software


In [18]:
# Occupations that list 'Python' as a competency (first five matches).
onet_df.loc[onet_df['competency'].eq('Python')].head()

Unnamed: 0,occupation,competency,category,description
111,Computer and Information Research Scientists,Python,Technology Skills,Object or component oriented development software
716,Bioinformatics Scientists,Python,Technology Skills,Object or component oriented development software
909,Geospatial Information Scientists and Technolo...,Python,Technology Skills,Object or component oriented development software
1115,Survey Researchers,Python,Technology Skills,Object or component oriented development software
1283,Statisticians,Python,Technology Skills,Object or component oriented development software


In [19]:
# Occupations dataset: identifier, name, description and associated job titles.
onet_occupations = 'datasets/occupations.csv'
# The first CSV column carries the stored row labels, so use it as the index.
onet_oc_df = pd.read_csv(onet_occupations, index_col=0)
# Preview the first five rows.
onet_oc_df.head()

Unnamed: 0,identifier,name,description,titles
0,15-1111.00,Computer and Information Research Scientists,Conduct research into fundamental computer and...,Artificial Intelligence Specialist (AI Special...
1,19-4061.00,Social Science Research Assistants,"Assist social scientists in laboratory, survey...","Bilingual Research Interviewer,Clinical Resear..."
2,19-2099.01,Remote Sensing Scientists and Technologists,Apply remote sensing principles and methods to...,"All Source Intelligence Analyst,Data Analytics..."
3,19-1029.01,Bioinformatics Scientists,Conduct research using bioinformatics theory a...,"Assistant Scientist,Bioinformatician,Bioinform..."
4,15-1199.04,Geospatial Information Scientists and Technolo...,Research or develop geospatial technologies. M...,"Geographic Information Scientist,Geographic In..."


In [22]:
# Comma-separated job titles recorded for the first occupation.
onet_oc_df['titles'].iloc[0]

'Artificial Intelligence Specialist (AI Specialist),Computational Linguist,Computational Theory Scientist,Computer Scientist,Computer Specialist,Control System Computer Scientist,Data Analyst,Data Consultant,Data Scientist,HPC Applications Manager (High Performance Computing Applications Manager),Information Scientist,Languages Researcher,Machine Learning Scientist,Multi-Disciplined Language Analyst,Programming Methodology Researcher,Research and Development Engineer (R&D Engineer),Research Scientist,Scientific Programmer Analyst'