## Introduction
#### Data analysis & visualization of data scientists' skills, based on the job descriptions from 2 hiring websites

In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np

# Text preprocessing
import os,re

# Disable warning of 3 types
import warnings

#Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import altair as alt
from altair import datum
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()


# Other utils
from tqdm import tqdm  # Progress bar
from datetime import datetime
from dateutil import parser

#EDA tools.
import dtale

# nlp text cleaning
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer

# Transformers
from transformers import pipeline
import ipywidgets as widgets
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Clustering algorithms
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import LatentDirichletAllocation
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

# Visualizing text
import spacy
import scattertext

### Pre-settings

In [2]:
# Show up to 200 characters of each column value when a frame is displayed.
pd.options.display.max_colwidth = 200

# Silence three warning categories (registered in the same order as before).
for _category in (DeprecationWarning, FutureWarning, RuntimeWarning):
    warnings.filterwarnings("ignore", category=_category)

### Data clean & manipulation
- Remove the duplicated records.
- Remove the incorrect or inappropriate skills
- Combine the main and skills into one table: df_full

In [3]:
# Read the main table (data-scientist postings only) and cast ids to int64.
ds_file = './01_data/output/datascientists.csv'
df_main = pd.read_csv(ds_file).astype({'id': 'int64'})

In [4]:
# Remove repeated postings in two passes: first rows with identical content,
# then rows sharing an id (the same id can reappear with slightly different text).
df_main = df_main.drop_duplicates(subset=['employer', 'description', 'title', 'location'])
df_main = df_main.drop_duplicates(subset=['id'])

In [5]:
# Trim surrounding whitespace from the country names.
# The vectorised .str accessor leaves missing values as NaN, whereas calling
# x.strip() per row raises AttributeError as soon as one country is NaN.
df_main['country'] = df_main['country'].str.strip()

In [6]:
# Read the skill entities produced by Azure.
az_skills = './01_data/output/az_skills.csv'
df_skills = pd.read_csv(az_skills)

In [7]:
# Keep only the 3 Azure categories that are treated as skills.
# .copy() makes the result an independent frame, so the in-place edits that
# follow (drop_duplicates, column assignment) do not operate on a slice of the
# raw table and do not trigger pandas' SettingWithCopyWarning.
df_skills = df_skills[df_skills['category'].isin(['Skill', 'Product', 'Person'])].copy()

In [8]:
# A skill mentioned several times in one job description counts once.
df_skills = df_skills.drop_duplicates(subset=['id', 'skill'])

In [9]:
# Divide the job postings into 3 types: junior, senior, and others.
# Keywords are matched as whole words (with an optional plural "s").
_JUNIOR_TITLE_RE = re.compile(
    r"\b(?:junior|jr|intern|internship|young|student|analyst|associate)s?\b")
_SENIOR_TITLE_RE = re.compile(
    r"\b(?:senior|sr|lead|leading|principal|president)s?\b")


def ds_level(title=''):
    """Classify a job title as "junior", "senior" or "others".

    The title is lower-cased and searched for level keywords. Keywords must
    match whole words, so "International" no longer counts as "intern" and
    "Leadership" no longer counts as "lead". Abbreviations written with a
    trailing period ("Jr.", "Sr.") are matched through their bare form.

    Junior keywords are checked first: a title containing both kinds of
    keyword (e.g. "Senior Data Analyst") is classified as junior.

    Parameters
    ----------
    title : str, optional
        Job title. Anything that is not a string (e.g. NaN for a missing
        title) is classified as "others" instead of raising.

    Returns
    -------
    str
        One of "junior", "senior", "others".
    """
    if not isinstance(title, str):
        return "others"

    lowered = title.lower()

    if _JUNIOR_TITLE_RE.search(lowered):
        return "junior"

    if _SENIOR_TITLE_RE.search(lowered):
        return "senior"

    return "others"


In [10]:
# Label every posting with its level, derived from the title.
df_main['type'] = df_main['title'].apply(ds_level)

In [11]:
# Parse the posting dates from strings with dateutil.
df_main['posting_date'] = df_main['posting_date'].apply(parser.parse)

#### Refine the skills extracted from Azure

In [12]:
# Product-like entities to be excluded from the skills (mostly office perks,
# benefits and equipment). Entries are compared for exact equality with the
# extracted skill text when the exclusion is applied.
# NOTE(review): 'nvidia tx2,' and 'db2.' carry trailing punctuation —
# presumably to match the raw extracted text; confirm.
xl_prd=[
        'food coupons', 'coffee', 'room', 'snacks',
      'computer screens', 'computers', 'printers',
       'copiers', 'computer', 'pool table', 'beers',
       'soft drinks', 'wine', 'mac', 'pc', 'nintendo switch',
       'big screen tv', 'mario kart', 'holidays', 'car', 'cycle',
       'machine',
       'office suite',
       'ph', 'd', 'drinks', 'watson', 'breakfast', 'fruits', 'advanced',
       'artefact',  'suite',
       'tv', 'headphones', 'notebooks', 'medicines',
       'beverages', 'vehicle',
       'infrastructure', 'food',  'celery',
       'ansible', 'espresso', 
       'petrel', 'macbook', 'books', 'vinted', 'bikes', 'home',
       'travel insurance', 'earthsofts equis', 'power', 'journals',
       'big machines', 
        'meal',  'meal vouchers',
       'laptop', 
       'textbooks', 'nail polish', 'fridge',  'company car',
       'book', 'goodies', 'fresh fruits', 
        'refreshments', 'plan',
       'disability insurance', 'free drinks', 'fruit', 
      'notebook',
       'passport',
       'opportunity',
       'artificial barriers',
       'tessian anc headphones', 'coffee machine', 'hummus',
       'dinner', 
       'apparel', 'uniform', 'restaurant tickets', 'transport tickets',
       'gear',   'pizza',
       'beer', 'vouchers', 
        'dress',
        'conference tickets', 'cell phone', 'sport vouchers',
       'consumer', 'phone',
        'friday drinks',
       'beanbags', 'guitars', 'table football',
       'meals', 
       'shiny',
       'satellite', 'desks', 
       'mobile phone', 
       'outlook', 'word', 
       'jupiter', 
       'barista', 'coffee machines', 
       'furniture',  'health care plan',
       'death service plan', 'cycle to work scheme', 
       'cinema tickets', 
     'company', 
       'kitchen', 'gourmet coffee', 'teas', 'leaseauto', 'healthy snacks',
       'pc computing',  'hd tv', 'pass', 
         'bicycle', 'plans',
       'oil', 'gas', 
        'fast food', 'ship',
       'gas turbines', 'cntk', 'nvidia tx2,', 'nvidia xavier',
       'sap', 'db2.']

In [13]:
# Person-like entities to be excluded from the skills.
xl_per = [
    'ltat de lart',
    'john doe',
    'monte',
    'tiki',
    'san francisco',
    'roche',
    'jhu',
    'veronika grollova',
    'frederik norgaard',
    'kennis van',
    'balderton',
]

In [14]:
# Blank out the skills that appear in either exclusion list; the blanked rows
# are set to None so they can be dropped as missing values.
# The lookup set is built once here instead of concatenating the two lists
# and scanning the result linearly for every row.
_excluded_skills = set(xl_prd) | set(xl_per)
df_skills['skill'] = df_skills['skill'].apply(
    lambda skill: None if skill in _excluded_skills else skill
)

In [15]:
# Drop the rows whose skill is missing.
df_skills = df_skills.dropna(subset=['skill'])

In [16]:
# Count the rows (non-null ids) per skill, most frequent first, and write the
# result to skills.csv for manual refining.
skill_counts = df_skills.groupby('skill')['id'].count()
df_skill_cnt = skill_counts.reset_index().sort_values('id', ascending=False)
df_skill_cnt.to_csv('skills.csv')

In [17]:
# Load the manually refined skills. Besides 'skill', the file contributes the
# columns 'index', 'count' and 'keep' when merged into df_skills.
refined_file = './01_data/manual/skills_refined.csv'
df_skills_r = pd.read_csv(refined_file)

In [18]:
# Inspect the skills table after filtering, before the manual review is merged in.
df_skills

Unnamed: 0.1,Unnamed: 0,id,skill,category,confidence score
0,0,4.148184e+09,technologie-themen,Skill,0.84
1,1,4.148184e+09,technologie-themen,Skill,0.84
2,2,4.203393e+09,artificial intelligence,Skill,0.91
3,3,4.203393e+09,machine learning,Skill,0.83
7,7,4.203393e+09,datasets,Skill,0.80
...,...,...,...,...,...
33281,33281,4.147913e+09,decision tree,Skill,0.95
33282,33282,4.147913e+09,random,Skill,0.58
33283,33283,4.147913e+09,neural network,Skill,0.99
33284,33284,4.189587e+09,machine learning,Skill,1.00


In [19]:
# Attach the manual review columns to every extracted skill (left join on the skill text).
df_skills = df_skills.merge(df_skills_r, how='left', on='skill')

In [20]:
# Keep only the skills that carry a value in the manual 'keep' column.
df_skills = df_skills.dropna(subset=['keep'])

In [21]:
# Inspect the skills that remain after rows with a missing 'keep' value were dropped.
df_skills

Unnamed: 0.1,Unnamed: 0,id,skill,category,confidence score,index,count,keep
2,2,4.203393e+09,artificial intelligence,Skill,0.91,298.0,46.0,y
3,3,4.203393e+09,machine learning,Skill,0.83,2414.0,479.0,y
4,7,4.203393e+09,datasets,Skill,0.80,1346.0,73.0,y
5,8,4.203393e+09,cybersecurity,Skill,0.87,1075.0,2.0,y
6,9,4.203393e+09,developing,Skill,0.62,1459.0,96.0,y
...,...,...,...,...,...,...,...,...
22370,33280,4.147913e+09,machine learning,Skill,0.98,2414.0,479.0,y
22371,33281,4.147913e+09,decision tree,Skill,0.95,1369.0,5.0,y
22373,33283,4.147913e+09,neural network,Skill,0.99,2779.0,7.0,y
22374,33284,4.189587e+09,machine learning,Skill,1.00,2414.0,479.0,y


#### Generate the sentence embedding to aggregate the duplicate skills

In [22]:
# Load the sentence-embedding model by name.
onlinemodel = 'bert-large-nli-mean-tokens'
embedder = SentenceTransformer(onlinemodel)

In [23]:
# Embed each distinct skill once; the embeddings are used to aggregate similar skills.
queries = df_skills['skill'].unique().tolist()
query_embeddings = embedder.encode(queries)

In [24]:
# Function to replace each skill with its most similar counterpart
def remove_sim(f_queries, f_embeddings, threshold=0.9, self_cutoff=0.99):
    """Map each query to its nearest neighbour when the two are similar enough.

    Queries are visited in order. For query ``i`` the cosine similarity to all
    current embeddings is computed and every score at or above ``self_cutoff``
    is zeroed; this removes the query itself and any entry that already holds
    the same embedding after an earlier replacement. The best remaining
    candidate ``x`` is accepted when its score reaches ``threshold`` and ``i``
    has not itself been used as the replacement for an earlier query. On
    acceptance, query ``i`` and its embedding are overwritten with those of
    ``x``.

    Both ``f_queries`` and ``f_embeddings`` are modified in place; pass copies
    if the originals must be kept.

    Parameters
    ----------
    f_queries : list
        Query strings; entries are overwritten with their replacement.
    f_embeddings : indexable collection of vectors
        One embedding per query, in the same order; rows are overwritten.
    threshold : float, default 0.9
        Minimum cosine similarity for a replacement.
    self_cutoff : float, default 0.99
        Scores at or above this value are ignored as "same item".

    Returns
    -------
    tuple
        ``(map_query, f_embeddings)`` where ``map_query`` holds one
        ``(original, replacement)`` pair per query, in input order
        (``replacement == original`` when nothing was replaced).
    """
    # Working copy used for the similarity search; kept in sync with f_embeddings.
    np_em = np.array(f_embeddings)
    map_query = []
    # Indices that have served as a replacement; these are never replaced
    # themselves, which avoids chains of replacements.
    replacement_targets = set()
    for i in range(len(f_queries)):
        sim = cosine_similarity([f_embeddings[i]], np_em)
        sim[sim >= self_cutoff] = 0
        # sim has shape (1, n), so the flat argmax is the column index.
        x = int(np.argmax(sim))
        if sim[0][x] >= threshold and i not in replacement_targets:
            f_embeddings[i] = f_embeddings[x]
            np_em[i] = np_em[x]
            map_query.append((f_queries[i], f_queries[x]))
            f_queries[i] = f_queries[x]
            replacement_targets.add(x)
        else:
            map_query.append((f_queries[i], f_queries[i]))
    return map_query, f_embeddings

In [25]:
# Work on copies, because remove_sim overwrites the lists it is given.
f_queries = list(queries)
f_query_embeddings = query_embeddings.copy()
new_query, new_embeddings = remove_sim(f_queries, f_query_embeddings)

In [26]:
# Rename every skill to its merged name. new_query holds one
# (original, replacement) pair per entry of queries, in the same order, so the
# lookup table is built once instead of scanning queries with .index() per row.
# An unknown skill still raises (KeyError), as the .index() lookup did (ValueError).
_skill_renames = {original: pair[1] for original, pair in zip(queries, new_query)}
df_skills['skill'] = df_skills['skill'].apply(lambda skill: _skill_renames[skill])

In [27]:
# Renaming can leave the same skill twice in one job description; keep one.
df_skills = df_skills.drop_duplicates(subset=['id', 'skill'])

In [28]:
# Inspect the skills after similar skill names were replaced.
df_skills

Unnamed: 0.1,Unnamed: 0,id,skill,category,confidence score,index,count,keep
2,2,4.203393e+09,artificial intelligence models,Skill,0.91,298.0,46.0,y
3,3,4.203393e+09,machine learning.,Skill,0.83,2414.0,479.0,y
4,7,4.203393e+09,data sets,Skill,0.80,1346.0,73.0,y
5,8,4.203393e+09,cyber security,Skill,0.87,1075.0,2.0,y
6,9,4.203393e+09,develop,Skill,0.62,1459.0,96.0,y
...,...,...,...,...,...,...,...,...
22370,33280,4.147913e+09,machine learning.,Skill,0.98,2414.0,479.0,y
22371,33281,4.147913e+09,decision tree,Skill,0.95,1369.0,5.0,y
22373,33283,4.147913e+09,neural networks,Skill,0.99,2779.0,7.0,y
22374,33284,4.189587e+09,machine learning.,Skill,1.00,2414.0,479.0,y


In [29]:
# Build the refined main table: one row per (posting, skill), carrying the
# posting attributes needed for the analysis.
full_columns = ['id', 'type', 'posting_date', 'description', 'title', 'country',
                'employer', 'industry', 'source', 'skill']
df_full = df_skills.merge(df_main, how='left', on='id')[full_columns]

In [30]:
# Skill rows whose posting was removed from the main table as a duplicate have
# no description after the left join; drop them.
df_full = df_full.dropna(subset=['description'])

In [31]:
# Load the data-jobs table for cleaning.
# The path uses the same lower-case directory names as the other input files
# ('./01_data/output/...'); the mixed-case spelling only resolves on
# case-insensitive filesystems.
data_file = './01_data/output/datajobs.csv'
df_data = pd.read_csv(data_file)

In [32]:
# Remove repeated postings in two passes: first rows with identical content,
# then rows sharing an id (the same id can reappear with slightly different text).
df_data = df_data.drop_duplicates(subset=['employer', 'description', 'title', 'location'])
df_data = df_data.drop_duplicates(subset=['id'])

In [33]:
# Drop postings without a description.
df_data = df_data.dropna(subset=['description'])
# Parse the posting dates from strings with dateutil.
df_data['posting_date'] = df_data['posting_date'].apply(parser.parse)

### Data analysis
- Jobs based analysis: by countries, by type(level), by industries
- Skills based analysis: by countries, by level, by industries.
- Seek to combine the above.
- Keyword analysis in scattertext between Sr. and Jr.

In [34]:
# Number of top entries shown in the ranking charts
top_n = 20
# Height and width used for the wide charts
b_height, b_width = 300, 700

##### Job distribution

In [35]:
# Data-scientist postings per calendar month, busiest month first
main_posting_month = df_main['posting_date'].dt.to_period("M")
df_main.groupby(main_posting_month)['id'].count().sort_values(ascending=False)

posting_date
2019-10    475
2019-11    274
2019-09      7
2019-08      5
2019-07      3
2019-06      1
Freq: M, Name: id, dtype: int64

In [36]:
# Data-job postings per calendar month, busiest month first
data_posting_month = df_data['posting_date'].dt.to_period("M")
df_data.groupby(data_posting_month)['id'].count().sort_values(ascending=False)

posting_date
2019-10    1980
2019-11    1243
2019-09      28
2019-08      22
2019-07       8
2019-06       4
2018-02       2
2019-04       1
Freq: M, Name: id, dtype: int64

##### Skills distribution

In [37]:
# Number of skills per posting, keyed by (type, id)
df_id = df_full.groupby(['type', 'id'])['skill'].count()

In [38]:
# Turn the counts into a flat frame with columns type, id, cnt
df_id = df_id.reset_index(name='cnt')

In [39]:
title_str = "Skill Distribution By Types"
chart_title = alt.TitleParams(
    text=title_str,
    align='center',
    font='Ubuntu Mono',
    fontSize=20,
    color='#3E454F',
)
type_axis = alt.Axis(labelAngle=0, labelOverlap=False)

# One box per posting type, showing the spread of skills per posting
main_chart = (
    alt.Chart(df_id)
    .mark_boxplot(size=50, extent=0.5)
    .encode(
        x=alt.X('type:O', title='Type', axis=type_axis),
        y=alt.Y('cnt:Q', title='# of skills'),
        color=alt.Color('type', scale=alt.Scale(scheme='set2'), title='Type'),
    )
    .properties(height=b_height, width=b_width, title=chart_title)
)

# NOTE: despite the variable name, this red rule is drawn at the MEAN of cnt
median_line = (
    alt.Chart(df_id)
    .mark_rule(color='red')
    .encode(
        y=alt.Y('mean(cnt):Q', title='# of skills'),
        size=alt.value(5),
    )
)
main_chart + median_line

##### Skill analysis by levels - find out top n skills across levels

In [40]:
# Rows per (type, skill), most frequent first, as a flat frame
type_skill_counts = df_full.groupby(['type', 'skill'])['id'].count()
df_type = type_skill_counts.sort_values(ascending=False).reset_index(name='cnt')

In [41]:

title_str = " Top %d Skills By Types" % (top_n)
# Stacked bars of the top_n skills of each type: one bar per skill, one
# segment per type. Each channel is built with its matching class
# (alt.X for x, alt.Y for y) rather than the swapped ones.
alt.Chart(df_type.groupby('type').head(top_n)).mark_bar().encode(
    x=alt.X('skill:O', sort='-y', title='skill',
            axis=alt.Axis(labelAngle=-40, labelOverlap=False)),
    y=alt.Y('cnt:Q', title='# of count'),
    color=alt.Color('type', scale=alt.Scale(scheme='set2'), title='Type'),
    # Sort the segments of the bars by this field
    order=alt.Order('type', sort='ascending'),
).properties(
    height=b_height, width=b_width,
    title=alt.TitleParams(text=title_str,
                          align='center',
                          font='Ubuntu Mono',
                          fontSize=20,
                          color='#3E454F'),
)

##### Skill analysis by countries - find out the top n skills in the top n countries by job postings

In [42]:
# Count the job postings per (country, type), largest groups first
country_type_counts = df_main.groupby(['country', 'type'])['id'].count()
df_country = country_type_counts.sort_values(ascending=False).reset_index(name='# of jobs')

In [52]:
v_height = 400
v_width = 400
title_str = " Top %d Countries By Types" % (top_n)
# Rank countries by their total postings over all types and keep every type
# row of the top_n countries. Taking head(top_n) of df_country would instead
# keep the top_n (country, type) rows, i.e. fewer countries with incomplete bars.
top_countries = df_country.groupby('country')['# of jobs'].sum().nlargest(top_n).index
alt.Chart(df_country[df_country['country'].isin(top_countries)]).mark_bar().encode(
    x=alt.X('# of jobs:Q', title='# of jobs'),
    y=alt.Y('country:O', sort='-x', title='countries'),
    color=alt.Color('type', scale=alt.Scale(scheme='set2'), title='Type'),
).properties(
    height=v_height, width=v_width,
    title=alt.TitleParams(text=title_str,
                          align='center',
                          font='Ubuntu Mono',
                          fontSize=14,
                          color='#3E454F'),
)

In [44]:
# Restrict the skills to the top n countries by total job postings, then count
# rows per (type, skill). Filtering on every country present in df_country
# would keep all postings and make the "by countries" view identical to the
# overall one.
country_totals = df_country.groupby('country')['# of jobs'].sum()
top_countries = country_totals.nlargest(top_n).index
df_type_c = df_full[df_full['country'].isin(top_countries)].groupby(
    ['type', 'skill']).count()['id'].sort_values(ascending=False)
df_type_c = df_type_c.reset_index()
df_type_c.columns = ['type', 'skill', 'cnt']

In [45]:
title_str = " Top %d Skills By Countries" % (top_n)
# Stacked bars of the top_n skills of each type within the selected countries.
# Each channel is built with its matching class (alt.X for x, alt.Y for y)
# rather than the swapped ones.
alt.Chart(df_type_c.groupby('type').head(top_n)).mark_bar().encode(
    x=alt.X('skill:O', sort='-y', title='skill',
            axis=alt.Axis(labelAngle=-40, labelOverlap=False)),
    y=alt.Y('cnt:Q', title='# of count'),
    color=alt.Color('type', scale=alt.Scale(scheme='set2'), title='Type'),
    # Sort the segments of the bars by this field
    order=alt.Order('type', sort='ascending'),
).properties(
    height=b_height, width=b_width,
    title=alt.TitleParams(text=title_str,
                          align='center',
                          font='Ubuntu Mono',
                          fontSize=20,
                          color='#3E454F'),
)

##### Skill analysis by industries - find out the top n skills in the top industries by job postings

In [46]:
# Count the job postings per (industry, type), largest groups first,
# leaving out the "unclassified" industry
classified = df_main[df_main['industry'] != 'unclassified']
industry_type_counts = classified.groupby(['industry', 'type'])['id'].count()
df_industry = industry_type_counts.sort_values(ascending=False).reset_index(name='# of jobs')

In [56]:
v_height = 400
v_width = 400
title_str = " Top Industries By Types"
# Rank industries by their total postings over all types and keep every type
# row of the top_n industries. Taking head(top_n) of df_industry would instead
# keep the top_n (industry, type) rows, i.e. fewer industries with incomplete bars.
top_industries = df_industry.groupby('industry')['# of jobs'].sum().nlargest(top_n).index
alt.Chart(df_industry[df_industry['industry'].isin(top_industries)]).mark_bar().encode(
    x=alt.X('# of jobs:Q', title='# of jobs'),
    # The axis shows industries, so it is labelled accordingly.
    y=alt.Y('industry:O', sort='-x', title='industries'),
    # Legend title matches the other charts.
    color=alt.Color('type', scale=alt.Scale(scheme='set2'), title='Type'),
).properties(
    height=v_height, width=v_width,
    title=alt.TitleParams(text=title_str,
                          align='center',
                          font='Ubuntu Mono',
                          fontSize=14,
                          color='#3E454F'),
)

In [48]:
# Restrict the skills to the top n industries by total job postings
# ("unclassified" is already absent from df_industry), then count rows per
# (type, skill).
industry_totals = df_industry.groupby('industry')['# of jobs'].sum()
top_industries = industry_totals.nlargest(top_n).index
df_type_i = df_full[df_full['industry'].isin(top_industries)].groupby(
    ['type', 'skill']).count()['id'].sort_values(ascending=False)
df_type_i = df_type_i.reset_index()
df_type_i.columns = ['type', 'skill', 'cnt']

In [49]:
# This chart is built from the industry-filtered skills (df_type_i), so the
# title names industries rather than countries.
title_str = " Top %d Skills By Industries" % (top_n)
# Stacked bars of the top_n skills of each type. Each channel is built with
# its matching class (alt.X for x, alt.Y for y) rather than the swapped ones.
alt.Chart(df_type_i.groupby('type').head(top_n)).mark_bar().encode(
    x=alt.X('skill:O', sort='-y', title='skill',
            axis=alt.Axis(labelAngle=-40, labelOverlap=False)),
    y=alt.Y('cnt:Q', title='# of count'),
    color=alt.Color('type', scale=alt.Scale(scheme='set2'), title='Type'),
    # Sort the segments of the bars by this field
    order=alt.Order('type', sort='ascending'),
).properties(
    height=b_height, width=b_width,
    title=alt.TitleParams(text=title_str,
                          align='center',
                          font='Ubuntu Mono',
                          fontSize=20,
                          color='#3E454F'),
)