## Introduction
#### Data analysis & visualization of data scientist skills extracted from the job descriptions on 2 hiring websites

In [17]:
from __future__ import print_function
import pandas as pd
import numpy as np

# Text preprocessing
import os,re

# Warning control (only imported here; no warning filters are set in this cell)
import warnings

#Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Other utils
from tqdm import tqdm  # Progress bar

#EDA tools.
import dtale

# nlp text cleaning
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer

# Transformers
from transformers import pipeline
import ipywidgets as widgets
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Clustering algorithms
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans

In [2]:
# Main table: job postings restricted to data scientist roles
ds_file = './01_data/output/datascientists.csv'
df_main = pd.read_csv(filepath_or_buffer=ds_file)

In [3]:
# Drop duplicated job postings: rows sharing employer, description, title and
# location are treated as the same posting and only the first is kept.
# The result is rebound rather than mutated with inplace=True, so the cell is
# an explicit assignment and never modifies an object other names may share.
df_main = df_main.drop_duplicates(
    subset=['employer', 'description', 'title', 'location']
)

In [4]:
# Skills table: entities extracted from the job descriptions by Azure
az_skills = './01_data/output/az_skills.csv'
df_skills = pd.read_csv(filepath_or_buffer=az_skills)

In [33]:
# Keep only rows whose category is Skill, Product or Person
df_skills = df_skills.loc[
    df_skills['category'].isin(['Skill', 'Product', 'Person'])
]

In [34]:
# Row count of the skills table at this point
df_skills.shape[0]

22703

In [35]:
# Drop skills that are listed more than once for the same id, keeping the
# first occurrence.
# df_skills may be the result of a boolean filter; calling drop_duplicates
# with inplace=True on such a frame can raise SettingWithCopyWarning, so the
# de-duplicated frame is rebound to the name instead.
df_skills = df_skills.drop_duplicates(subset=['id', 'skill'])

In [36]:
# Row count of the skills table at this point
df_skills.shape[0]

22703

In [37]:
# Number of non-null skill entries per id; the count ends up in the 'skill' column
df_id_cnt = (
    df_skills
    .groupby(['id'])['skill']
    .count()
    .reset_index()
)

In [38]:
# Summary statistics of the per-id skill counts (the 'skill' column holds the count).
# NOTE(review): the statistics reported for 'id' look like statistics of an
# identifier and are presumably not meaningful — confirm.
df_id_cnt.describe()

Unnamed: 0,id,skill
count,758.0,758.0
mean,4144643000.0,29.951187
std,68327060.0,18.724069
min,3716076000.0,1.0
25%,4112985000.0,16.25
50%,4169677000.0,28.0
75%,4194223000.0,40.0
max,4210952000.0,107.0


In [39]:
# Number of non-null ids per skill; the count ends up in the 'id' column
df_skills_cnt = (
    df_skills
    .groupby(['skill'])['id']
    .count()
    .reset_index()
)

In [40]:
# Most frequent skills first ('id' holds the per-skill count)
df_skills_cnt.sort_values(by='id', ascending=False)

Unnamed: 0,skill,id
2497,machine learning,479
1123,data,403
4105,statistics,364
533,business,355
3460,python,351
...,...,...
1972,gitlab,1
1973,gizmodo,1
1974,glm/regression,1
1977,globally,1


In [12]:
# Perform EDA on the main table: hand df_main to D-Tale and open the returned
# instance in a browser.
# NOTE(review): open_browser() presumably needs a local browser session —
# confirm before running this notebook headless.
d0 = dtale.show(df_main)
d0.open_browser()

In [41]:
# Name of the pretrained sentence-embedding model, loaded through
# SentenceTransformer (the captured log below shows it running on CPU).
onlinemodel='bert-large-nli-mean-tokens'
embedder = SentenceTransformer(onlinemodel)

2022-05-19 20:50:01,741 - INFO     - Load pretrained SentenceTransformer: bert-large-nli-mean-tokens
2022-05-19 20:51:03,953 - INFO     - Use pytorch device: cpu


In [42]:
# Each distinct skill string is embedded once; queries[i] corresponds to
# query_embeddings[i].
queries = df_skills['skill'].unique().tolist()
query_embeddings = embedder.encode(queries)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=147.0), HTML(value='')))




In [None]:

# Elbow criterion - determine the optimal number of clusters by the elbow rule.
def elbow_plot(data, maxK=15, seed_centroids=None):
    """
    Fit k-means for k = 1 .. maxK-1, print the inertia of each fit and plot
    inertia against k so the "elbow" can be read off the curve.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations to cluster (NumPy array or pandas DataFrame).
    maxK : int, default 15
        Exclusive upper bound on the number of clusters: k runs over
        ``range(1, maxK)``. Must be at least 2.
    seed_centroids : pandas Series/DataFrame or array-like, optional
        Candidate initial centroids, one per row. For each k the first k rows
        are used as the initial centres. When omitted, KMeans uses its
        default initialisation with 100 restarts.

    Returns
    -------
    numpy.ndarray
        Cluster labels of the last fit, i.e. the one with k = maxK - 1.

    Raises
    ------
    ValueError
        If ``maxK`` is smaller than 2, or if ``seed_centroids`` has fewer
        rows than the largest k that will be fitted.
    """
    if maxK < 2:
        # Without this guard no model is ever fitted and the return statement
        # would fail on an unbound variable.
        raise ValueError("maxK must be at least 2 so that at least one k is fitted")

    K = range(1, maxK)

    seeds_all = None
    if seed_centroids is not None:
        seeds_all = np.asarray(seed_centroids)
        if len(seeds_all) < maxK - 1:
            raise ValueError(
                "seed_centroids needs at least maxK - 1 rows "
                "(got %d, need %d)" % (len(seeds_all), maxK - 1)
            )

    sse = []
    for k in K:
        if seeds_all is not None:
            # One row per centre; -1 lets 1-D seeds become (k, 1) while
            # multi-feature seeds keep their feature dimension.
            init_centres = seeds_all[:k].reshape(k, -1)
            # With explicit centres only a single initialisation is possible,
            # so n_init=1 states that directly and avoids sklearn's warning.
            kmeans = KMeans(n_clusters=k, max_iter=500, n_init=1,
                            random_state=0, init=init_centres).fit(data)
        else:
            kmeans = KMeans(n_clusters=k, max_iter=300, n_init=100,
                            random_state=0).fit(data)
        print("k: ", k,"sse: ",kmeans.inertia_)
        # Inertia: sum of squared distances of samples to their closest cluster centre
        sse.append(kmeans.inertia_)

    fig, ax = plt.subplots()
    ax.plot(K, sse, 'bx-')
    ax.set_xlabel('k')
    ax.set_ylabel('Sum_of_squared_distances')
    ax.set_title('Elbow Method For Optimal k')
    plt.show()
    return kmeans.labels_

# Run the elbow analysis on the skill embeddings (default range of k)
elbow_plot(data=query_embeddings)

k:  1 sse:  715951.6875
k:  2 sse:  670928.125
k:  3 sse:  646905.5


In [None]:
# Cluster the skill embeddings with KMeans.
num_clusters = 3
clf = KMeans(n_clusters=num_clusters,
             max_iter=100,
             init='k-means++',
             n_init=1,
             random_state=0)  # fixed seed: with a single init the result would otherwise vary per run
cluster_assignment = clf.fit_predict(query_embeddings)

# One row per query: its cluster, its position in `queries`, and its text.
# The frame is built in a single call; DataFrame.append was removed in
# pandas 2.0 and appending row by row copies the whole frame on every step.
cdf = pd.DataFrame({
    "cluster_id": cluster_assignment,
    "sentence_id": np.arange(len(cluster_assignment)),
    "sentence": queries,
})

cdf.head()

In [30]:
# Perform EDA on the clustering result: hand cdf to D-Tale and open the
# returned instance in a browser.
# NOTE(review): open_browser() presumably needs a local browser session —
# confirm before running this notebook headless.
d1 = dtale.show(cdf)
d1.open_browser()