## Introduction
####  Data analysis & visualiztion on the skills of data scientists from the job description of 2 hiring websites

In [None]:
# uncomment the following if it's required to install the following packages
#!pip install altair pyLDAvis tqdm dtale transformers sentence_transformers spacy scattertext
#!pip install Flask==2.1.0

In [2]:
from __future__ import print_function
import pandas as pd
import numpy as np

# Text preprocessing
import os,re

# Disable warning of 3 types
import warnings

#Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import altair as alt
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()


# Other utils
from tqdm import tqdm  # Progress bar
from datetime import datetime
from dateutil import parser

#EDA tools.
import dtale

# nlp text cleaning
import nltk
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer

# Transformers
from transformers import pipeline
import ipywidgets as widgets
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Clustering algorithms
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import LatentDirichletAllocation
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

# Visualizing text
import spacy
import scattertext


BILINEAR is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BILINEAR instead.


NEAREST is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.NEAREST or Dither.NONE instead.


`np.typeDict` is a deprecated alias for `np.sctypeDict`.



In [3]:
# load training data
train_data_path="./01_data/WikiLarge_Train.csv"
train_data=pd.read_csv(train_data_path)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



## Embedding-based clustering analysis

In [4]:
#onlinemodel='bert-large-nli-mean-tokens'
onlinemodel='distiluse-base-multilingual-cased-v2'
embedder = SentenceTransformer(onlinemodel)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
#queries = list(jobs['job_desc'][0:30])
queries_0 = list(train_data[train_data['label']==0]['original_text'])
query_embeddings_0=[]
for item in tqdm(queries_0):
    query_embeddings_0.append(embedder.encode([item]))


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

 50%|█████     | 104326/208384 [1:02:33<1:03:05, 27.49it/s]

In [None]:
query_embeddings_0

In [None]:
def remove_sim(i,queries,embeddings,threshold=0.9):
    np_em=np.array(embeddings)
    sim=cosine_similarity([embeddings[i]],np_em[0:])
    
    sim[np.where(sim>=1)]=0
    #print(sim)
    x=np.argmax(sim)
    
    if sim[0][x]>=threshold:
        #print(x,sim[0][x],queries[i],queries[x])
        return queries[x]
    else:
        return queries[i]


In [None]:

# Elbow criterion - Determine optimal numbers of clusters by elbow rule.
def elbow_plot(data, maxK=15, seed_centroids=None):
    """
        parameters:
        - data: pandas DataFrame (data to be fitted)
        - maxK (default = 10): integer (maximum number of clusters with which to run k-means)
        - seed_centroids (default = None ): float (initial value of centroids for k-means)
    """
    sse = []
    K= range(1, maxK)
    for k in K:
        if seed_centroids is not None:
            seeds = seed_centroids.head(k)
            kmeans = KMeans(n_clusters=k, max_iter=500, n_init=100, random_state=0, init=np.reshape(seeds, (k,1))).fit(data)
            #data["clusters"] = kmeans.labels_
        else:
            kmeans = KMeans(n_clusters=k, max_iter=300, n_init=100, random_state=0).fit(data)
            #data["clusters"] = kmeans.labels_
        print("k: ", k,"sse: ",kmeans.inertia_)
        # Inertia: Sum of distances of samples to their closest cluster center
        sse.append(kmeans.inertia_)
    plt.figure()
    plt.plot(K,sse,'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()
    return kmeans.labels_

# Run Elbow
elbow_plot(query_embeddings_0)

In [None]:
# As clustering algorithm KMeams is a perfect fit.
num_clusters = 3
clf = KMeans(n_clusters=num_clusters, 
            max_iter=100, 
            init='k-means++', 
            n_init=1)
clf.fit_predict(query_embeddings_0)
cluster_assignment = clf.labels_

cdf=pd.DataFrame(columns=["cluster_id","sentence_id","sentence"])

for i in range(len(cluster_assignment)):
    new_row=pd.Series(data={"cluster_id":cluster_assignment[i],
                                "sentence_id":i,
                                "sentence":queries_0[i]
                           }
                            )
    cdf=cdf.append(new_row,ignore_index=True)

cdf.head()

In [None]:
# Using PCA to reduce the dimension to project the result to 2-d scatter plot

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(query_embeddings_0)

df_pca = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

df_pca['sentence']=queries_0

In [None]:
# Perform EDA to check clustering result
d2 = dtale.show(df_pca)
d2.open_browser()
# Using PCA, it could not clearly identify different group

In [None]:
# Combine PCA results with K-means results to see clustering
df_k=df_pca.merge(cdf,right_on=['sentence'],left_on=['sentence'])

In [None]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(df_main['description_cln'])
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(df_main['description_cln'])

In [None]:
# Using LDA to cluster skills
lda_tf = LatentDirichletAllocation(n_components=3, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=3, random_state=0)
lda_tfidf.fit(dtm_tfidf)

In [None]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [None]:
# The above result present topic modelling may not suggest meaningful clustering.
# However, it may suggest I can combine all skills into sentence, then using topic modelling to analysis.

In [None]:
df_k

In [None]:
    height=600
    width=800
    # Create scatter plot to display death and confirmed cases by countries
    scat=alt.Chart(df_k).mark_circle(size=100).encode(
        y=alt.Y("principal component 1", axis=alt.Axis(format='f', title='PC 1')),
        x=alt.X("principal component 2",axis=alt.Axis(format='f', title='PC 2')),
        #color=alt.condition(select_country,alt.value("red"),alt.value("#66B2FF")),
        color='cluster_id',
        tooltip=["skills"]
    ).properties(
        height=height, width=width,
        title = alt.TitleParams(text = 'Clusters of skills',
                                anchor='middle',
                                font = 'Ubuntu Mono', 
                                fontSize = 16, 
                                color = '#3E454F', 
                                )
    )
    scat

In [None]:
# The clustering of embedding based also has no clue on the skills clustering.

In [None]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [None]:
def top_tfidf_feats(row, features, top_n=20):
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats, columns=['features', 'score'])
    return df

def top_feats_in_doc(X, features, row_id, top_n=25):
    row = np.squeeze(X[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

def top_mean_feats(X, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    if grp_ids:
        D = X[grp_ids].toarray()
    else:
        D = X.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [None]:

stemmer = SnowballStemmer("english")
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [None]:
# At this point we are going to tokenize the bodies and convert them
# into a document-term matrix.

# Some note on min_df and max_df
# max_df=0.5 means "ignore all terms that appear in more then 50% of the documents"
# min_df=2 means "ignore all terms that appear in less then 2 documents"
stopwords = ENGLISH_STOP_WORDS.union(['data','scientist'])
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.5, min_df=2,use_idf=True,max_features=200000, 
                       tokenizer=tokenize_only,ngram_range=(1,3))

X = vect.fit_transform(df_tk['titles'].values)
print(X.shape)
features = vect.get_feature_names()

In [None]:
def top_feats_per_cluster(X, y, features, min_tfidf=0.1, top_n=25):
    dfs = []

    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label) 
        feats_df = top_mean_feats(X, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

def plot_tfidf_classfeats_h(dfs):
    fig = plt.figure(figsize=(15, 9+len(dfs)*6), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(len(dfs),3, i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("cluster = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.score, align='center', color='#7530FF')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.features)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()

In [None]:
#Use this to print the top terms per cluster with matplotlib.
plot_tfidf_classfeats_h(top_feats_per_cluster(X, t_clf.labels_, features, 0.1, 25))

In [None]:
# The above is more than clear, the tiltes are consisted of 3 types: junior, senior, and others

In [None]:
# Decide to divide the job postings into 3 types:junior, senior, and others
def ds_level(title=''):
    jr = ["junior", "jr", "jr.","intern","internship","young","student","analyst","associate"]
    sr = ["sr.","sr","senior","lead","leading","principal","president"]

    if any(x in title.lower() for x in jr):
        return "junior"

    if any(x in title.lower() for x in sr):
        return "senior"
    
    return "others"


In [None]:
df_skills

In [None]:
df_main['type']=df_main['title'].apply(lambda x: ds_level(x))

In [None]:
df_main['id']=df_main['id'].astype('int64')

In [None]:
df_full=df_skills.merge(df_main,how='left',left_on=['id'],right_on=['id'])

In [None]:
# Perform EDA to check main table
d4 = dtale.show(df_full)
d4.open_browser()

In [None]:
# Using scattertext to visualize the skills by types
nlp = spacy.load('en_core_web_sm')

In [None]:
    corpus = (scattertext.CorpusFromPandas(df_full,
                                           category_col='type', 
                                           text_col='skill',
                                           nlp=nlp)
              .build()
              .remove_terms(nlp.Defaults.stop_words, ignore_absences=True)
              )

In [None]:
df = corpus.get_term_freq_df()

In [None]:
df

In [None]:
html = scattertext.produce_scattertext_explorer(
                   corpus,
                   category='senior',
                   category_name='senior',
                   not_category_name=['junior'],
                   width_in_pixels=1000,
                   )

In [None]:
#open("ds_skills.html", 'wb').write(html.encode('utf-8'))
#with open("ds_skills.html", 'w') as outf: outf.write(html)
from IPython.display import IFrame    
display(IFrame("ds_skills.html", width=900, height=650))

In [None]:

# data clean for data jobs
data_file= './01_Data/Output/datajobs.csv'
df_data=pd.read_csv(data_file)
# Drop the duplicated job postings 
df_data.drop_duplicates(subset=['employer','description','title','location'],inplace=True)
# Drop the job posting with same id even the above would be a little different.
df_data.drop_duplicates(subset=['id'],inplace=True)
# drop na
df_data.dropna(subset=['description'],inplace=True)
# Change string to datetime
df_data['posting_date']=df_data['posting_date'].apply(lambda x: parser.parse(x))

In [None]:
#remove data scientists job from data jobs
print(len(df_data))
ds_list=df_main['id'].unique()
df_data['id']=df_data['id']=df_data['id'].apply(lambda x: None if x in (ds_list) else x)
df_data.dropna(subset=['id'],inplace=True)
print(len(df_data))

In [None]:
# Convert titles to embedding
titles = list(df_data['title'].unique())
titles_embeddings = embedder.encode(titles)

In [None]:
# Run Elbow to decide K for k-means
elbow_plot(titles_embeddings,maxK=8)

In [None]:
# As clustering algorithm KMeams is a perfect fit.
num_clusters = 3
t_clf = KMeans(n_clusters=num_clusters, 
            max_iter=100, 
            init='k-means++', 
            n_init=1)
t_clf.fit_predict(titles_embeddings)
t_cluster_assignment = t_clf.labels_

t_cdf=pd.DataFrame(columns=["cluster_id","sentence_id","sentence"])

for i in range(len(t_cluster_assignment)):
    new_row=pd.Series(data={"cluster_id":t_cluster_assignment[i],
                                "sentence_id":i,
                                "sentence":titles[i]
                           }
                            )
    t_cdf=t_cdf.append(new_row,ignore_index=True)

t_cdf.head()

In [None]:
# Using PCA to reduce the dimension to project the result to 2-d scatter plot

t_pca = PCA(n_components=2)
t_principalComponents = t_pca.fit_transform(titles_embeddings)


df_tpca = pd.DataFrame(data = t_principalComponents
             , columns = ['principal component 1', 'principal component 2'])

df_tpca['titles']=titles

In [None]:
# Combine PCA results with K-means results to see clustering
df_tk=df_tpca.merge(t_cdf,right_on=['sentence'],left_on=['titles'])

In [None]:
# At this point we are going to tokenize the bodies and convert them
# into a document-term matrix.

# Some note on min_df and max_df
# max_df=0.5 means "ignore all terms that appear in more then 50% of the documents"
# min_df=2 means "ignore all terms that appear in less then 2 documents"
stopwords = ENGLISH_STOP_WORDS.union(['data','scientist'])
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.5, min_df=2,use_idf=True,max_features=200000, 
                       tokenizer=tokenize_only,ngram_range=(1,3))

X = vect.fit_transform(df_tk['titles'].values)
print(X.shape)
features = vect.get_feature_names()

In [None]:
    height=600
    width=800
    # Create scatter plot to display death and confirmed cases by countries
    tscat=alt.Chart(df_tk).mark_circle(size=100).encode(
        y=alt.Y("principal component 1", axis=alt.Axis(format='f', title='PC 1')),
        x=alt.X("principal component 2",axis=alt.Axis(format='f', title='PC 2')),
        #color=alt.condition(select_country,alt.value("red"),alt.value("#66B2FF")),
        color=alt.Color('cluster_id', scale=alt.Scale(scheme='accent')),
        tooltip=["titles"]
    ).properties(
        height=height, width=width,
        title = alt.TitleParams(text = 'Clusters of titles',
                                anchor='middle',
                                font = 'Ubuntu Mono', 
                                fontSize = 16, 
                                color = '#3E454F', 
                                )
    )
    tscat

In [None]:
#Use this to print the top terms per cluster with matplotlib.
plot_tfidf_classfeats_h(top_feats_per_cluster(X, t_clf.labels_, features, 0.1, 25))

In [None]:
# Select data scientist jobs
print(len(df_data))
df_de=df_data[df_data['title'].str.contains(r'^(?=.*data)(?=.*engineer)',case=False)]
print(len(df_de))