# Classifying job postings from Indeed.com

adapeted from by https://whatisai.github.io/Gaussian-Mixture-Models/

The structure is as follows :

1. Read a corpus from a number of job postings (e.g. https://www.kaggle.com/datasets/yusufolonade/data-science-job-postings-indeed-usa)
       
2. Create Tf-idf features using 1,2 and 3-gram bag of words (TfidfVectorizer from sklearn.feature_extraction.text)

3. Perform unsupervised classification of the job-postings with kmeans++ and GMM from sklearnadapted 

In [24]:
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
import numpy as np
%matplotlib inline

### Data Preprocessing and feature creation
Read the data and use some stop word lists for cleaning the corpus.

In [25]:
def corpus_stop_word_cleaner(corpus, stop_words_input=None):
    '''
    This function just removes some words from the corpus in case you realize you want to filter out more words.
    Inputs: corpus
    Outputs: stop_words filtered out of corpus
    '''
    cc=[0]
    if type(corpus) != list: 
        cc[0] = corpus
        corpus = cc
        
    for ic,text_original in enumerate(corpus):       
        stop_words_base = set(stopwords.words("english")) # Filter out any stop words
        if stop_words_input != None: stop_words_base = stop_words_base.union(stop_words_input)
        stop_words_jobs = set(['job','jobs','candidate','candidates','apply','now','skills','application','new',
                           'group','day','company','experience','our','job','position',
                           'pay','train','training','team','staff','indeed','work','working',
                           'yes','we','us','pay','hour','hours','uk','london','hire',
                           'team','within','slavery','therefore','opportunities','opportunity',
                           'motivation','motivated','he','she','he/she','much','very',
                              'cookies','com','asos','postcode','ago','date','benefits',
                              'cv','role','cookies','com','asos','postcode','ago','date',
                               'benefits','religion','sexual','orientation','salary','asap',
                               'annum','race','like' ,'may','enjoy','keywords' ])
    
        stop_words = stop_words_base.union(stop_words_jobs)
        text = [w.lower() for w in text_original.split() if w.lower() not in stop_words]
        text_original = ' '.join(text)
        corpus[ic] = text_original
    if len(corpus) == 1 :
        return corpus[ic]
    else:
        return corpus

In [26]:
# Read the CSV into a pandas data frame (df)
df = pd.read_csv('../../DataDirectory/data_science_jobs_indeed_usa.csv', delimiter=',')

In [27]:
corpus = list(df['Description'])

#call the stp word removal function
corpus =  corpus_stop_word_cleaner(corpus, ['cv'])

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# let's play with max_features: If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. 
def create_features(corpus, nmin=1,nmax=3,nfeat=10000):    
    #using or regular Tfidf sklearn function
    vectorizer = TfidfVectorizer(ngram_range=(nmin,nmax), 
                                 sublinear_tf = True, max_features = nfeat)
    job_features = vectorizer.fit_transform(corpus)
    return vectorizer, job_features

In [29]:

vect_ds, indeed_ds_features = create_features(corpus, nmin=1, nmax=3, nfeat=500)
#inspect some of the created features
print('Shape of extracted features',indeed_ds_features.toarray().shape)
print('Some features')
print(vect_ds.get_feature_names_out()[20:100])
print('Features are stored in a sparse format',type(indeed_ds_features))

Shape of extracted features (1200, 500)
Some features
['analyst' 'analysts' 'analytical' 'analytics' 'analytics data'
 'analytics data science' 'analytics machine' 'analytics machine learning'
 'analyze' 'analyzing' 'and' 'and or' 'applications' 'applied' 'applying'
 'appropriate' 'architecture' 'areas' 'artificial' 'as' 'assist'
 'automate' 'aws' 'azure' 'bachelor' 'bachelor degree' 'background'
 'backup' 'based' 'best' 'best practices' 'bi' 'big' 'big data' 'bring'
 'build' 'building' 'building data' 'business' 'business analysis'
 'business analyst' 'business challenges' 'business intelligence'
 'business intelligence solutions' 'business needs' 'business problems'
 'business process' 'business processes' 'business requirements'
 'business stakeholders' 'capabilities' 'cases' 'center' 'challenges'
 'changes' 'client' 'clinical' 'closely' 'cloud' 'cloud environment'
 'cloud environment highly' 'cloud public' 'cloud public cloud'
 'clustering' 'code' 'collaborate' 'collaborate data' '

## Classification with kmeans

In [30]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=8, init='k-means++', max_iter=100, n_init=1,
                    verbose=1)
km.fit(indeed_ds_features)

Initialization complete
Iteration 0, inertia 1973.3434870972808.
Iteration 1, inertia 1067.841894129976.
Iteration 2, inertia 1049.4741160619028.
Iteration 3, inertia 1043.7546881829987.
Iteration 4, inertia 1042.34339948647.
Iteration 5, inertia 1041.8734538795452.
Iteration 6, inertia 1041.6658530352934.
Iteration 7, inertia 1041.464387373932.
Iteration 8, inertia 1041.2471510541209.
Iteration 9, inertia 1041.1961440053246.
Converged at iteration 9: strict convergence.


In [31]:
#Labels of each point
km.labels_

array([6, 4, 3, ..., 3, 7, 4], dtype=int32)

Inspect the most important features in each cluster to understand them.

In [32]:
def importance_features(feat_names, km, perc=99.9):
    res=km.__dict__
    for iclass in set(km.labels_):
        print('\n****** \nImportant features for class ',iclass,'\n')
        for ii,iv in enumerate(res['cluster_centers_'][iclass]):
            if iv > np.percentile(res['cluster_centers_'][iclass],perc)  :
                print('{0:<30s}{1}'.format(feat_names[ii],iv))

In [33]:
importance_features(vect_ds.get_feature_names_out(),km, perc=98)


****** 
Important features for class  0 

algorithms                    0.03877973413068355
data                          0.04432968730460575
deep                          0.05148796430928891
deep learning                 0.041448460150211044
learning                      0.24065587484954476
learning models               0.05758499447104636
machine                       0.22995151386985585
machine learning              0.2212641766695532
machine learning models       0.05281433955379254
models                        0.07089138319952278

****** 
Important features for class  1 

business                      0.0709256566965375
closely                       0.05635280346725043
cross                         0.05474078735885329
cross functional              0.05534243598427741
data                          0.11405317958280264
data science                  0.07454417818310743
functional                    0.055664316171007734
help                          0.07407813690861956
science       


Looking at each category, the job adds can be described as:

0. AI - ML
1. 
2. 
3.
4. 
5. DB and Data Science
6. 
7. 

# Classification with GMM

In [34]:
from sklearn.mixture import GaussianMixture

def call_GMM(k, features):
    cov_type='tied'
    gmm = GaussianMixture(n_components=k,
                       covariance_type=cov_type, max_iter=20, random_state=0  )
    gmm.fit(features)    # Learns model parameters
    labels = gmm.predict(features) # assign labels
    return labels, gmm.predict_proba(features), gmm

In [35]:
k=8
#dense data is required
zlab,zprob, estimn = call_GMM(k, indeed_ds_features.toarray())
print('Labels',zlab)
print('Probability of belonging to each cluster', zprob[0:5])

Labels [2 1 4 ... 0 7 1]
Probability of belonging to each cluster [[8.33774107e-047 2.53355085e-053 1.00000000e+000 7.67571282e-104
  9.85165875e-042 7.82865833e-071 2.77526493e-093 2.92331099e-050]
 [8.33504904e-022 1.00000000e+000 2.20469188e-047 1.81929429e-073
  9.98073981e-014 9.03894995e-049 6.10234591e-075 2.19353361e-031]
 [5.97927275e-009 1.98720764e-017 9.61953413e-034 3.57724320e-063
  9.99999994e-001 1.00363881e-050 1.24438056e-078 1.27687296e-022]
 [7.26011745e-041 8.05577675e-058 1.00000000e+000 4.80993575e-092
  9.94393623e-049 1.05908353e-074 3.88851069e-099 6.36233135e-057]
 [2.06129255e-015 3.15152166e-019 1.82993270e-037 2.42805990e-050
  1.00000000e+000 1.13626878e-048 3.56316536e-077 2.00905340e-023]]


In [52]:
#helper function to print features and probabilites
import operator
def importance_features(k, feature_names,estimator,perc=99.9):

    for ik in range(k):        
        impor_feat = {}
        muik = estimator.means_[ik]
        tth = np.percentile(muik,perc)
        for i,iv in enumerate(muik):
            if iv > tth:
                impor_feat[feature_names[i] ] =  iv 
        sorted_x = sorted(impor_feat.items(), key=operator.itemgetter(1),reverse=True  )
        print('\n \n Important features for cluster ', ik)
        for ii in sorted_x:
             print('{0:<30s}{1}'.format(ii[0],ii[1]))

In [37]:
importance_features(k,vect_ds.get_feature_names_out(),estimn,perc=98)


 
 Important features for cluster  0
data                          0.1993610541195355
sources                       0.061005575538085145
analysis                      0.057486192357036475
data sources                  0.056699688483988896
large                         0.04317090495882641
data analysis                 0.043053658797131224
modeling                      0.03562035500156122
sets                          0.0332037108813425
mining                        0.03220980871531062
data sets                     0.030396499229859816

 
 Important features for cluster  1
business                      0.2272450541110653
business intelligence         0.07148103516299564
requirements                  0.07110847999247094
intelligence                  0.06991449089305575
business requirements         0.05017529483184601
analyst                       0.04399143604691083
needs                         0.0358590166725423
solutions                     0.03563582538438177
business analyst       

If fewer features were to be considered, the probabilites are distributed slightly different. For example, consider only 100 features

In [50]:
vect_ds_2, indeed_features_ds_2 = create_features(corpus, nmin=1,nmax=1,nfeat=300)
feat_names_ds_2 = vect_ds_2.get_feature_names_out()
print(indeed_features_ds_2.shape)
z2lab, z2prob, estimn2 = call_GMM(k, indeed_features_ds_2.toarray() )
print('Labels',zlab)
print('Probability of belonging to each cluster', z2prob[0:10])

(1200, 300)
Labels [2 1 4 ... 0 7 1]
Probability of belonging to each cluster [[1.83408129e-12 3.59344375e-18 1.59364392e-47 1.14730249e-16
  1.31529914e-12 1.14709722e-21 3.90930779e-26 1.00000000e+00]
 [7.34009156e-14 1.00000000e+00 2.71903454e-47 2.67610070e-13
  6.76412245e-21 6.05381769e-28 3.15643431e-23 5.09888914e-15]
 [5.51260255e-06 4.09321326e-12 2.04425382e-39 2.50916942e-09
  1.04754858e-11 2.81163724e-23 9.83068542e-29 9.99994485e-01]
 [9.99999182e-01 5.03728048e-16 8.72266968e-31 1.45046666e-10
  5.37055284e-15 1.92989404e-21 1.27966647e-28 8.18071823e-07]
 [1.15967884e-09 1.44706197e-12 7.63511961e-28 2.85497708e-10
  1.24657597e-13 2.18752312e-20 2.59907511e-23 9.99999999e-01]
 [7.41413135e-10 5.03964887e-11 1.39085289e-41 9.40174026e-14
  3.90596641e-06 2.83006000e-22 3.75796513e-27 9.99996093e-01]
 [3.21317902e-21 1.51008629e-16 5.21671572e-56 4.61685932e-27
  6.11593187e-19 5.01625645e-32 1.00000000e+00 2.03918834e-23]
 [5.31265415e-08 1.25797620e-10 7.84956092e-43 

In [51]:
importance_features(k,feat_names_ds_2,estimn2,perc=98)


 
 Important features for cluster  0
data                          0.23840469499072905
sources                       0.06493001953388162
analysis                      0.05829688802878933
quality                       0.05207955860444713
identify                      0.04307307390625364
mining                        0.03805127912196063

 
 Important features for cluster  1
business                      0.2771274482661637
requirements                  0.12393865363536244
analyst                       0.06750767267608071
process                       0.06251591729666903
needs                         0.05405745937405643
analysis                      0.05281632733872505

 
 Important features for cluster  2
database                      0.4143982790138815
performance                   0.08108253628453288
administration                0.0731610178131282
administrator                 0.0674234493567392
support                       0.04792848484930052
management                    0.04495579

## What can we do with the results

Looking at each category, we now could find the original posting URL.

In [55]:
def provide_url_one_class(labels, URL, target_category):
    target_urls=[]
    for i,ilab in enumerate(labels):
        if ilab in target_category:
            print(URL[i])
            target_urls.append(URL[i])
    return target_urls
        

In [54]:
#lets look at developer-oriented data science jobs
URLs = list(df['Links'])
my_url = provide_url_one_class(zlab, URLs, [0])

https://www.indeed.com/company/Vingsfire-pvt-ltd/jobs/Data-Engineer-4b87879df55e1eb8?fccid=d2a5bfde4401643f&vjs=3
https://www.indeed.com/company/HELM/jobs/Database-Administrator-5729f15bbb8b27a4?fccid=feeac034ca41a48b&vjs=3
https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BGKj2dVRoMy2japSZrYRM8IJNi6D13enLCCRY5KIhxigb2ni9doBXKUb107NWv_dmX6aLP-ZYBZm0GymJ5yJjiP2o5QGsYGjzoXP5ncuGZGt062KuV5iiEyL5rssPfWSQvaaREbZ_8aa58SixHHn7yb5TNr6DbWWMypOTBIB8hVtWHffAONDonKywBr_XfQwqk4s9GYcRF0LJrY7PEjjA_Ipa5i7CitYaKFF-EL6CUYpmaW2P6XCmbR5Z9i2b_dc_nIfq_XhaXDv3UQcK0xEu9ix9G7u-sWTcyqqK1uYrg-oWG06OxdWOBq2N-alxoStIZVm5kvQ6WJCjkz_HgSw4zKA0_mwiLTOWz3BnqgecvkYolOFkUHiUBI6dJTwwZRJwveUYwSbZDpDvcGBo67gVZ3FfpEJQyPTa7OfpvZQNsaOF9Jv3mk0cUqoOgxneva3Kwf-CYCeZ7__0xe8sR2gXCHEvA2fbc_tk44f-fYeDKsKc37izYigOOvl_tELiCxuyzbC_jgauLg4Hi5VEdI1Q9mxBwAS4d2ovv0eztUZqIfrp7RCn6perBjPAoGq5qh7vWF65i8MeG4PYvdlz3hqCgWClgWOOcxZwyqRHKGnDJ-MV9mr-u9GTq6Po-TLgbt_a4wCIZrEqsCCgdbmxYmHRxAgrjcyzEXRAaSoWYupJCJe2hB-Yjn9UBQzG6SsPgdNkHBk0gEHwRE_JB2KJ6HC