In [20]:
import pandas as pd
from bs4 import BeautifulSoup

import re

import nltk
import pyLDAvis
import pyLDAvis.gensim
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import LdaMulticore

import spacy

nltk.download('stopwords')


In [17]:
df_1 = pd.read_csv('jobs_df')
df_1.head()

Unnamed: 0.1,Unnamed: 0,title,description,labels,token_description,token_title,tfidf_title,tfidf_description
0,0,Strategic Marketing & Brand Design Intern,<p>We're looking for a badass intern that can ...,Food & Agriculture,looking badass intern join team hit ground run...,strategic marketing brand design intern,[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
1,1,Community Manager,<p><strong>Who We Are</strong></p>\n<p>We are ...,"Energy, Buildings & Cities",soontobe launched notforprofit business accele...,community manager,[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
2,2,Senior EE Hardware Architect/Design - Infotain...,"<p><em>At SERES, we&rsquo;re forging a new kin...",Transportation,seres wersquore forging new kind mobility comp...,senior ee hardware architectdesign infotainment,[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
3,3,"Retail Specialist, Berkeley",<b>Why We’re Rad (about us): </b><br /><span s...,Transportation,rad power bike mission get people onto bike el...,retail specialist berkeley,[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]
4,4,Customer Care Associate,<div>\r\n<div>About Imperfect</div>\r\n<div>&n...,Food & Agriculture,imperfect nbsp imperfect food founded mission ...,customer care associate,[0. 0. 0. ... 0. 0. 0.],[0. 0. 0. ... 0. 0. 0.]


After looking through some of the descriptions, most of the job postings have some kind of list that has key information. The first step is to collect that information. We can run an LDA on all lists such as "responsibilities", "Qualifications" or "position description". The first pass is just to create a basic LDA and see what we get

In [21]:
is_in_tag = [
    "responsibilities", 
    "qualifications", 
    "about you", 
    "a plus", 
    "what youll be doing", 
    "what youll do",
    "you will",
    "skills",
    'the ideal candidate',
    'attributes',
    'duties',
    'prerequisites',
    'position description',
    'requirements'
]

def is_in_contents(tag, tag_list):
    for key in tag_list:
        if key in re.sub(r'[^\w\s]', '', tag.get_text().lower()):
            return True
    return False

def get_relevant_list(job_postings, tag_list):
    """ job_postings is a list of job html descriptors. This function 
    returns a dictionary for each listing that indicates what list if any
    of the relevant lists were in the posting"""
    tracker = []
    for job in job_postings:
        soup = BeautifulSoup(job, 'html.parser')
        track_ind={}
        for tag in soup.find_all(True):
            if tag.name in ['h3', 'h2', 'h1', 'strong', 'span', 'p'] and is_in_contents(tag, tag_list) and len(re.sub(r'[^\w\s]', '', tag.get_text().lower()).split(' '))<25:
                entry = re.sub(r'[^\w\s]', '', tag.get_text().lower())
                track_ind[entry]=0
                for next_tags in tag.find_all_next()[:4]:
                    if next_tags.name in ['ol', 'ul'] and track_ind[entry]==0:
                        track_ind[entry] = next_tags.get_text()
        tracker.append(track_ind)  
    return tracker



df_1['relevant_text']=get_relevant_list(df_1['description'].tolist(), is_in_tag)
df_1.to_csv('processed_txt.csv')


In [22]:
df_1 = pd.read_csv('processed_txt.csv')


In [29]:
j=0
for i, k in enumerate(df_1['relevant_text'].tolist()):
    if k =='{}':
        j=j+1
        # print(i)
print(
f'''
There are {j} jobs listings that \
do not have a list with titles like "Qualifications/ responsibilities", \
out of the {i} many jobs in the dataset
'''
)


There are 1823 jobs listings that do not have a list with titles like "Qualifications/ responsibilities", out of the 17758 many jobs in the dataset



In [108]:
data = [
    BeautifulSoup(i, 'html.parser').get_text() 
    for i in df_1.description.tolist()
]


In [109]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# get stopwords from nltk library
stop_words = nltk.corpus.stopwords.words('english')

def process_words(texts, stop_words=stop_words, allowed_tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    
    """Convert a document into a list of lowercase tokens, build bigrams-trigrams, implement lemmatization"""
    
    # I suspect these bigrams and trigrams are not 
    # making any real impact, I varied the threshold 
    # and did not really see much difference
    bigram = gensim.models.Phrases(data, min_count=20, threshold=90)
    trigram = gensim.models.Phrases(bigram[data], threshold=90)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    # remove stopwords, short tokens and letter accents 
    texts = [
        [
            word for word in 
            gensim.utils.simple_preprocess(
                str(doc), deacc=True, min_len=2
            ) 
            if word not in stop_words
        ] for doc in texts
    ]
    
    # bi-gram and tri-gram implementation
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    
    texts_out = []
    
    # implement lemmatization and filter out unwanted part of speech tags
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_tags])
    
    # remove stopwords and short tokens again after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc), deacc=True, min_len=2) if word not in stop_words] for doc in texts_out]    
    
    return texts_out

In [110]:
data_ready = process_words(data)


In [111]:
id2word = corpora.Dictionary(data_ready)
corpus = [id2word.doc2bow(text) for text in data_ready]
print('Total Vocabulary Size:', len(id2word))

Total Vocabulary Size: 36832


In [119]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.043523088506304965),
 (1, 0.05927956126072254),
 (2, 0.09597422495361731),
 (3, 0.028975340264615717),
 (4, 0.18593095864782802),
 (5, 0.14984483348928154),
 (6, 0.18097683787120056),
 (7, 0.022719094060973576),
 (8, 0.12781636054575685),
 (9, 0.04490916819302359),
 (10, 0.14962601677039822),
 (11, 0.09641722159005242),
 (12, 0.0638266765078442),
 (13, 0.032393413635367754),
 (14, 0.1326560969711348),
 (15, 0.08606590138074906),
 (16, 0.0759795569537991),
 (17, 0.04542734841187554),
 (18, 0.06392576596561733),
 (19, 0.024743229368099296),
 (20, 0.09035837674667235),
 (21, 0.09230562768603312),
 (22, 0.03658586199058109),
 (23, 0.217739181075178),
 (24, 0.07586582149649097),
 (25, 0.1447473120782141),
 (26, 0.022242206040226684),
 (27, 0.017791578884071964),
 (28, 0.14039086223941644),
 (29, 0.0682411603905896),
 (30, 0.08928023329260128),
 (31, 0.06743524323590674),
 (32, 0.07771185896675878),
 (33, 0.03599073108660291),
 (34, 0.012502572555624881),
 (35, 0.2529907213645158),
 (

Some of the most common words:

In [113]:
dict_corpus = {}

for i in range(len(corpus)):
    for idx, freq in corpus[i]:
        if id2word[idx] in dict_corpus:
            dict_corpus[id2word[idx]] += freq
        else:
            dict_corpus[id2word[idx]] = freq
            
dict_df = pd.DataFrame.from_dict(dict_corpus, orient='index', columns=['freq'])


In [114]:
dict_df.sort_values('freq', ascending=False).head(10)

Unnamed: 0,freq
work,85132
team,77945
experience,71103
customer,33962
product,32986
opportunity,30765
company,29683
support,27299
include,27230
skill,26810


In [115]:
id2word = corpora.Dictionary(data_ready)
print('Total Vocabulary Size:', len(id2word))

# Filter out words that occur less than 10 documents, or more than
# 70% of the documents.
id2word.filter_extremes( no_below=10, no_above=0.7)
print('Total Vocabulary Size:', len(id2word))

Total Vocabulary Size: 36832
Total Vocabulary Size: 8866


In [116]:
corpus = [id2word.doc2bow(text) for text in data_ready]
dict_corpus = {}

for i in range(len(corpus)):
    for idx, freq in corpus[i]:
        if id2word[idx] in dict_corpus:
            dict_corpus[id2word[idx]] += freq
        else:
            dict_corpus[id2word[idx]] = freq
            
dict_df = pd.DataFrame.from_dict(dict_corpus, orient='index', columns=['freq'])
dict_df.sort_values('freq', ascending=False).head(10)

Unnamed: 0,freq
customer,33962
product,32986
company,29683
support,27299
include,27230
business,26720
drive,26695
process,26115
ability,26060
build,25260


In [120]:
# import gensim
num_topics = 15
lda_model_tfidf = LdaMulticore(
    corpus, 
    num_topics=num_topics, 
    id2word=id2word, 
    passes=2, 
    workers=4
)


lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf, 
    num_topics=num_topics, 
    id2word=id2word, 
    passes=2, 
    workers=4
)


for idx, topic in lda_model_tfidf.print_topics(-1):
    print(f'Topic: {idx} \n Word: {topic}\n\n')

Topic: 0 
 Word: 0.003*"food" + 0.003*"afresh" + 0.003*"perfect" + 0.003*"lime" + 0.003*"solar" + 0.003*"sale" + 0.002*"customer" + 0.002*"carboncure" + 0.002*"climate" + 0.002*"day"


Topic: 1 
 Word: 0.005*"convoy" + 0.005*"meat" + 0.003*"campaign" + 0.003*"empty" + 0.003*"marketing" + 0.003*"freight" + 0.003*"sale" + 0.003*"climate" + 0.003*"medium" + 0.002*"policy"


Topic: 2 
 Word: 0.020*"spin" + 0.004*"fulfill" + 0.003*"apeel" + 0.003*"transportation" + 0.003*"creativity" + 0.003*"chat" + 0.003*"city" + 0.003*"food" + 0.003*"bowery" + 0.003*"qualified"


Topic: 3 
 Word: 0.006*"und" + 0.006*"amp" + 0.004*"recycling" + 0.004*"robotic" + 0.003*"die" + 0.003*"pay" + 0.002*"food" + 0.002*"day" + 0.002*"employee" + 0.002*"eine"


Topic: 4 
 Word: 0.021*"imperfect" + 0.013*"food" + 0.007*"warehouse" + 0.006*"grocery" + 0.006*"covid" + 0.005*"sanitize" + 0.005*"confirm" + 0.005*"delivery" + 0.005*"verify" + 0.005*"employee"


Topic: 5 
 Word: 0.011*"food" + 0.010*"impossible" + 0.009*"