In [1]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models, utils
from gensim.models import LdaModel
from gensim import similarities
from gensim.models.phrases import Phrases, Phraser

import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from cleantext import clean

from wordcloud import WordCloud
import seaborn as sns

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
#Import data files
jd_ds = pd.read_csv("../data/jd_ds.csv",encoding='utf-8')
jd_ds.head()

Unnamed: 0,description
0,ABOUT HOPPER At Hopper we re on a mission to m...
1,At Noom we use scientifically proven methods t...
2,Decode M Data Science Manager Job Description ...
3,Sapphire Digital seeks a dynamic and driven mi...
4,Director Data Science 200537 Description Edelm...


### Training LDA model using words

In [3]:
def preprocess_text(text):
    # Tokenize text
    tokens = text.lower().split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

In [4]:
# Preprocess job descriptions
preprocessed_job_descriptions = jd_ds['description'].apply(preprocess_text)

In [5]:
# Create dictionary representation of job descriptions
dictionary = corpora.Dictionary(preprocessed_job_descriptions)

In [6]:
# Create document-term matrix
doc_term_matrix = [dictionary.doc2bow(desc) for desc in preprocessed_job_descriptions]

In [7]:
# Train LDA model
lda_model = models.LdaModel(doc_term_matrix, num_topics=10, id2word=dictionary, passes=10)

In [8]:
# Extract and print top topics
top_topics = lda_model.show_topics(num_topics=10, num_words=10)

In [9]:
for topic in top_topics:
    print(topic)

(0, '0.009*"work" + 0.008*"employment" + 0.008*"information" + 0.008*"required" + 0.008*"research" + 0.008*"job" + 0.007*"employee" + 0.006*"position" + 0.006*"experience" + 0.006*"status"')
(1, '0.027*"data" + 0.015*"experience" + 0.013*"team" + 0.013*"learning" + 0.010*"machine" + 0.009*"work" + 0.007*"science" + 0.006*"model" + 0.006*"company" + 0.005*"product"')
(2, '0.027*"data" + 0.013*"experience" + 0.010*"work" + 0.010*"ability" + 0.009*"management" + 0.009*"skill" + 0.008*"business" + 0.007*"report" + 0.007*"project" + 0.007*"knowledge"')
(3, '0.022*"business" + 0.019*"data" + 0.015*"experience" + 0.010*"solution" + 0.008*"project" + 0.006*"work" + 0.006*"year" + 0.005*"technology" + 0.005*"service" + 0.004*"team"')
(4, '0.022*"data" + 0.013*"experience" + 0.011*"technology" + 0.008*"cloud" + 0.007*"client" + 0.007*"security" + 0.007*"platform" + 0.007*"team" + 0.006*"service" + 0.006*"working"')
(5, '0.014*"experience" + 0.011*"development" + 0.008*"team" + 0.007*"work" + 0.0

In [10]:
topic_labels = []
num_topics = 10
for topic_id in range(num_topics):
    top_words = lda_model.show_topic(topic_id, topn=10)  # Get the top 10 words for each topic
    words = [word for word, _ in top_words]
    topic_label = ', '.join(words)  # Join the words into a single string
    topic_labels.append(topic_label)

In [11]:
# Print the topic labels
for i, label in enumerate(topic_labels):
    print(f"Topic {i}: {label}")

Topic 0: work, employment, information, required, research, job, employee, position, experience, status
Topic 1: data, experience, team, learning, machine, work, science, model, company, product
Topic 2: data, experience, work, ability, management, skill, business, report, project, knowledge
Topic 3: business, data, experience, solution, project, work, year, technology, service, team
Topic 4: data, experience, technology, cloud, client, security, platform, team, service, working
Topic 5: experience, development, team, work, skill, research, cell, clinical, product, scientist
Topic 6: student, school, johnson, janssen, data, health, course, research, penn, harris
Topic 7: data, business, team, experience, analytics, product, analysis, insight, science, model
Topic 8: data, experience, year, business, skill, sql, solution, development, tool, team
Topic 9: ibm, quantum, data, world, business, industry, client, experience, work, development


### Training LDA model using bigrams

In [12]:
def preprocess_text_bigram(text):
    # Tokenize text into sentences
    sentences = [sent for sent in nltk.sent_tokenize(text)]
    
    # Tokenize sentences into words
    tokenized_sentences = [utils.simple_preprocess(sent) for sent in sentences]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    sentences = [[token for token in sent if token not in stop_words] for sent in tokenized_sentences]
    
    # Create bigrams
    bigram_model = Phrases(sentences, min_count=5, threshold=100)
    bigram_phraser = Phraser(bigram_model)
    bigram_sentences = [bigram_phraser[sent] for sent in sentences]
    
    # Convert bigrams into single words using '_'
    for i in range(len(bigram_sentences)):
        for j in range(len(bigram_sentences[i])):
            if '_' in bigram_sentences[i][j]:
                bigram_sentences[i][j] = bigram_sentences[i][j].replace('_', '')
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    preprocessed_sentences = [[lemmatizer.lemmatize(token) for token in sent] for sent in bigram_sentences]
    
    # Flatten sentences into a single list of words
    words = [word for sent in preprocessed_sentences for word in sent]
    
    return words

In [13]:
# Preprocess job descriptions
preprocessed_job_descriptions_bigram = jd_ds['description'].apply(preprocess_text_bigram)

In [14]:
# Identify bigrams and add them to the dictionary
bigram_model = Phrases(preprocessed_job_descriptions_bigram, min_count=5, threshold=100)
bigram_phraser = Phraser(bigram_model)

for i in range(len(preprocessed_job_descriptions_bigram)):
    for token in bigram_phraser[preprocessed_job_descriptions_bigram[i]]:
        if '_' in token:
            preprocessed_job_descriptions_bigram[i].append(token)

In [15]:
# Create dictionary representation of job descriptions
dictionary_bigram = corpora.Dictionary(preprocessed_job_descriptions_bigram.values)

In [16]:
# Create document-term matrix
doc_term_matrix_bigram = [dictionary.doc2bow(desc) for desc in preprocessed_job_descriptions_bigram]

In [17]:
# Train LDA model
lda_model_bigram = models.LdaModel(doc_term_matrix_bigram, num_topics=10, id2word=dictionary, passes=10)

In [18]:
# Extract and print top topics
top_topics_bigram = lda_model.show_topics(num_topics=10, num_words=10)

for topic in top_topics_bigram:
    print(topic)# Extract and print top topics

(0, '0.009*"work" + 0.008*"employment" + 0.008*"information" + 0.008*"required" + 0.008*"research" + 0.008*"job" + 0.007*"employee" + 0.006*"position" + 0.006*"experience" + 0.006*"status"')
(1, '0.027*"data" + 0.015*"experience" + 0.013*"team" + 0.013*"learning" + 0.010*"machine" + 0.009*"work" + 0.007*"science" + 0.006*"model" + 0.006*"company" + 0.005*"product"')
(2, '0.027*"data" + 0.013*"experience" + 0.010*"work" + 0.010*"ability" + 0.009*"management" + 0.009*"skill" + 0.008*"business" + 0.007*"report" + 0.007*"project" + 0.007*"knowledge"')
(3, '0.022*"business" + 0.019*"data" + 0.015*"experience" + 0.010*"solution" + 0.008*"project" + 0.006*"work" + 0.006*"year" + 0.005*"technology" + 0.005*"service" + 0.004*"team"')
(4, '0.022*"data" + 0.013*"experience" + 0.011*"technology" + 0.008*"cloud" + 0.007*"client" + 0.007*"security" + 0.007*"platform" + 0.007*"team" + 0.006*"service" + 0.006*"working"')
(5, '0.014*"experience" + 0.011*"development" + 0.008*"team" + 0.007*"work" + 0.0

In [19]:
# Save the trained LDA model
lda_model.save("../model/lda_model_ds")
lda_model_bigram.save("../model/lda_model_ds_bigram")