In [1]:
import pandas as pd
import numpy as np
# do the LDA
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import pprint
import nltk
from nltk import sent_tokenize,word_tokenize 

In [2]:
df_BA=pd.read_csv('BA')
df_DS=pd.read_csv('DS')
df_DA=pd.read_csv('DA')
df_DE=pd.read_csv('DE')

In [3]:
my_stop_words = STOPWORDS.union(set(['experience', 'work','requirements','ability','years','analyst',
                                     'required','including','best','new','applicants','jobs','candidate',
                                     'help','regard','qualified','employment','consideration','applications',
                                     'position','able','application','role'
                                    ]))

# LDA Model

In [4]:
def get_df_name(df):
    name =[x for x in globals() if globals()[x] is df][0]
    return name

def lda(df,num_topics = 3,passes = 30,num_words=8):
    l1=[]
    for i in df.index:
        sents=df.loc[i]['JD']
        l1.append(sents.replace('\n',''))
    
    texts = [[word for word in story.lower().split()
            if word not in my_stop_words and word.isalnum()]
            for story in l1]
    dictionary = corpora.Dictionary(texts) #(word_id,word) pairs
    dictionary.filter_extremes(no_below=20,no_above=0.2, keep_n= 100000)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    
    lda = LdaModel(corpus,
              id2word=dictionary,
              num_topics=num_topics,
              passes=passes)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words))
    
    return lda

In [5]:
lda(df_BA).show_topic(0)

[   (   0,
        '0.007*"client" + 0.006*"able" + 0.006*"agile" + 0.006*"financial" + '
        '0.005*"verbal" + 0.005*"training" + 0.005*"solution" + '
        '0.005*"industry" + 0.005*"users" + 0.005*"office"'),
    (   1,
        '0.016*"sap" + 0.009*"high" + 0.008*"monitoring" + 0.008*"areas" + '
        '0.007*"improvement" + 0.007*"review" + 0.007*"risk" + 0.007*"change" '
        '+ 0.007*"time" + 0.007*"problems"'),
    (   2,
        '0.008*"reports" + 0.008*"microsoft" + 0.007*"reporting" + '
        '0.007*"duties" + 0.006*"assigned" + 0.006*"ms" + 0.006*"issues" + '
        '0.006*"salesforce" + 0.005*"operational" + 0.005*"services"')]


[('client', 0.007322344),
 ('able', 0.006169489),
 ('agile', 0.0061636427),
 ('financial', 0.0055259336),
 ('verbal', 0.0051059197),
 ('training', 0.005047989),
 ('solution', 0.0050025703),
 ('industry', 0.004857691),
 ('users', 0.0048178737),
 ('office', 0.0047422876)]

In [6]:
lda(df_DA).show_topic(0)

[   (   0,
        '0.007*"research" + 0.006*"financial" + 0.006*"health" + '
        '0.005*"office" + 0.005*"duties" + 0.005*"software" + 0.005*"perform" '
        '+ 0.005*"job" + 0.005*"healthcare" + 0.005*"program"'),
    (   1,
        '0.039*"interpret" + 0.035*"packages" + 0.023*"patterns" + '
        '0.022*"efficiency" + 0.021*"trends" + 0.021*"collection" + '
        '0.021*"datasets" + 0.021*"programming" + 0.021*"implement" + '
        '0.021*"code"'),
    (   2,
        '0.007*"insights" + 0.007*"product" + 0.006*"solutions" + '
        '0.006*"marketing" + 0.006*"visualization" + 0.006*"role" + '
        '0.005*"customer" + 0.005*"drive" + 0.005*"understand" + '
        '0.005*"advanced"')]


[('research', 0.006701625),
 ('financial', 0.0060773734),
 ('health', 0.0057612094),
 ('office', 0.0050414796),
 ('duties', 0.005034944),
 ('software', 0.004999746),
 ('perform', 0.0049505252),
 ('job', 0.004786722),
 ('healthcare', 0.0046979417),
 ('program', 0.0045955554)]

In [7]:
lda(df_DS).show_topic(0)

[   (   0,
        '0.020*"ml" + 0.014*"analytic" + 0.013*"professional" + '
        '0.011*"clinical" + 0.009*"healthcare" + 0.008*"hours" + '
        '0.008*"health" + 0.008*"services" + 0.008*"deliver" + '
        '0.007*"writing"'),
    (   1,
        '0.047*"quantum" + 0.016*"application" + 0.013*"qiskit" + '
        '0.012*"computers" + 0.012*"employee" + 0.011*"flexible" + '
        '0.011*"linear" + 0.011*"diverse" + 0.011*"career" + 0.011*"looking"'),
    (   2,
        '0.005*"applied" + 0.005*"provide" + 0.005*"decision" + '
        '0.005*"internal" + 0.005*"performance" + 0.005*"sets" + '
        '0.005*"identify" + 0.005*"based" + 0.004*"cloud" + 0.004*"analyze"')]


[('ml', 0.019926121),
 ('analytic', 0.013514839),
 ('professional', 0.012630859),
 ('clinical', 0.011359002),
 ('healthcare', 0.009352398),
 ('hours', 0.008444445),
 ('health', 0.008096305),
 ('services', 0.007961692),
 ('deliver', 0.00771099),
 ('writing', 0.007326064)]

In [8]:
lda(df_DE).show_topic(0)

[   (   0,
        '0.018*"azure" + 0.014*"spark" + 0.012*"pipeline" + '
        '0.011*"infrastructure" + 0.010*"processes" + 0.010*"apache" + '
        '0.009*"variety" + 0.009*"nosql" + 0.008*"optimizing" + '
        '0.008*"internal"'),
    (   1,
        '0.007*"project" + 0.006*"models" + 0.006*"learning" + 0.006*"agile" + '
        '0.006*"streaming" + 0.006*"code" + 0.006*"customer" + 0.006*"able" + '
        '0.006*"developing" + 0.005*"projects"'),
    (   2,
        '0.008*"use" + 0.007*"processes" + 0.007*"integration" + '
        '0.006*"protected" + 0.006*"provide" + 0.006*"environment" + '
        '0.006*"opportunity" + 0.006*"national" + 0.006*"production" + '
        '0.006*"security"')]


[('azure', 0.018425904),
 ('spark', 0.014158488),
 ('pipeline', 0.011764156),
 ('infrastructure', 0.010547885),
 ('processes', 0.010135129),
 ('apache', 0.009501849),
 ('variety', 0.009330016),
 ('nosql', 0.008586794),
 ('optimizing', 0.008498363),
 ('internal', 0.0084456755)]

# LDA Ver2

In [9]:
df_BA=pd.read_csv('BA')

In [10]:
l1=[]
for i in df_BA.index:
    sents=df_BA.loc[i]['JD']
    l1.append(sents.replace('\n',''))
    
texts = [[word for word in story.lower().split()
            if word not in STOPWORDS and word.isalnum()]
            for story in l1]
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=20,no_above=0.2, keep_n= 100000)
corpus = [dictionary.doc2bow(text) for text in texts]

In [11]:
lda = LdaModel(corpus,
              id2word=dictionary,
              num_topics=3,
              passes=10)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(8))

[   (   0,
        '0.012*"required" + 0.006*"health" + 0.006*"sap" + 0.006*"services" + '
        '0.005*"reporting" + 0.005*"developing" + 0.005*"duties" + '
        '0.005*"employment" + 0.005*"based" + 0.005*"able"'),
    (   1,
        '0.011*"client" + 0.008*"financial" + 0.007*"time" + '
        '0.007*"improvement" + 0.007*"high" + 0.006*"change" + 0.006*"level" + '
        '0.006*"environment" + 0.006*"problems" + 0.005*"ms"'),
    (   2,
        '0.006*"agile" + 0.006*"drive" + 0.005*"verbal" + 0.005*"excellent" + '
        '0.005*"users" + 0.005*"microsoft" + 0.005*"sales" + 0.005*"training" '
        '+ 0.005*"reporting" + 0.005*"candidate"')]


In [12]:
import spacy
nlp = spacy.load('en_core_web_sm') 
# excluded tags
excluded_tags = {"VERB", "ADJ", "ADV", "ADP"}

l1=[]
for i in df_BA.index:
    sents=df_BA.loc[i]['JD']
    l1.append(sents.split("\n"))

sentences = l1[0]
new_sentences = []
for sentence in sentences:
    new_sentence = []
    for token in nlp(sentence):
        if token.pos_ not in excluded_tags:
            new_sentence.append(token.text)
    new_sentences.append(" ".join(new_sentence))

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

# Match Topic to Document

In [None]:
from operator import itemgetter
lda.get_document_topics(corpus_new[0],minimum_probability=0.05,per_word_topics=False)
sorted(lda.get_document_topics(corpus_new[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)