In [1]:
import pandas as pd
import numpy as np
# do the LDA
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import pprint
import nltk
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
df_BA=pd.read_csv('BA')
df_DS=pd.read_csv('DS')
df_DA=pd.read_csv('DA')
df_DE=pd.read_csv('DE')

In [3]:
my_stop_words = STOPWORDS.union(set(['experience', 'work','requirements','ability','years','analyst',
                                     'required','including','best','new','applicants','jobs','candidate',
                                     'help','regard','qualified','employment','consideration','applications',
                                     'position','able','application','role'
                                    ]))

In [7]:
def word_freq_filter(df):
    l1=""
    for i in df_BA.index:
        sents=df_BA.loc[i]['JD']
        l1+=(sents.replace('\n',''))
    word_tokens = word_tokenize(l1)
    stop_words = set(stopwords.words('english')) 
    wordlist=[word for word in word_tokens if word.isalnum() and word not in stop_words]
    full_text=""
    for word in wordlist:
        full_text+=word+" "
    allWords = nltk.tokenize.word_tokenize(full_text)
    allWordDist = nltk.FreqDist(w.lower() for w in allWords)
    mostCommon= allWordDist.most_common(500)
    common_words = []
    for item in mostCommon:
        common_words.append(item[0])
    leastCommon= allWordDist.most_common()[:-100-1:-1]
    least_words = []
    for item in leastCommon:
        least_words.append(item[0])
    return common_words+least_words

# LDA Model

In [8]:
def get_df_name(df):
    name =[x for x in globals() if globals()[x] is df][0]
    return name

def lda(df,num_topics = 3,passes = 30,num_words=8):
    l1=[]
    for i in df.index:
        sents=df.loc[i]['JD']
        l1.append(sents.replace('\n',''))
    
    texts = [[word for word in story.lower().split()
            if word not in my_stop_words and word not in freq_words and word.isalnum()]
            for story in l1]
    dictionary = corpora.Dictionary(texts) #(word_id,word) pairs
    #dictionary.filter_extremes(no_below=20,no_above=0.2, keep_n= 100000)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    
    lda = LdaModel(corpus,
              id2word=dictionary,
              num_topics=num_topics,
              passes=passes)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words))
    
    return lda

In [13]:
freq_words=word_freq_filter(df_BA)
BA_lda=lda(df_BA).show_topic(0)
BA_lda

[   (   0,
        '0.002*"credit" + 0.002*"concise" + 0.002*"corporate" + '
        '0.002*"feedback" + 0.002*"challenges" + 0.002*"asset" + '
        '0.002*"physical" + 0.002*"mortgage" + 0.002*"party" + '
        '0.002*"integrate"'),
    (   1,
        '0.002*"Ã¢" + 0.002*"preparing" + 0.002*"custom" + 0.002*"junior" + '
        '0.002*"core" + 0.002*"maintains" + 0.002*"available" + '
        '0.002*"exposure" + 0.002*"sprint" + 0.002*"essential"'),
    (   2,
        '0.002*"human" + 0.002*"texas" + 0.002*"pricing" + 0.002*"medicaid" + '
        '0.002*"medical" + 0.002*"day" + 0.002*"assistance" + '
        '0.002*"influence" + 0.002*"navision" + 0.001*"vendor"')]


('credit', 0.002144061)

In [10]:
freq_words=word_freq_filter(df_DA)
DA_lda=lda(df_DA).show_topic(0)
DA_lda

[   (   0,
        '0.003*"quantitative" + 0.003*"analytic" + 0.003*"power" + '
        '0.003*"medical" + 0.002*"visualization" + 0.002*"bi" + 0.002*"ad" + '
        '0.002*"digital" + 0.002*"paid" + 0.002*"sets"'),
    (   1,
        '0.021*"statistical" + 0.009*"interpret" + 0.008*"packages" + '
        '0.008*"sources" + 0.007*"etl" + 0.007*"techniques" + '
        '0.007*"programming" + 0.006*"bi" + 0.005*"datasets" + '
        '0.005*"statistics"'),
    (   2,
        '0.006*"statistical" + 0.004*"clinical" + 0.003*"quantitative" + '
        '0.003*"public" + 0.003*"visualization" + 0.003*"sources" + '
        '0.003*"analytic" + 0.003*"programming" + 0.002*"evaluation" + '
        '0.002*"employee"')]


[('quantitative', 0.0029192201),
 ('analytic', 0.002582666),
 ('power', 0.0025672012),
 ('medical', 0.0025372605),
 ('visualization', 0.0023865846),
 ('bi', 0.0023516295),
 ('ad', 0.0022938624),
 ('digital', 0.0020825127),
 ('paid', 0.0020721192),
 ('sets', 0.002037219)]

In [11]:
freq_words=word_freq_filter(df_DS)
DS_lda=lda(df_DS).show_topic(0)
DS_lda

[   (   0,
        '0.018*"quantum" + 0.011*"machine" + 0.010*"ml" + 0.008*"learning" + '
        '0.006*"algorithms" + 0.005*"diverse" + 0.005*"qiskit" + '
        '0.005*"computers" + 0.005*"looking" + 0.004*"natural"'),
    (   1,
        '0.022*"machine" + 0.021*"learning" + 0.016*"statistical" + '
        '0.009*"quantitative" + 0.007*"techniques" + 0.007*"predictive" + '
        '0.007*"algorithms" + 0.006*"programming" + 0.006*"big" + '
        '0.006*"python"'),
    (   2,
        '0.017*"statistical" + 0.010*"analytic" + 0.007*"scientist" + '
        '0.006*"machine" + 0.006*"techniques" + 0.005*"sources" + '
        '0.005*"visualization" + 0.004*"clinical" + 0.004*"learning" + '
        '0.004*"programming"')]


[('quantum', 0.017677797),
 ('machine', 0.011216796),
 ('ml', 0.009750406),
 ('learning', 0.0077902162),
 ('algorithms', 0.0060242736),
 ('diverse', 0.0051248088),
 ('qiskit', 0.005024482),
 ('computers', 0.0046378705),
 ('looking', 0.004583131),
 ('natural', 0.0044884305)]

In [12]:
freq_words=word_freq_filter(df_DE)
DE_lda=lda(df_DE).show_topic(0)
DE_lda

[   (   0,
        '0.013*"big" + 0.006*"etl" + 0.005*"engineer" + 0.005*"warehouse" + '
        '0.005*"python" + 0.004*"pipelines" + 0.004*"bull" + 0.004*"streaming" '
        '+ 0.004*"aws" + 0.004*"programming"'),
    (   1,
        '0.012*"aws" + 0.009*"python" + 0.008*"pipelines" + 0.008*"etl" + '
        '0.007*"big" + 0.006*"engineer" + 0.006*"programming" + 0.005*"spark" '
        '+ 0.004*"apache" + 0.004*"distributed"'),
    (   2,
        '0.012*"big" + 0.009*"azure" + 0.008*"machine" + 0.007*"learning" + '
        '0.007*"engineer" + 0.007*"infrastructure" + 0.006*"etl" + '
        '0.006*"pipelines" + 0.005*"programming" + 0.005*"spark"')]


[('big', 0.013139406),
 ('etl', 0.006475453),
 ('engineer', 0.004845691),
 ('warehouse', 0.0047575575),
 ('python', 0.004501147),
 ('pipelines', 0.004488647),
 ('bull', 0.004450075),
 ('streaming', 0.0040275455),
 ('aws', 0.003955949),
 ('programming', 0.0037249345)]

In [None]:
import csv

In [None]:
with open('LDA_List', 'w') as f:
      
    # using csv.writer method from CSV package
    write = csv.writer(f)
      
    write.writerow(BA_lda)
    write.writerow(DA_lda)
    write.writerow(DS_lda)
    write.writerow(DE_lda)

# Match Topic to Document

In [None]:
from operator import itemgetter
lda.get_document_topics(corpus_new[0],minimum_probability=0.05,per_word_topics=False)
sorted(lda.get_document_topics(corpus_new[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)