In [1]:
import pandas as pd
import numpy as np
# do the LDA
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
import pprint
import nltk
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
df_BA=pd.read_csv('BA')
df_DS=pd.read_csv('DS')
df_DA=pd.read_csv('DA')
df_DE=pd.read_csv('DE')

In [3]:
my_stop_words = STOPWORDS.union(set(['experience', 'work','requirements','ability','years','analyst',
                                     'required','including','best','new','applicants','jobs','candidate',
                                     'help','regard','qualified','employment','consideration','applications',
                                     'position','able','application','role','business','analysis','analyze','data'
                                    ]))

In [4]:
def word_freq_filter(df):
    l1=""
    for i in df_BA.index:
        sents=df_BA.loc[i]['JD']
        l1+=(sents.replace('\n',''))
    word_tokens = word_tokenize(l1)
    stop_words = set(stopwords.words('english')) 
    wordlist=[word for word in word_tokens if word.isalnum() and word not in stop_words]
    full_text=""
    for word in wordlist:
        full_text+=word+" "
    allWords = nltk.tokenize.word_tokenize(full_text)
    allWordDist = nltk.FreqDist(w.lower() for w in allWords)
    mostCommon= allWordDist.most_common(500)
    common_words = []
    for item in mostCommon:
        common_words.append(item[0])
    leastCommon= allWordDist.most_common()[:-100-1:-1]
    least_words = []
    for item in leastCommon:
        least_words.append(item[0])
    return common_words+least_words

# LDA Model

In [5]:
def get_df_name(df):
    name =[x for x in globals() if globals()[x] is df][0]
    return name

def lda(df,num_topics = 3,passes = 30,num_words=8):
    l1=[]
    for i in df.index:
        sents=df.loc[i]['JD']
        l1.append(sents.replace('\n',''))
    
    texts = [[word for word in story.lower().split()
            if word not in my_stop_words and word not in freq_words and word.isalnum()]
            for story in l1]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    
    lda = LdaModel(corpus,
              id2word=dictionary,
              num_topics=num_topics,
              passes=passes)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(lda.print_topics(num_words))
    
    res=[]
    
    for i in range(3):
        res.append(lda.show_topic(i))
    res1=[]
    for index in res:
        for i,j in index:
            res1.append(i)
    
    return res1

In [6]:
freq_words=word_freq_filter(df_BA)
BA_lda=lda(df_BA)

[   (   0,
        '0.003*"â" + 0.002*"day" + 0.002*"common" + 0.002*"party" + '
        '0.002*"concise" + 0.002*"corporate" + 0.002*"3rd" + 0.002*"input" + '
        '0.002*"challenges" + 0.002*"skillsstrong"'),
    (   1,
        '0.002*"texas" + 0.002*"quantitative" + 0.002*"human" + '
        '0.002*"judgment" + 0.002*"employee" + 0.002*"quickly" + '
        '0.002*"learning" + 0.002*"participates" + 0.002*"digital" + '
        '0.002*"essential"'),
    (   2,
        '0.002*"handle" + 0.002*"line" + 0.002*"available" + 0.002*"applying" '
        '+ 0.002*"release" + 0.002*"credit" + 0.002*"provider" + 0.002*"basis" '
        '+ 0.002*"asset" + 0.002*"diverse"')]


In [7]:
freq_words=word_freq_filter(df_DA)
DA_lda=lda(df_DA)
DA_lda

[   (   0,
        '0.029*"statistical" + 0.017*"interpret" + 0.017*"packages" + '
        '0.011*"etl" + 0.010*"sources" + 0.009*"programming" + '
        '0.009*"optimize" + 0.009*"collection" + 0.009*"amounts" + '
        '0.009*"primary"'),
    (   1,
        '0.007*"statistical" + 0.005*"clinical" + 0.003*"public" + '
        '0.003*"programming" + 0.002*"collection" + 0.002*"evaluation" + '
        '0.002*"integrity" + 0.002*"school" + 0.002*"sources" + 0.002*"sas"'),
    (   2,
        '0.006*"statistical" + 0.005*"visualization" + 0.005*"quantitative" + '
        '0.003*"sources" + 0.003*"analytic" + 0.003*"actionable" + 0.003*"bi" '
        '+ 0.003*"sets" + 0.003*"power" + 0.003*"programming"')]


['statistical',
 'interpret',
 'packages',
 'etl',
 'sources',
 'programming',
 'optimize',
 'collection',
 'amounts',
 'primary',
 'statistical',
 'clinical',
 'public',
 'programming',
 'collection',
 'evaluation',
 'integrity',
 'school',
 'sources',
 'sas',
 'statistical',
 'visualization',
 'quantitative',
 'sources',
 'analytic',
 'actionable',
 'bi',
 'sets',
 'power',
 'programming']

In [8]:
freq_words=word_freq_filter(df_DS)
DS_lda=lda(df_DS)
DS_lda

[   (   0,
        '0.016*"statistical" + 0.009*"machine" + 0.008*"learning" + '
        '0.008*"analytic" + 0.008*"scientist" + 0.006*"clinical" + '
        '0.006*"predictive" + 0.005*"techniques" + 0.005*"algorithms" + '
        '0.005*"visualization"'),
    (   1,
        '0.023*"machine" + 0.021*"learning" + 0.016*"statistical" + '
        '0.009*"quantitative" + 0.007*"techniques" + 0.006*"programming" + '
        '0.006*"predictive" + 0.006*"python" + 0.006*"big" + '
        '0.005*"algorithms"'),
    (   2,
        '0.020*"quantum" + 0.009*"ml" + 0.007*"machine" + 0.006*"algorithms" + '
        '0.006*"qiskit" + 0.005*"computers" + 0.005*"learning" + '
        '0.005*"looking" + 0.005*"linear" + 0.005*"diverse"')]


['statistical',
 'machine',
 'learning',
 'analytic',
 'scientist',
 'clinical',
 'predictive',
 'techniques',
 'algorithms',
 'visualization',
 'machine',
 'learning',
 'statistical',
 'quantitative',
 'techniques',
 'programming',
 'predictive',
 'python',
 'big',
 'algorithms',
 'quantum',
 'ml',
 'machine',
 'algorithms',
 'qiskit',
 'computers',
 'learning',
 'looking',
 'linear',
 'diverse']

In [9]:
freq_words=word_freq_filter(df_DE)
DE_lda=lda(df_DE)
DE_lda

[   (   0,
        '0.011*"aws" + 0.009*"big" + 0.007*"engineer" + 0.007*"infrastructure" '
        '+ 0.006*"pipelines" + 0.006*"etl" + 0.006*"relational" + '
        '0.005*"python" + 0.005*"programming" + 0.005*"pipeline"'),
    (   1,
        '0.009*"etl" + 0.008*"python" + 0.008*"pipelines" + 0.007*"big" + '
        '0.007*"azure" + 0.006*"engineer" + 0.006*"aws" + 0.006*"warehouse" + '
        '0.005*"bi" + 0.005*"learning"'),
    (   2,
        '0.016*"big" + 0.009*"hadoop" + 0.007*"azure" + 0.006*"spark" + '
        '0.006*"programming" + 0.005*"etl" + 0.005*"engineer" + '
        '0.005*"pipelines" + 0.004*"python" + 0.004*"streaming"')]


['aws',
 'big',
 'engineer',
 'infrastructure',
 'pipelines',
 'etl',
 'relational',
 'python',
 'programming',
 'pipeline',
 'etl',
 'python',
 'pipelines',
 'big',
 'azure',
 'engineer',
 'aws',
 'warehouse',
 'bi',
 'learning',
 'big',
 'hadoop',
 'azure',
 'spark',
 'programming',
 'etl',
 'engineer',
 'pipelines',
 'python',
 'streaming']

In [10]:
import csv

In [11]:
with open('LDA_List', 'w') as f:
      
    write = csv.writer(f)
      
    write.writerow(BA_lda)
    write.writerow(DA_lda)
    write.writerow(DS_lda)
    write.writerow(DE_lda)

# Match Topic to Document

In [12]:
from operator import itemgetter
lda.get_document_topics(corpus_new[0],minimum_probability=0.05,per_word_topics=False)
sorted(lda.get_document_topics(corpus_new[0],minimum_probability=0,per_word_topics=False),key=itemgetter(1),reverse=True)

AttributeError: 'function' object has no attribute 'get_document_topics'