# Introduction

Sample code for topic modeling with Latent Dirichlet Allocation (LDA). The data are job reviews scraped from Glassdoor.

In [23]:
import nltk
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from collections import Counter
import pandas_profiling

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Bidirectional, GRU
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tensorflow.keras import regularizers

import pickle
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer

# spaCy English pipeline; clean_text() calls it to obtain lemmas and part-of-speech tags.
nlp = spacy.load('en_core_web_sm')
# NLTK English stop words (the same assignment is repeated in the next cell).
stops = stopwords.words('english')
# spaCy's English stop words; not referenced by any other visible cell.
stops2 = spacy.lang.en.stop_words.STOP_WORDS

# WordNet lemmatizer instance; not referenced by any other visible cell.
word_lem = WordNetLemmatizer()

In [24]:
# Repeats the assignment from the import cell; clean_text() below uses this
# name as the default value of its `sw` parameter.
stops = stopwords.words('english')
# Lookup table mapping English contractions to their expanded form.
# clean_text() consults it token by token.
contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}


def clean_text(text, clean_only=False,
               parts_of_speech=['ADJ', 'NOUN', 'ADV', 'VERB'],
               remove_sw=True, sw=stops):
    """
    Cleans text and filters according to part of speech.

    The text is lower-cased, contractions are expanded through
    ``contraction_mapping``, punctuation and digits are removed and runs of
    spaces are collapsed. Unless ``clean_only`` is set, the result is passed
    through the spaCy pipeline ``nlp`` and only the lemmas whose part of
    speech is listed in ``parts_of_speech`` are kept.

    Parameters
    ----------
    text : str
        Raw text to clean.

    clean_only : bool
        default at false; when true the cleaned string is returned without
        tagging, lemmatization or stop-word removal (it is not stripped)

    parts_of_speech : list of strings
        refer to parts of speech in SpaCy (compared against ``token.pos_``).
        The default list is only read, never modified.

    remove_sw : bool
        when true, lemmas found in ``sw`` are dropped

    sw : list of strings
        stop words to drop; add your own if necessary

    Returns
    -------
    out3 : str
        string with parts of speech filtered: the kept lemmas joined by
        single spaces (or the cleaned text when ``clean_only`` is true)
    """
    # cleaning: lower-case, then turn non-breaking spaces and line breaks into spaces
    text = text.lower()
    for blank in ('\xa0', '\n', '\r'):
        text = text.replace(blank, ' ')

    # Contractions must be expanded while the apostrophes are still present:
    # the punctuation regex below replaces every apostrophe with a space, after
    # which no key of contraction_mapping can match any more.
    # Typographic apostrophes are normalised first so that they match the
    # ASCII keys of the mapping.
    text = text.replace('\u2019', "'")
    text = re.sub(r"[\w']+",
                  lambda match: contraction_mapping.get(match.group(0), match.group(0)),
                  text)

    # drop punctuation, then every digit together with any run of 'p' right before it
    text = re.sub(r'[^\w\s]+', ' ', text)
    text = re.sub(r"p*\d", "", text)  # raw string: "\d" is an invalid escape in a normal literal
    text = re.sub(r" +", ' ', text)

    if clean_only:
        return text

    # .lemma_ and .pos_ give the lemmatized word and its part of speech
    doc = nlp(text)
    out3 = ' '.join(token.lemma_ for token in doc if token.pos_ in parts_of_speech)

    if remove_sw:
        # a set makes each membership test O(1) instead of scanning the list
        stop_words = set(sw)
        out3 = ' '.join(word for word in out3.split() if word not in stop_words)

    return out3.strip()

  text = re.sub("p*\d", "", text)


In [20]:
# Load the review DataFrame from disk. The context manager closes the file
# handle as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open("DF_glassdoor.pickle_v2", "rb") as pickle_in:
    df = pickle.load(pickle_in)
df.head()

Unnamed: 0,review,rating
0,accomodating staff clean environment think con...,3.0
1,well know company philippine line money remitt...,5.0
2,know cebuana country big company pawnshop grea...,4.0
3,acra acra acra acra operation division family ...,3.0
4,salary always time much pressure especially se...,3.0


# LDA

In [21]:
import string 
import gensim
from gensim import corpora

In [5]:
def lda_prep(docs):
    """
    Creates a bag-of-words corpus and its dictionary for LDA application.

    Each document is tokenized on whitespace. ``str.split()`` without an
    argument is used so that empty documents and repeated spaces do not
    produce empty-string tokens that would end up in the vocabulary.

    Parameters
    ----------
    docs : list of strings
        One already-cleaned document per entry.

    Returns
    -------
    doc_term_matrix : list
        One ``dictionary.doc2bow`` result per document, in the order of
        ``docs``. Use this as input for the LDA model.
    dictionary : gensim.corpora.Dictionary
        Dictionary built from the tokenized documents; pass it to the LDA
        model as ``id2word`` and use it to convert new documents.
    """
    tokenized_docs = [doc.split() for doc in docs]
    dictionary = corpora.Dictionary(tokenized_docs)
    doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    return doc_term_matrix, dictionary

In [6]:
# Reviews as a plain Python list, one document per entry, in row order.
corp = df['review'].tolist()

In [7]:
# Bag-of-words corpus and dictionary built from the reviews; both are passed to the LDA model below.
dt_matrix, mdict = lda_prep(corp)

In [8]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Run and train the LDA model on the document-term matrix with 5 topics.
# LDA training is stochastic: a fixed random_state keeps topic ids and word
# weights the same after a kernel restart, so later cells that refer to topic
# numbers stay meaningful.
ldamodel = Lda(dt_matrix, num_topics=5, id2word=mdict, passes=50, random_state=42)

In [9]:
# Show each topic as a weighted list of its 7 top words.
ldamodel.print_topics(num_words=7)

[(0,
  '0.022*"company" + 0.019*"people" + 0.014*"employee" + 0.013*"great" + 0.011*"life" + 0.010*"manager" + 0.010*"make"'),
 (1,
  '0.018*"time" + 0.017*"jollibee" + 0.017*"customer" + 0.011*"always" + 0.011*"give" + 0.011*"service" + 0.010*"day"'),
 (2,
  '0.023*"employee" + 0.013*"get" + 0.012*"pay" + 0.011*"hour" + 0.011*"job" + 0.011*"people" + 0.010*"time"'),
 (3,
  '0.019*"benefit" + 0.014*"employee" + 0.012*"company" + 0.010*"great" + 0.009*"lot" + 0.009*"management" + 0.008*"give"'),
 (4,
  '0.023*"company" + 0.013*"time" + 0.011*"family" + 0.011*"help" + 0.009*"training" + 0.009*"lot" + 0.009*"salary"')]

In [13]:
# Score a raw sentence against the trained model (no clean_text() preprocessing here).
new_doc = "complexity science class has been cancelled"
new_str = new_doc.split(' ')
print(new_str)
# Convert the tokens to (token_id, count) pairs with the training dictionary.
# In the recorded output only one of the six tokens yields a pair.
new_doc_bow = mdict.doc2bow(new_str)
print(new_doc_bow)
# List of (topic_id, probability) pairs for this document.
probs = ldamodel.get_document_topics(new_doc_bow)
probs

['complexity', 'science', 'class', 'has', 'been', 'cancelled']
[(922, 1)]


[(0, 0.594819),
 (1, 0.10292091),
 (2, 0.100007154),
 (3, 0.102245696),
 (4, 0.10000727)]

In [14]:
# Order the (topic_id, probability) pairs in place, most probable first.
probs.sort(key=lambda pair: pair[1], reverse=True)
# The leading pair is now the dominant topic; display its id.
probs[0][0]

0

In [15]:
def get_topic(input_string):
    """
    Returns the id of the most probable LDA topic for a piece of raw text.

    The text is cleaned with ``clean_text``, converted to a bag of words with
    the module-level dictionary ``mdict`` and scored with the module-level
    model ``ldamodel``.

    Parameters
    ----------
    input_string : str
        Raw, uncleaned text.

    Returns
    -------
    int
        Id of the topic with the highest probability. On ties the topic
        listed first by the model is returned.
    """
    # split() without an argument returns [] for an empty cleaned string,
    # whereas split(' ') would hand the token '' to the dictionary
    tokens = clean_text(input_string).split()
    bow = mdict.doc2bow(tokens)
    # minimum_probability=0.0 asks for every topic, so the list cannot come
    # back empty regardless of how many topics the model has
    topic_probs = ldamodel.get_document_topics(bow, minimum_probability=0.0)
    # max() picks the first pair with the highest probability - the same pair
    # a stable descending sort would place first - without sorting the list
    best_topic_id, _ = max(topic_probs, key=lambda pair: pair[1])

    return best_topic_id

In [25]:
# Dominant topic id for an ad-hoc sentence.
get_topic("financial matters and eating for cryptocurrency")

1

In [33]:
# Dominant topic id for a second ad-hoc sentence, this one written like a job review.
get_topic("I love my job it is very tiring i want to work somewhere else")

0

In [31]:
# NOTE(review): df['review'] appears to hold already-cleaned text (see the
# preview of the frame after loading), and dt_matrix is the bag-of-words form
# of exactly these rows, in the same order (corp = list(df['review'])).
# Scoring those vectors directly avoids running clean_text()/spaCy over every
# review a second time and guarantees the tokens are the ones the model was
# trained on.
df['topic'] = [
    max(ldamodel.get_document_topics(bow), key=lambda pair: pair[1])[0]
    for bow in dt_matrix
]
df.head()

Unnamed: 0,review,rating,topic
0,accomodating staff clean environment think con...,3.0,0
1,well know company philippine line money remitt...,5.0,2
2,know cebuana country big company pawnshop grea...,4.0,4
3,acra acra acra acra operation division family ...,3.0,0
4,salary always time much pressure especially se...,3.0,4


In [10]:
import pyLDAvis
# NOTE(review): newer pyLDAvis releases reportedly expose this adapter as
# `pyLDAvis.gensim_models` - confirm against the installed version.
import pyLDAvis.gensim
# Switch pyLDAvis to notebook mode so prepared visualisations render inline.
pyLDAvis.enable_notebook()

Below, pyLDAvis is used to visualize the LDA results interactively.

In [11]:
# Build the pyLDAvis data from the trained model, the bag-of-words corpus and
# the dictionary; the bare name on the last line displays the result.
out5 = pyLDAvis.gensim.prepare(ldamodel, dt_matrix, mdict)
out5

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
