## Loading Data

In [1]:
import pandas as pd

In [2]:
# Read the employee reviews CSV into a DataFrame.
reviews_csv = 'final_reviews.csv'
df = pd.read_csv(reviews_csv)

In [3]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Work/Life Balance,Culture & Values,Diversity and Inclusion,Career Opportunities,Compensation and Benefits,Senior Management,rating,Title,Current/Time,Job Line,pros,cons,Company,Date,Role
0,3.0,4.0,4.0,4.0,4.0,4.0,4,Continuously learning,"Current Employee, more than 1 year",15 May 2023 - Technology Lead - in Pune,Infosys offers best learnings. As the company ...,Expect great hike if you have come in limelight,Infosis,,
1,1.0,3.0,2.0,3.0,2.0,1.0,3,Management heirarcy,"Current Employee, more than 1 year",15 May 2023 - Associate Consultant - in Pune,Some people have no work some have a lot,"Less increment, with lot of pressure",Infosis,15 May 2023,Associate Consultant
2,5.0,5.0,5.0,5.0,5.0,5.0,5,Good,Former Employee,15 May 2023 - Analyst - in Pune,Good company to work with,Nothing bad as of while working with.,Infosis,15 May 2023,Analyst
3,4.0,4.0,4.0,4.0,4.0,4.0,4,Good company,"Former Employee, more than 1 year",15 May 2023 - Technology Analyst - in Bengaluru,Work life balance is good based on project,No cons found for my experience,Infosis,15 May 2023,Technology Analyst
4,5.0,5.0,5.0,5.0,4.0,5.0,5,Good company,"Current Employee, more than 3 years",14 May 2023 - Consultant - in Bengaluru,#NAME?,Annual Salary hike is very less,Infosis,14 May 2023,Consultant


# Data Cleaning

In [4]:
# Inspect the available column names.
df.columns

Index(['Work/Life Balance', 'Culture & Values', 'Diversity and Inclusion',
       'Career Opportunities', 'Compensation and Benefits',
       'Senior Management', 'rating', 'Title', 'Current/Time', 'Job Line',
       'pros', 'cons', 'Company', 'Date', 'Role'],
      dtype='object')

In [5]:
# Drop the rating and metadata columns; only the free-text 'pros' and 'cons' remain.
unused_columns = [
    'Work/Life Balance',
    'Culture & Values',
    'Diversity and Inclusion',
    'Career Opportunities',
    'Compensation and Benefits',
    'Senior Management',
    'rating',
    'Title',
    'Current/Time',
    'Job Line',
    'Company',
    'Date',
    'Role',
]
df = df.drop(columns=unused_columns)

In [6]:
# Preview the first ten rows of the remaining columns.
df.head(n=10)

Unnamed: 0,pros,cons
0,Infosys offers best learnings. As the company ...,Expect great hike if you have come in limelight
1,Some people have no work some have a lot,"Less increment, with lot of pressure"
2,Good company to work with,Nothing bad as of while working with.
3,Work life balance is good based on project,No cons found for my experience
4,#NAME?,Annual Salary hike is very less
5,you will get time for other things most of time,less salary and no learnings
6,Work life balance Flexibility on work front as...,"Politics and bloody politics, if your manager ..."
7,Work-life balance. No work pressure.,No scope for growth for freshers.
8,Good for experience people if you are in with ...,Project onboarding and off boarding specifical...
9,Projects and clients Flexibility in time Work ...,Very less recognition for the amount of work w...


In [7]:
# Combine pros and cons into one document per review.
# A missing value (NaN) on either side would make the whole concatenation NaN,
# which str() downstream turns into the literal word "nan"; treat it as empty text.
df['text'] = df['pros'].fillna('') + " " + df['cons'].fillna('')

In [8]:
# Preview the first five rows, now including the combined 'text' column.
df.head()

Unnamed: 0,pros,cons,text
0,Infosys offers best learnings. As the company ...,Expect great hike if you have come in limelight,Infosys offers best learnings. As the company ...
1,Some people have no work some have a lot,"Less increment, with lot of pressure",Some people have no work some have a lot Less ...
2,Good company to work with,Nothing bad as of while working with.,Good company to work with Nothing bad as of wh...
3,Work life balance is good based on project,No cons found for my experience,Work life balance is good based on project No ...
4,#NAME?,Annual Salary hike is very less,#NAME? Annual Salary hike is very less


In [9]:
# The separate 'pros' and 'cons' columns are no longer needed once 'text' exists.
merged_sources = ['pros', 'cons']
df = df.drop(columns=merged_sources)

In [10]:
# Spot-check five random rows; the seed keeps the displayed sample stable across re-runs.
df.sample(5, random_state=100)

Unnamed: 0,text
77714,Supportive team and manager. Had fun working. ...
151401,Less salaray in the company and Good work envi...
272022,Good working environment Scope for upscale Fle...
239291,Facility and infrastructure\nTravel Opportunit...
1212,Worklife balance Culture & Values Diversity & ...


# Remove punctuation/lower casing

In [13]:
# Load the regular expression library
import re

# Remove basic punctuation: commas, periods, exclamation marks and question marks.
# The pattern is a raw string so it is handed to the regex engine untouched by
# Python's string-escape processing.
punctuation = re.compile(r'[,.!?]')
df['text_processed'] = df['text'].map(lambda x: punctuation.sub('', str(x)))

# Convert the review text to lowercase
df['text_processed'] = df['text_processed'].str.lower()

# Show the first few processed reviews
df['text_processed'].head()

0    infosys offers best learnings as the company i...
1    some people have no work some have a lot less ...
2    good company to work with nothing bad as of wh...
3    work life balance is good based on project no ...
4                #name annual salary hike is very less
Name: text_processed, dtype: object

# Tokenize words and further clean-up text

In [15]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Every item of ``sentences`` is converted with ``str`` and tokenized by
    ``gensim.utils.simple_preprocess``; ``deacc=True`` strips accent marks
    from the tokens.
    """
    for raw_text in map(str, sentences):
        yield gensim.utils.simple_preprocess(raw_text, deacc=True)

# Tokenize every processed review into a list of words.
data = df['text_processed'].tolist()
data_words = list(sent_to_words(data))

# Preview the first 30 tokens of the first document.
print(data_words[0][:30])

['infosys', 'offers', 'best', 'learnings', 'as', 'the', 'company', 'is', 'huge', 'it', 'makes', 'you', 'interact', 'with', 'different', 'group', 'of', 'people', 'to', 'get', 'the', 'work', 'done', 'expect', 'great', 'hike', 'if', 'you', 'have', 'come']


# Phrase Modeling: Bigram and Trigram Models

In [16]:
# Build the bigram and trigram models
PHRASE_THRESHOLD = 100  # higher threshold -> fewer phrases
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=PHRASE_THRESHOLD)
trigram = gensim.models.Phrases(bigram[data_words], threshold=PHRASE_THRESHOLD)

# Phraser wrappers: a faster way to get a sentence clubbed as a trigram/bigram
Phraser = gensim.models.phrases.Phraser
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# Remove Stopwords, Make Bigrams and Lemmatize

In [17]:
# NLTK stop words
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# English stop words plus two extra terms specific to this corpus.
stop_words = stopwords.words('english') + ['company', 'review']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token listed in ``stop_words``.

    texts: iterable of documents; each one is converted with ``str`` and run
        through ``simple_preprocess`` before filtering.
    Returns a list with one list of remaining tokens per input document.
    """
    # Build a set once per call: membership tests are O(1) instead of a scan
    # over the whole stop-word list for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Run every document through ``bigram_mod`` and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Run every document through ``bigram_mod`` and then ``trigram_mod``; return a list."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    See https://spacy.io/api/annotation for the tag names.

    texts: iterable of token lists; each list is joined with spaces and parsed
        by the module-level spaCy pipeline ``nlp``, which must be loaded before
        this function is called.
    allowed_postags: collection of coarse POS tags (``token.pos_``) to keep.
    Returns a list with one list of lemmas per input document.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents as a batched stream, which avoids the
    # per-call overhead of invoking nlp() once for every document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [19]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 1.3 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2023-08-18 15:14:40.267851: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2023-08-18 15:14:40.268570: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-08-18 15:14:57.351638: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'nvcuda.dll'; dlerror: nvcuda.dll not found
2023-08-18 15:14:57.355845: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-08-18 15:14:57.361894: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: Luqmaan
2023-08-18 15:14:57.362064: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: Luqmaan


In [20]:
import spacy

# Load the small English pipeline with the parser and NER components disabled
# (they are not needed here, so skipping them is more efficient).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Remove stop words, then merge detected bigrams
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatize, keeping only nouns, adjectives, verbs and adverbs
kept_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=kept_postags)

# Preview up to 30 lemmas of the first document
print(data_lemmatized[0][:30])

['infosy', 'offer', 'good', 'learning', 'huge', 'make', 'interact', 'different', 'group', 'people', 'get', 'work', 'do', 'expect', 'great', 'hike', 'come', 'limelight']


# Data transformation: Corpus and Dictionary

In [21]:
import gensim.corpora as corpora

# Dictionary built from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents serve as the corpus texts
texts = data_lemmatized

# Bag-of-words form of every document, as produced by Dictionary.doc2bow
corpus = list(map(id2word.doc2bow, texts))

# Preview up to 30 entries of the first document
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]


# Base Model

In [22]:
# Build the baseline LDA model
base_lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True,
)
lda_model = gensim.models.LdaMulticore(**base_lda_params)


In [23]:
from pprint import pprint

# Show the top keywords of each topic
topic_keywords = lda_model.print_topics()
pprint(topic_keywords)

# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.116*"growth" + 0.108*"opportunity" + 0.060*"onsite" + 0.047*"good" + '
  '0.042*"career" + 0.032*"less" + 0.029*"slow" + 0.027*"salary" + '
  '0.023*"hike" + 0.021*"term"'),
 (1,
  '0.062*"work" + 0.044*"time" + 0.031*"hour" + 0.030*"shift" + 0.022*"long" + '
  '0.021*"day" + 0.018*"activity" + 0.017*"office" + 0.013*"use" + '
  '0.012*"location"'),
 (2,
  '0.209*"work" + 0.114*"life" + 0.105*"balance" + 0.095*"good" + 0.037*"hike" '
  '+ 0.034*"salary" + 0.033*"less" + 0.026*"pressure" + 0.017*"culture" + '
  '0.016*"home"'),
 (3,
  '0.080*"good" + 0.073*"work" + 0.057*"team" + 0.051*"project" + '
  '0.039*"manager" + 0.034*"management" + 0.028*"employee" + 0.027*"friendly" '
  '+ 0.025*"flexible" + 0.023*"hike"'),
 (4,
  '0.160*"salary" + 0.101*"hike" + 0.096*"less" + 0.072*"good" + 0.067*"job" + '
  '0.066*"low" + 0.034*"compare" + 0.029*"security" + 0.029*"market" + '
  '0.027*"increment"'),
 (5,
  '0.063*"learn" + 0.048*"good" + 0.048*"project" + 0.046*"technology" + '
 

# Compute Model Coherence Score

In [24]:
from gensim.models import CoherenceModel

# Score the baseline model with the c_v coherence measure
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.5424226675106311


# Hyperparameter Tuning

In [25]:
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train an LDA model with the given hyperparameters and return its c_v coherence.

    corpus: bag-of-words corpus to train on.
    dictionary: id-to-word mapping, used both for training and for scoring.
    k: number of topics.
    a: value passed to LdaMulticore as ``alpha``.
    b: value passed to LdaMulticore as ``eta``.

    Returns the value of ``CoherenceModel.get_coherence()``.

    Note: coherence is computed against the module-level ``data_lemmatized``
    texts, which must be defined before this function is called.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Score with the same dictionary the model was trained with, so the
    # function honours its `dictionary` argument rather than a notebook global.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

In [None]:
import itertools
from pathlib import Path

import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}

# Set to False to skip the (very slow) grid search when re-running the notebook
RUN_GRID_SEARCH = True

# Where the tuning results are written
RESULTS_PATH = Path('./results/lda_tuning_results.csv')

# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
               corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Can take a long time to run
if RUN_GRID_SEARCH:
    # Create the output directory up front so a missing folder cannot make the
    # final write fail after the whole search has already run.
    RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)

    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))

    # Same iteration order as nested loops: validation corpus -> number of
    # topics -> alpha -> beta.
    search_space = itertools.product(zip(corpus_title, corpus_sets), topics_range, alpha, beta)
    try:
        for (title, corpus_set), k, a, b in search_space:
            # get the coherence score for the given parameters
            cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                          k=k, a=a, b=b)
            # Save the model results
            model_results['Validation_Set'].append(title)
            model_results['Topics'].append(k)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(cv)

            pbar.update(1)
    finally:
        # Always release the progress bar and persist whatever was computed,
        # so an interruption or error does not discard the finished runs.
        pbar.close()
        pd.DataFrame(model_results).to_csv(RESULTS_PATH, index=False)

  0%|                                                                                          | 0/540 [00:00<?, ?it/s]

# Final Model