<a href="https://colab.research.google.com/github/m-haghighi/Topic-Modeling/blob/main/Gensim_Topic_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Import Libraries

In [None]:
# for text preprocessing
import re
import spacy
import pandas as pd
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import numpy for matrix operation
import numpy as np

# Importing Gensim
import gensim
from gensim import corpora


In [None]:
# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Download the NLTK data packages: English stop-word list, WordNet and Punkt.
import nltk

for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the final expression so the cell still displays the download result.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Build the combined stop-word set used during preprocessing:
#   Istopwords - whitespace-separated entries read from the project word list
#   stop       - NLTK's English stop words extended with Istopwords
# The file is read inside a `with` block so its handle is always closed
# (the bare open(...).read() form leaves closing to the garbage collector).
with open('/content/5303 stop-words_updated_V1.txt') as stopword_file:
    Istopwords = set(stopword_file.read().split())
stop = set(stopwords.words('english'))
stop.update(Istopwords)

In [None]:

# NOTE(review): `shape` is not referenced in any visible cell; this looks like
# an accidental auto-import. Kept for safety - delete once confirmed unused.
from numpy.ma.core import shape

# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed.
df = pd.read_csv('/content/A2_Corpus_G11.csv')
# One raw text per row of the `content` column. Rows with a missing value are
# dropped and the rest coerced to str, because the preprocessing step calls
# .lower() on every entry and would fail on NaN.
corpus = df['content'].dropna().astype(str).values

### 2. Text Preprocessing

Steps to preprocess text data:

1. Convert the text into lowercase
2. Split text into words
3. Remove the stop words
4. Remove punctuation, symbols and special characters
5. Normalize the words (lemmatization)

In [None]:
from string import punctuation

# WordNet lemmatizer used by get_hotwords to normalise the kept tokens.
lemma = WordNetLemmatizer()
# Load the English pipeline by its full package name: spaCy v3 removed the
# "en" shortcut link, so spacy.load("en") fails on current installs.
# Requires the model to be installed (python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")
def get_hotwords(text):
    """Return the lemmatized content words of ``text``.

    The text is lower-cased and tokenized/tagged with the spaCy pipeline
    ``nlp``. Tokens found in spaCy's stop-word list, in ``punctuation`` or in
    the custom ``stop`` set are skipped. Of the remaining tokens only proper
    nouns, adjectives, nouns and verbs are kept, and each is lemmatized with
    WordNet using the part of speech spaCy assigned to it.

    Args:
        text: Raw document text. A non-string value (for example NaN from an
            empty CSV cell) yields an empty list instead of raising.

    Returns:
        list[str]: Lemmas in document order, duplicates preserved.
    """
    # spaCy coarse POS tag -> WordNet POS code. WordNetLemmatizer.lemmatize
    # defaults to the noun POS, which leaves verbs and adjectives unchanged
    # (e.g. "demonstrates"), so the matching POS is passed explicitly.
    wordnet_pos = {'PROPN': 'n', 'ADJ': 'a', 'NOUN': 'n', 'VERB': 'v'}

    if not isinstance(text, str):
        return []

    result = []
    for token in nlp(text.lower()):
        word = token.text
        if word in nlp.Defaults.stop_words or word in punctuation or word in stop:
            continue
        pos = wordnet_pos.get(token.pos_)
        if pos is not None:
            result.append(lemma.lemmatize(word, pos))
    return result
# Preprocess every document; the result is one list of kept tokens per document.
clean_corpus = [get_hotwords(document) for document in corpus]
# Preview the first few documents instead of printing the whole corpus.
print(clean_corpus[:5])

[['solution', 'problem', 'meaning', 'keep', 'delighted', 'discovery', 'direction'], ['meaningful', 'communicate', 'meaning'], ['meaning', 'divided', 'symbolic', 'meaning', 'researcher', 'symbolic', 'researcher', 'categorization', 'definition', 'consensus', 'product', 'pure', 'functionality', 'utility', 'intangible', 'value', 'intangible', 'value', 'socio', 'symbolism', 'experience'], ['order', 'differentiate', 'type', 'norman', 'verganti', 'define', 'incremental', 'improvement', 'solution', 'radical', 'change'], ['enablers', 'barrier', 'study', 'swedish', 'furniture', 'company', 'insight', 'enablers', 'barrier', 'issue', 'development', 'link', 'demonstrate', 'trade', 'offs', 'furnitureco', 'encounter', 'development', 'color', 'material', 'matrix', 'company', 'manage', 'furniture', 'development', 'form', 'establishing', 'collaboration', 'designer'], ['enablers', 'barrier', 'connection', 'interaction', 'relevance', 'meaning'], ['limitation', 'material', 'choice', 'flexibility', 'supplier

In [None]:
# Show only the first few cleaned documents; displaying the full list floods the output.
clean_corpus[:5]

[['solution',
  'problem',
  'meaning',
  'keep',
  'delighted',
  'discovery',
  'direction'],
 ['meaningful', 'communicate', 'meaning'],
 ['meaning',
  'divided',
  'symbolic',
  'meaning',
  'researcher',
  'symbolic',
  'researcher',
  'categorization',
  'definition',
  'consensus',
  'product',
  'pure',
  'functionality',
  'utility',
  'intangible',
  'value',
  'intangible',
  'value',
  'socio',
  'symbolism',
  'experience'],
 ['order',
  'differentiate',
  'type',
  'norman',
  'verganti',
  'define',
  'incremental',
  'improvement',
  'solution',
  'radical',
  'change'],
 ['enablers',
  'barrier',
  'study',
  'swedish',
  'furniture',
  'company',
  'insight',
  'enablers',
  'barrier',
  'issue',
  'development',
  'link',
  'demonstrate',
  'trade',
  'offs',
  'furnitureco',
  'encounter',
  'development',
  'color',
  'material',
  'matrix',
  'company',
  'manage',
  'furniture',
  'development',
  'form',
  'establishing',
  'collaboration',
  'designer'],
 ['enab

### 3. Creating Document Term Matrix

With gensim we don't need to build the Document Term Matrix (DTM) from scratch: the library has an internal mechanism to create it.

The only requirement of the gensim package is that the cleaned data is passed in as lists of tokenized words.

In [None]:
# Build the gensim term dictionary for the corpus: every unique token found in
# clean_corpus is assigned an integer id.
dict_ = corpora.Dictionary(documents=clean_corpus)

# The printed summary shows the vocabulary size and a few sample tokens.
print(dict_)

Dictionary(2581 unique tokens: ['delighted', 'direction', 'discovery', 'keep', 'meaning']...)


In [None]:
# Report the vocabulary size from the dictionary itself (a hard-coded count in
# a comment goes stale as soon as the corpus changes) and preview the first
# tokens rather than printing the whole vocabulary.
print(f"{len(dict_)} unique tokens in the dictionary")

for token in list(dict_.values())[:20]:
    print(token)

delighted
direction
discovery
keep
meaning
problem
solution
communicate
meaningful
categorization
consensus
definition
divided
experience
functionality
intangible
product
pure
researcher
socio
symbolic
symbolism
utility
value
change
define
differentiate
improvement
incremental
norman
order
radical
type
verganti
barrier
collaboration
color
company
demonstrate
designer
development
enablers
encounter
establishing
form
furniture
furnitureco
insight
issue
link
manage
material
matrix
offs
study
swedish
trade
connection
interaction
relevance
addressing
ambiguous
capturing
choice
communication
consumer
demonstrates
flexibility
hinders
interfere
interpreting
limitation
phase
prevents
strengthen
supplier
tangible
channel
learning
match
operate
opposing
sort
taste
yield
community
consideration
login
lurker
lurking
post.four
reason
special
user
contribute
factor
identity
impression
influence
motivational
norm
pro
reciprocity
reputation
sharing
spend
usability
willingness
critical
entangled
formed


The next step is to convert the corpus (the list of documents) into a document-term matrix using the dictionary prepared above. (The vectorizer used here is Bag of Words.)

In [None]:
# Convert each tokenized document into its bag-of-words form using the
# dictionary: doc2bow returns a list of (token_id, count) pairs per document.
doc_term_matrix = [dict_.doc2bow(tokens) for tokens in clean_corpus]
# Display only the first few documents; the full matrix is too long to read.
doc_term_matrix[:5]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(4, 1), (7, 1), (8, 1)],
 [(4, 2),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 2),
  (16, 1),
  (17, 1),
  (18, 2),
  (19, 1),
  (20, 2),
  (21, 1),
  (22, 1),
  (23, 2)],
 [(6, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1)],
 [(34, 2),
  (35, 1),
  (36, 1),
  (37, 2),
  (38, 1),
  (39, 1),
  (40, 3),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1)],
 [(4, 1), (34, 1), (41, 1), (57, 1), (58, 1), (59, 1)],
 [(4, 3),
  (15, 1),
  (16, 1),
  (34, 1),
  (37, 2),
  (40, 1),
  (41, 1),
  (51, 1),
  (54, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 3),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1)],
 [(16, 3),
  (65, 2),


How to read the output:

* Each document is a list of `(token id, frequency)` pairs.
* In the first document, for example, the token with id 0 occurs once, the token with id 1 occurs once, and so on.

### 4. Implementation of LDA

In [None]:
# Short alias for gensim's LDA model class; it is instantiated in the training cell.
Lda = gensim.models.LdaModel

In [None]:
# Fit the LDA model on the bag-of-words corpus.
# random_state is set explicitly so repeated runs use the same seed.
ldamodel = Lda(
    corpus=doc_term_matrix,
    id2word=dict_,
    num_topics=35,
    chunksize=100,
    passes=5,
    eval_every=0,
    random_state=0,
)

In [None]:
# Raise the pandas row-display limit to 4000 rows.
pd.set_option("display.max_rows", 4000)

In [None]:
# Show a sample of the learned topics as (topic_id, "weight*word + ...") pairs.
# These are print_topics' default arguments (20 topics, 10 words each), so only
# part of the 35 trained topics is listed and the ids do not appear in order.

ldamodel.print_topics(num_topics=20, num_words=10)

# The topics still have to be checked manually to see whether they differ from one another.

[(15,
  '0.158*"industry" + 0.094*"opportunity" + 0.054*"investor" + 0.048*"development" + 0.019*"company" + 0.018*"associated" + 0.018*"strategy" + 0.016*"economy" + 0.014*"founder" + 0.012*"bundle"'),
 (11,
  '0.240*"customer" + 0.146*"firm" + 0.063*"offering" + 0.051*"capture" + 0.019*"stakeholder" + 0.016*"activity" + 0.014*"mechanism" + 0.013*"company" + 0.013*"perspective" + 0.012*"architecture"'),
 (20,
  '0.056*"member" + 0.027*"participation" + 0.026*"community" + 0.022*"explain" + 0.020*"tenant" + 0.020*"p.2" + 0.020*"return" + 0.018*"stage" + 0.015*"running" + 0.014*"variation"'),
 (17,
  '0.078*"communication" + 0.044*"area" + 0.041*"trade" + 0.026*"feedback" + 0.023*"point" + 0.017*"translate" + 0.017*"executive" + 0.017*"trained" + 0.016*"working" + 0.014*"facilitate"'),
 (34,
  '0.175*"performance" + 0.046*"problem" + 0.040*"novel" + 0.033*"solution" + 0.027*"category" + 0.026*"offer" + 0.021*"link" + 0.016*"effort" + 0.012*"difficult" + 0.012*"author"'),
 (19,
  '0.071*

### 4a. Extracting Topics from the Corpus 

In [None]:
# Print every topic of the trained model.
# num_topics: how many topics to extract. It is read from the model rather than
#             hard-coded, so this cell keeps listing all topics if the
#             training configuration changes.
# num_words:  how many top words to show per topic.
print(ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=10))

[(0, '0.160*"asset" + 0.101*"scaling" + 0.022*"conflicting" + 0.022*"firm" + 0.020*"specialization" + 0.019*"organisation" + 0.016*"necessity" + 0.015*"country" + 0.013*"capture" + 0.011*"inoculate"'), (1, '0.143*"management" + 0.061*"practice" + 0.033*"pattern" + 0.029*"partnering" + 0.026*"attention" + 0.018*"implication" + 0.017*"technique" + 0.016*"bpm" + 0.015*"respond" + 0.012*"reward"'), (2, '0.062*"company" + 0.041*"role" + 0.035*"operation" + 0.034*"researcher" + 0.034*"objective" + 0.033*"group" + 0.026*"meet" + 0.025*"target" + 0.024*"engagement" + 0.024*"unique"'), (3, '0.089*"structure" + 0.050*"component" + 0.039*"achieve" + 0.037*"process" + 0.036*"perception" + 0.035*"delivery" + 0.031*"proposition" + 0.029*"activity" + 0.028*"model" + 0.021*"provision"'), (4, '0.101*"change" + 0.083*"cost" + 0.038*"effect" + 0.037*"health" + 0.028*"issue" + 0.027*"indicator" + 0.022*"education" + 0.022*"pressure" + 0.017*"sector" + 0.016*"release"'), (5, '0.058*"cost" + 0.055*"benefit"

In [None]:
!pip install  pyLDAvis

In [None]:
# Build the interactive pyLDAvis view of the trained model and render it inline.
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel,
    corpus=doc_term_matrix,
    dictionary=dict_,
)
vis

  from collections import Iterable


-----------------------