# Dataset


In [1]:
import pandas as pd

In [3]:
# Load the tab-separated query file; every column is read as text.
dataset_df = pd.read_csv(
    r"/content/Query.tsv",
    sep='\t',
    encoding='ISO-8859-1',
    dtype=str,
)

In [4]:
# Preview the raw queries as loaded from the TSV file.
dataset_df

Unnamed: 0,Query
0,internet explorer
1,windowsmedia
2,microsoft
3,outlook.com
4,Preview window
...,...
244,meta
245,download internet explorer
246,encarta
247,windows live mail


In [5]:
# NOTE(review): this cell shows the same, unmodified frame as the preceding cell;
# it looks redundant and is a candidate for removal.
dataset_df

Unnamed: 0,Query
0,internet explorer
1,windowsmedia
2,microsoft
3,outlook.com
4,Preview window
...,...
244,meta
245,download internet explorer
246,encarta
247,windows live mail


# Dataset preprocessing

In [6]:
%%capture
!pip install -U gensim

In [7]:
from gensim.utils import tokenize
from gensim.parsing.preprocessing import preprocess_string,strip_tags,strip_punctuation,strip_numeric,remove_stopwords,strip_short
from gensim.corpora.dictionary import Dictionary
from gensim import models

In [8]:
# Print gensim's documentation for preprocess_string, including its default filter list.
help(preprocess_string)

Help on function preprocess_string in module gensim.parsing.preprocessing:

preprocess_string(s, filters=[<function <lambda> at 0x7a33170879a0>, <function strip_tags at 0x7a3317087400>, <function strip_punctuation at 0x7a3317087370>, <function strip_multiple_whitespaces at 0x7a33170876d0>, <function strip_numeric at 0x7a33170875b0>, <function remove_stopwords at 0x7a3317087250>, <function strip_short at 0x7a3317087490>, <function stem_text at 0x7a33170877f0>])
    Apply list of chosen filters to `s`.
    
    Default list of filters:
    
    * :func:`~gensim.parsing.preprocessing.strip_tags`,
    * :func:`~gensim.parsing.preprocessing.strip_punctuation`,
    * :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`,
    * :func:`~gensim.parsing.preprocessing.strip_numeric`,
    * :func:`~gensim.parsing.preprocessing.remove_stopwords`,
    * :func:`~gensim.parsing.preprocessing.strip_short`,
    * :func:`~gensim.parsing.preprocessing.stem_text`.
    
    Parameters
    -------

In [9]:
# Tokenise each query with gensim's default filter chain (which includes stemming).
dataset_df['Clean_Queries'] = dataset_df['Query'].map(preprocess_string)

In [10]:
# Inspect the tokens produced by the default filters next to the original queries.
dataset_df

Unnamed: 0,Query,Clean_Queries
0,internet explorer,"[internet, explor]"
1,windowsmedia,[windowsmedia]
2,microsoft,[microsoft]
3,outlook.com,"[outlook, com]"
4,Preview window,"[preview, window]"
...,...,...
244,meta,[meta]
245,download internet explorer,"[download, internet, explor]"
246,encarta,[encarta]
247,windows live mail,"[window, live, mail]"


In [11]:
# Custom filter chain: lower-case, strip tags and punctuation, drop stopwords and
# short tokens. Unlike the defaults it does not strip numerics or stem, so tokens
# keep their surface form.
filters = [
    str.lower,
    strip_tags,
    strip_punctuation,
    remove_stopwords,
    strip_short,
]
dataset_df['Clean_Queries1'] = dataset_df['Query'].apply(preprocess_string, filters=filters)

In [12]:
# Compare the default (stemmed) tokens with the custom, unstemmed ones.
dataset_df

Unnamed: 0,Query,Clean_Queries,Clean_Queries1
0,internet explorer,"[internet, explor]","[internet, explorer]"
1,windowsmedia,[windowsmedia],[windowsmedia]
2,microsoft,[microsoft],[microsoft]
3,outlook.com,"[outlook, com]","[outlook, com]"
4,Preview window,"[preview, window]","[preview, window]"
...,...,...,...
244,meta,[meta],[meta]
245,download internet explorer,"[download, internet, explor]","[download, internet, explorer]"
246,encarta,[encarta],[encarta]
247,windows live mail,"[window, live, mail]","[windows, live, mail]"


In [13]:
# Build the token <-> integer-id vocabulary from the unstemmed token lists.
dataset_dictionary = Dictionary(documents=dataset_df['Clean_Queries1'])

In [14]:
# Number of distinct tokens in the dictionary.
len(dataset_dictionary)

201

In [15]:
# Show the token -> integer id mapping held by the dictionary.
print(dataset_dictionary.token2id)

{'explorer': 0, 'internet': 1, 'windowsmedia': 2, 'microsoft': 3, 'com': 4, 'outlook': 5, 'preview': 6, 'window': 7, 'homepage': 8, 'msn': 9, 'hotm': 10, 'skype': 11, 'account': 12, 'google': 13, 'toolbar': 14, 'history': 15, 'manage': 16, 'flash': 17, 'object': 18, 'ocx': 19, 'shockwave': 20, 'windows': 21, 'java': 22, 'office': 23, 'setup': 24, 'www': 25, 'media': 26, 'player': 27, 'update': 28, 'settings': 29, 'download': 30, 'free': 31, 'app': 32, 'store': 33, 'sharepoint': 34, 'marinenet': 35, 'mil': 36, 'usmc': 37, 'essentials': 38, 'security': 39, 'login': 40, 'downloads': 41, 'office365': 42, 'defender': 43, 'backgrounds': 44, 'desktop': 45, 'menu': 46, 'w2express': 47, 'support': 48, 'firefox': 49, 'mozilla': 50, 'live': 51, 'password': 52, 'reset': 53, 'background': 54, 'xbox': 55, 'maker': 56, 'movie': 57, 'help': 58, 'word': 59, 'mail': 60, '360': 61, 'email': 62, 'express': 63, 'proxy': 64, 'server': 65, 'onedrive': 66, 'device': 67, 'manager': 68, 'apk': 69, 'extension': 

In [16]:
# Bag-of-words corpus: one doc2bow vector per query's token list.
dataset_corpus_bow = list(map(dataset_dictionary.doc2bow, dataset_df['Clean_Queries1']))

In [17]:
# Number of documents (queries) in the bag-of-words corpus.
len(dataset_corpus_bow)

249

In [18]:
# Bag-of-words vector for row 4 (the query "Preview window"), as (token_id, count) pairs.
print(dataset_corpus_bow[4])

[(6, 1), (7, 1)]


In [19]:
# Fit a TF-IDF model on the bag-of-words corpus, then re-weight that same corpus with it.
tfidf = models.TfidfModel(corpus=dataset_corpus_bow)
dataset_corpus_tfidf = tfidf[dataset_corpus_bow]

In [20]:
# Number of documents in the TF-IDF corpus.
len(dataset_corpus_tfidf)

249

In [21]:
# TF-IDF vector for row 4 (the query "Preview window"), as (token_id, weight) pairs.
print(dataset_corpus_tfidf[4])

[(6, 0.7071067811865475), (7, 0.7071067811865475)]


# Topic Modelling with Latent Dirichlet Allocation (LDA)


In [22]:
from gensim.models.ldamodel import LdaModel

In [23]:
# Fit a 20-topic LDA model on the bag-of-words corpus with a fixed random seed.
lda_bow = LdaModel(
    corpus=dataset_corpus_bow,
    id2word=dataset_dictionary,
    num_topics=20,
    random_state=0,
)



In [24]:
# List each LDA (bag-of-words) topic with its 8 highest-weighted words.
lda_topics_bow = lda_bow.print_topics(num_words=8)
for topic_id, topic_terms in lda_topics_bow:
    print((topic_id, topic_terms))

(0, '0.101*"windows" + 0.076*"explorer" + 0.076*"internet" + 0.051*"toolbar" + 0.051*"mail" + 0.051*"outlook" + 0.051*"google" + 0.026*"flash"')
(1, '0.087*"windows" + 0.058*"extension" + 0.058*"file" + 0.058*"download" + 0.058*"desktop" + 0.030*"365" + 0.030*"2014" + 0.030*"chrome"')
(2, '0.071*"homepage" + 0.071*"xbox" + 0.071*"com" + 0.036*"setup" + 0.036*"firefox" + 0.036*"browser" + 0.036*"support" + 0.036*"explorer"')
(3, '0.044*"changer" + 0.044*"bing" + 0.044*"explorer" + 0.044*"daily" + 0.044*"browsing" + 0.044*"background" + 0.044*"desktop" + 0.044*"office"')
(4, '0.046*"updates" + 0.046*"internet" + 0.046*"download" + 0.046*"outlook" + 0.046*"access" + 0.046*"explorer" + 0.046*"java" + 0.046*"web"')
(5, '0.073*"email" + 0.073*"com" + 0.037*"ask" + 0.037*"account" + 0.037*"page" + 0.037*"inbox" + 0.037*"home" + 0.037*"open"')
(6, '0.152*"windows" + 0.048*"password" + 0.024*"change" + 0.024*"skype" + 0.024*"bing" + 0.024*"google" + 0.024*"w2express" + 0.024*"reset"')
(7, '0.08

In [25]:
# Fit a second 20-topic LDA model, this time on the TF-IDF-weighted corpus.
# random_state=0 matches the bag-of-words LDA cell: without a fixed seed the model is
# initialised randomly, so the topics printed below would differ on every run.
lda_tfidf = LdaModel(
    dataset_corpus_tfidf,
    id2word=dataset_dictionary,
    num_topics=20,
    random_state=0,
)



In [26]:
# List each LDA (TF-IDF) topic with its 8 highest-weighted words.
lda_topics_tfidf = lda_tfidf.print_topics(num_words=8)
for topic_id, topic_terms in lda_topics_tfidf:
    print((topic_id, topic_terms))

(0, '0.056*"log" + 0.053*"account" + 0.043*"app" + 0.043*"iexplorer" + 0.043*"theme" + 0.043*"java" + 0.043*"miracast" + 0.034*"menu"')
(1, '0.075*"update" + 0.045*"windows" + 0.045*"download" + 0.034*"desktop" + 0.032*"explorer" + 0.032*"internet" + 0.030*"bing" + 0.029*"w2express"')
(2, '0.052*"desktop" + 0.052*"net" + 0.046*"downloads" + 0.043*"microsoft" + 0.040*"speakers" + 0.039*"help" + 0.035*"inbox" + 0.035*"bluetooth"')
(3, '0.055*"msn" + 0.055*"homepage" + 0.041*"drive" + 0.033*"antivirus" + 0.032*"acsm" + 0.031*"ssv" + 0.031*"email" + 0.025*"2014"')
(4, '0.054*"zune" + 0.054*"hotmil" + 0.054*"settings" + 0.054*"mse" + 0.054*"msconfig" + 0.054*"internet" + 0.039*"app" + 0.039*"security"')
(5, '0.067*"java" + 0.061*"history" + 0.057*"download" + 0.048*"wallpaper" + 0.048*"nslookup" + 0.048*"cookies" + 0.041*"directx" + 0.039*"delete"')
(6, '0.063*"msn" + 0.051*"xbox" + 0.045*"download" + 0.042*"microsoft" + 0.041*"account" + 0.031*"onedrive" + 0.031*"trovi" + 0.027*"music"')
(

# Topic Modelling with Latent Semantic Analysis/Indexing (LSA/LSI)



In [27]:
from gensim.models.lsimodel import LsiModel

In [28]:
# Fit a 20-topic LSI model on the bag-of-words corpus.
lsi_bow = LsiModel(
    corpus=dataset_corpus_bow,
    num_topics=20,
    id2word=dataset_dictionary,
)

In [29]:
# List each LSI (bag-of-words) topic with its 8 highest-weighted words.
lsi_topics_bow = lsi_bow.print_topics(num_words=8)
for topic_id, topic_terms in lsi_topics_bow:
    print((topic_id, topic_terms))

(0, '0.760*"com" + 0.366*"www" + 0.353*"microsoft" + 0.175*"windows" + 0.156*"outlook" + 0.135*"office" + 0.110*"java" + 0.108*"download"')
(1, '-0.886*"windows" + -0.204*"download" + 0.184*"com" + -0.168*"internet" + -0.168*"explorer" + -0.124*"update" + 0.088*"www" + -0.081*"free"')
(2, '0.650*"internet" + 0.649*"explorer" + -0.293*"windows" + 0.194*"download" + 0.096*"update" + 0.061*"homepage" + 0.052*"adblock" + 0.052*"browsing"')
(3, '-0.861*"microsoft" + 0.313*"com" + -0.165*"login" + 0.160*"www" + 0.135*"outlook" + -0.114*"365" + -0.095*"essentials" + -0.088*"downloads"')
(4, '-0.690*"extension" + -0.690*"file" + -0.082*"exe" + -0.077*"rtc" + -0.077*"lnk" + -0.077*"dmp" + -0.077*"acsm" + -0.077*"emz"')
(5, '-0.698*"download" + 0.312*"live" + -0.292*"skype" + -0.259*"java" + -0.201*"free" + 0.163*"xbox" + 0.145*"internet" + 0.144*"explorer"')
(6, '0.544*"live" + 0.541*"xbox" + 0.296*"download" + 0.283*"free" + 0.187*"games" + 0.187*"latest" + -0.169*"www" + 0.166*"account"')
(7,

In [30]:
# Fit a 20-topic LSI model on the TF-IDF-weighted corpus.
lsi_tfidf = LsiModel(
    corpus=dataset_corpus_tfidf,
    num_topics=20,
    id2word=dataset_dictionary,
)

In [31]:
# List each LSI (TF-IDF) topic with its 8 highest-weighted words.
lsi_topics_tfidf = lsi_tfidf.print_topics(num_words=8)
for topic_id, topic_terms in lsi_topics_tfidf:
    print((topic_id, topic_terms))

(0, '-0.614*"explorer" + -0.565*"internet" + -0.367*"windows" + -0.267*"download" + -0.207*"update" + -0.093*"com" + -0.086*"java" + -0.082*"skype"')
(1, '-0.546*"microsoft" + -0.495*"com" + -0.309*"www" + 0.233*"explorer" + 0.211*"internet" + -0.204*"windows" + -0.177*"outlook" + -0.152*"java"')
(2, '-0.815*"windows" + 0.270*"explorer" + 0.242*"internet" + 0.237*"microsoft" + 0.189*"com" + -0.130*"update" + 0.125*"www" + -0.095*"live"')
(3, '-0.655*"microsoft" + 0.384*"com" + 0.271*"outlook" + 0.264*"java" + 0.263*"www" + 0.180*"skype" + -0.154*"essentials" + -0.152*"login"')
(4, '-0.581*"xbox" + -0.520*"live" + 0.327*"download" + 0.318*"skype" + 0.232*"java" + -0.141*"mail" + -0.121*"account" + 0.115*"www"')
(5, '0.503*"skype" + 0.469*"download" + -0.425*"update" + 0.339*"xbox" + -0.222*"outlook" + 0.193*"free" + 0.149*"live" + -0.148*"java"')
(6, '0.672*"extension" + 0.672*"file" + 0.122*"exe" + 0.111*"rtc" + 0.109*"acsm" + 0.109*"deskthemepack" + 0.109*"apk" + 0.109*"lnk"')
(7, '-0

# Topic Modelling Visualization with pyLDAvis

In [32]:
%%capture
!pip install pyLDAvis

In [33]:
import pyLDAvis
import pyLDAvis.gensim_models

In [34]:
# Enable inline notebook display of pyLDAvis visualisations.
pyLDAvis.enable_notebook()

  and should_run_async(code)


In [35]:
# Prepare and display the interactive topic view for the bag-of-words LDA model.
vis_bow = pyLDAvis.gensim_models.prepare(
    topic_model=lda_bow,
    corpus=dataset_corpus_bow,
    dictionary=dataset_dictionary,
)
vis_bow

  and should_run_async(code)


In [None]:
# Prepare and display the interactive topic view for the TF-IDF LDA model.
# NOTE(review): this cell has no execution count in the saved notebook, so it was not
# run in the recorded session — confirm it works with the TF-IDF-weighted corpus.
vis_tfidf = pyLDAvis.gensim_models.prepare(lda_tfidf, dataset_corpus_tfidf, dataset_dictionary)
vis_tfidf

  and should_run_async(code)


# Model evaluation for Topic Modelling

In [36]:
from gensim.models import CoherenceModel

  and should_run_async(code)


In [37]:
# Tokenised queries used for coherence scoring; queries whose token list is empty
# after preprocessing are left out.
texts = [tokens for tokens in dataset_df['Clean_Queries1'] if tokens]

  and should_run_async(code)


In [38]:
# c_v coherence of the bag-of-words LDA model over the tokenised queries.
cm_lda_bow_cv = CoherenceModel(
    model=lda_bow,
    texts=texts,
    dictionary=dataset_dictionary,
    coherence='c_v',
)
cm_lda_bow_cv.get_coherence()

  and should_run_async(code)


0.6722894548901733

In [39]:
# c_v coherence of the bag-of-words LSI model over the tokenised queries.
cm_lsi_bow_cv = CoherenceModel(
    model=lsi_bow,
    texts=texts,
    dictionary=dataset_dictionary,
    coherence='c_v',
)
cm_lsi_bow_cv.get_coherence()

  and should_run_async(code)


0.6387972651957761