In [30]:
import nltk
import gensim
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from stop_words import get_stop_words
import pandas as pd
import json
from nltk.tokenize import RegexpTokenizer
en_stop = set(get_stop_words('en'))
tokenizer = RegexpTokenizer(r'\w+')

In [60]:
polClean=json.load(open("privacyClean.json"))

polData=pd.DataFrame.from_dict(polClean, orient="index", dtype='unicode')
polData.columns=["Policy"]

polData.head()

Unnamed: 0,Policy
4shared.com,"This privacy policy (""Policy"") explains how p..."
www_venere.com,\n\n\nPriceGrabber Terms of Use and Privacy St...
brookstone.com,Privacy Policy &amp.;\nSecurity Effective Jun...
www_sitepoint.com,\n\n\nPriceGrabber Terms of Use and Privacy St...
haven.com,Privacy Policy \n\nThis Privacy Statement set...


In [26]:
polText=polData["Policy"].values

In [29]:
%%time
polTextLow=[text.lower() for text in polText]
polTokens=[tokenizer.tokenize(text) for text in polTextLow]

CPU times: user 3.81 s, sys: 1.08 s, total: 4.89 s
Wall time: 4.88 s


In [33]:
%%time
# remove the stop words
polTokenNoStop=map(lambda tokenlist: [token for token in tokenlist if token not in en_stop], polTokens)

CPU times: user 971 ms, sys: 465 ms, total: 1.44 s
Wall time: 1.43 s


In [37]:
%%time
from nltk.stem.porter import PorterStemmer
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
# Stem the token
polStem=map(lambda tokenlist:[p_stemmer.stem(token) for token in tokenlist],polTokenNoStop)

CPU times: user 46.6 s, sys: 1.57 s, total: 48.2 s
Wall time: 48 s


## Topic Modeling 

In [42]:
from gensim import corpora, models
# assign unique integers
dictionary = corpora.Dictionary(polStem)


In [46]:
%%time
# Changing to bag of words
bow=[dictionary.doc2bow(text) for text in polStem]

CPU times: user 2.36 s, sys: 619 ms, total: 2.98 s
Wall time: 2.94 s


In [47]:
%%time
ldamodel = gensim.models.ldamodel.LdaModel(bow, num_topics=5, id2word = dictionary, passes=100)

CPU times: user 1h 26min 36s, sys: 25.7 s, total: 1h 27min 1s
Wall time: 1h 11min 30s


In [49]:
print(ldamodel.print_topics(num_topics=5, num_words=4))

[(0, u'0.029*"de" + 0.013*"e" + 0.013*"o" + 0.012*"que"'), (1, u'0.024*"microsoft" + 0.024*"data" + 0.018*"servic" + 0.013*"can"'), (2, u'0.039*"pricegrabb" + 0.030*"site" + 0.020*"use" + 0.017*"may"'), (3, u'0.045*"inform" + 0.023*"use" + 0.022*"may" + 0.019*"servic"'), (4, u'0.061*"de" + 0.026*"le" + 0.020*"et" + 0.019*"\xe0"')]


In [57]:
%%time
ldamodel = gensim.models.ldamodel.LdaModel(bow, num_topics=10, id2word = dictionary, passes=50)

CPU times: user 1h 39min 53s, sys: 1min 44s, total: 1h 41min 37s
Wall time: 39min 47s


In [58]:
#ldamodel2=ldamodel

In [59]:
print(ldamodel2.print_topics(num_topics=10))

[(0, u'0.018*"servic" + 0.013*"inform" + 0.013*"may" + 0.012*"inde" + 0.012*"use" + 0.011*"websit" + 0.011*"us" + 0.009*"provid" + 0.008*"s" + 0.008*"job"'), (1, u'0.036*"site" + 0.032*"pricegrabb" + 0.022*"may" + 0.020*"inform" + 0.017*"use" + 0.014*"parti" + 0.012*"third" + 0.010*"merchant" + 0.010*"s" + 0.010*"provid"'), (2, u'0.020*"use" + 0.016*"servic" + 0.016*"site" + 0.014*"content" + 0.013*"may" + 0.012*"inform" + 0.011*"user" + 0.011*"term" + 0.010*"right" + 0.009*"s"'), (3, u'0.029*"die" + 0.024*"und" + 0.022*"der" + 0.018*"sie" + 0.015*"gm" + 0.013*"oder" + 0.012*"von" + 0.012*"zu" + 0.010*"de" + 0.010*"daten"'), (4, u'0.057*"amp" + 0.021*"gov" + 0.012*"avg" + 0.012*"see" + 0.010*"depot" + 0.009*"product" + 0.009*"offic" + 0.008*"beemp3" + 0.008*"busi" + 0.007*"hous"'), (5, u'0.039*"pricegrabb" + 0.030*"site" + 0.022*"use" + 0.017*"inform" + 0.016*"parti" + 0.014*"may" + 0.013*"merchant" + 0.011*"s" + 0.011*"third" + 0.011*"review"'), (6, u'0.056*"de" + 0.019*"le" + 0.014*"

## Try without stemming

In [61]:
dictionaryNoStem = corpora.Dictionary(polTokenNoStop)
bowNoStop=[dictionary.doc2bow(text) for text in polStem]

In [63]:
%%time
ldamodelNoStop = gensim.models.ldamodel.LdaModel(bowNoStop, num_topics=10, id2word = dictionaryNoStem, passes=50)

CPU times: user 1h 40min 26s, sys: 1min 43s, total: 1h 42min 10s
Wall time: 40min 4s


In [67]:
topicsNoStop=ldamodelNoStop.print_topics(num_topics=10)

In [69]:
ldamodelNoStop20 = gensim.models.ldamodel.LdaModel(bowNoStop, num_topics=20, id2word = dictionaryNoStem, passes=100)

In [70]:
topicsNoStop20=ldamodelNoStop20.print_topics(num_topics=20)

In [71]:
topicsNoStop20

[(0,
  u'0.152*"improve" + 0.050*"criptografamos" + 0.024*"forwarded" + 0.008*"wish" + 0.007*"intercepting" + 0.007*"vulnerabilities" + 0.005*"checked" + 0.004*"1290" + 0.004*"marilyn" + 0.004*"endobj"'),
 (1,
  u'0.014*"reporters" + 0.014*"segment" + 0.012*"pspmainhowtocontactusmodulepspmainmicrosofthealthandbandmodule" + 0.011*"component" + 0.010*"massachusetts" + 0.007*"sensors" + 0.006*"apparent" + 0.005*"522360" + 0.005*"setup" + 0.004*"webmarketing"'),
 (2,
  u'0.033*"thirty" + 0.029*"company" + 0.018*"reject" + 0.018*"information" + 0.016*"files" + 0.014*"processes" + 0.013*"track" + 0.012*"sale" + 0.012*"means" + 0.012*"absolute"'),
 (3,
  u'0.030*"plays" + 0.025*"lethal" + 0.023*"_atuvc" + 0.019*"podcast" + 0.014*"carriers" + 0.013*"lifestyle" + 0.013*"truste" + 0.011*"reveal" + 0.010*"strategic" + 0.010*"s_vi"'),
 (4,
  u'0.036*"see" + 0.020*"means" + 0.011*"exclusive" + 0.009*"contract" + 0.009*"apply" + 0.008*"control" + 0.008*"adjust" + 0.007*"read" + 0.007*"web" + 0.007*"