## Topic Modelling
This notebook contains a demo of LDA and LSA using the gensim library. The dataset's link can be found in the `BookSummaries_Link.md` file under the Data folder in Ch7.

In [16]:
# Import OS 
import os
# For NLTK virtual environments are high recommended and it requires python verisions higher than 3.5
# !pip install gensim
# !pip install nltk

In [5]:
import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
# from sklearn.feature_extraction.text import TfidfVectorizer
# stopwords = TfidfVectorizer(stop_words='english').get_stop_words()

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mccar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mccar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import tarfile

#tokenize, remove stopwords, non-alphabetic words, lowercase
def preprocess(textstring):
   stops =  set(stopwords.words('english'))
   # stops = set(stopwords)
   # print(textstring)
   tokens = word_tokenize(textstring)
   return [token.lower() for token in tokens if token.isalpha() and token not in stops]

# This is a sample path of your downloaded data set. This is currently set to a windows based path . 
# Please update it to your actual download path regradless of your choice of operating system 

if False:
   temp='../data/bigdata/booksumms/booksummaries.tar.gz' 
   tar = tarfile.open(temp, "r:gz")
   tar.extractall('../data/bigdata/booksumms/')      
   tar.close()
data_path = '../data/bigdata/booksumms/booksummaries.txt' 

summaries = []
for line in open(data_path, encoding="utf-8"):
   temp = line.split("\t")
   summaries.append(preprocess(temp[6]))

# Create a dictionary representation of the documents.

dictionary = Dictionary(summaries)

# Filter infrequent or too frequent words.

dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = [dictionary.doc2bow(summary) for summary in summaries]

# Make a index to word dictionary.

temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

#Train the topic model

model = LdaModel(corpus=corpus, id2word=id2word,iterations=400, num_topics=10)
top_topics = list(model.top_topics(corpus))
pprint(top_topics)


[([(0.007278158, 'she'),
   (0.00681306, 'he'),
   (0.005864592, 'mother'),
   (0.005447434, 'one'),
   (0.004145695, 'life'),
   (0.004128951, 'time'),
   (0.0040128184, 'father'),
   (0.004005924, 'family'),
   (0.0039365673, 'home'),
   (0.0038908832, 'school'),
   (0.0038257958, 'new'),
   (0.0037833296, 'back'),
   (0.0036737144, 'tells'),
   (0.003506203, 'also'),
   (0.0034893362, 'they'),
   (0.0032621392, 'go'),
   (0.0032428734, 'when'),
   (0.003189927, 'day'),
   (0.0030907076, 'book'),
   (0.0030232212, 'story')],
  -0.9380709233842918),
 ([(0.0060619954, 'king'),
   (0.0053629978, 'one'),
   (0.005105551, 'world'),
   (0.004732793, 'city'),
   (0.004453306, 'he'),
   (0.0041299853, 'they'),
   (0.003975691, 'find'),
   (0.003933881, 'in'),
   (0.0035775476, 'army'),
   (0.0033902617, 'way'),
   (0.0033655926, 'back'),
   (0.0032340211, 'battle'),
   (0.0030773696, 'also'),
   (0.0029530493, 'new'),
   (0.0029476401, 'two'),
   (0.0029461442, 'war'),
   (0.002928865, 'powe

In [8]:
for idx in range(10):
    print("Topic #%s:" % idx, model.print_topic(idx, 10))
print("=" * 20)

Topic #0: 0.006*"he" + 0.005*"police" + 0.005*"sam" + 0.004*"one" + 0.004*"man" + 0.003*"new" + 0.003*"president" + 0.003*"two" + 0.003*"in" + 0.003*"back"
Topic #1: 0.012*"ship" + 0.010*"earth" + 0.006*"planet" + 0.006*"space" + 0.006*"crew" + 0.005*"human" + 0.005*"they" + 0.005*"new" + 0.004*"alex" + 0.004*"one"
Topic #2: 0.007*"he" + 0.004*"one" + 0.004*"family" + 0.004*"life" + 0.004*"story" + 0.004*"two" + 0.003*"back" + 0.003*"time" + 0.003*"new" + 0.003*"michael"
Topic #3: 0.021*"david" + 0.012*"leo" + 0.012*"rachel" + 0.011*"mark" + 0.007*"martin" + 0.007*"hugh" + 0.007*"dexter" + 0.006*"toby" + 0.005*"anne" + 0.005*"malcolm"
Topic #4: 0.007*"she" + 0.007*"he" + 0.006*"mother" + 0.005*"one" + 0.004*"life" + 0.004*"time" + 0.004*"father" + 0.004*"family" + 0.004*"home" + 0.004*"school"
Topic #5: 0.006*"one" + 0.004*"he" + 0.004*"ship" + 0.004*"two" + 0.003*"in" + 0.003*"also" + 0.003*"time" + 0.003*"story" + 0.003*"however" + 0.003*"novel"
Topic #6: 0.008*"he" + 0.006*"father" 

In [9]:
from gensim.models import LsiModel
lsamodel = LsiModel(corpus, num_topics=10, id2word = id2word)  # train model

pprint(lsamodel.print_topics(num_topics=10, num_words=10))


[(0,
  '0.305*"he" + 0.215*"one" + 0.150*"she" + 0.140*"time" + 0.132*"back" + '
  '0.131*"also" + 0.127*"two" + 0.125*"they" + 0.123*"tells" + 0.118*"in"'),
 (1,
  '0.493*"tom" + 0.226*"sophia" + 0.182*"mrs" + 0.179*"house" + 0.160*"she" + '
  '0.153*"father" + 0.147*"mr" + 0.146*"he" + 0.139*"tells" + -0.126*"one"'),
 (2,
  '-0.557*"tom" + -0.252*"sophia" + 0.213*"she" + 0.190*"he" + -0.185*"mrs" + '
  '0.163*"tells" + 0.143*"mother" + -0.136*"mr" + -0.130*"western" + '
  '-0.102*"however"'),
 (3,
  '-0.233*"they" + -0.201*"ship" + 0.186*"he" + -0.183*"david" + -0.178*"back" '
  '+ -0.164*"tells" + 0.159*"family" + 0.158*"life" + -0.156*"find" + '
  '0.153*"narrator"'),
 (4,
  '0.663*"he" + -0.257*"mother" + -0.214*"she" + -0.196*"father" + '
  '-0.182*"family" + 0.121*"narrator" + 0.120*"monk" + -0.100*"novel" + '
  '-0.096*"school" + -0.095*"children"'),
 (5,
  '-0.487*"david" + 0.242*"king" + -0.168*"rosa" + -0.165*"book" + '
  '-0.125*"harlan" + 0.120*"he" + -0.114*"she" + -0.109

In [10]:
for idx in range(10):
    print("Topic #%s:" % idx, lsamodel.print_topic(idx, 10))
 
print("=" * 20)

Topic #0: 0.305*"he" + 0.215*"one" + 0.150*"she" + 0.140*"time" + 0.132*"back" + 0.131*"also" + 0.127*"two" + 0.125*"they" + 0.123*"tells" + 0.118*"in"
Topic #1: 0.493*"tom" + 0.226*"sophia" + 0.182*"mrs" + 0.179*"house" + 0.160*"she" + 0.153*"father" + 0.147*"mr" + 0.146*"he" + 0.139*"tells" + -0.126*"one"
Topic #2: -0.557*"tom" + -0.252*"sophia" + 0.213*"she" + 0.190*"he" + -0.185*"mrs" + 0.163*"tells" + 0.143*"mother" + -0.136*"mr" + -0.130*"western" + -0.102*"however"
Topic #3: -0.233*"they" + -0.201*"ship" + 0.186*"he" + -0.183*"david" + -0.178*"back" + -0.164*"tells" + 0.159*"family" + 0.158*"life" + -0.156*"find" + 0.153*"narrator"
Topic #4: 0.663*"he" + -0.257*"mother" + -0.214*"she" + -0.196*"father" + -0.182*"family" + 0.121*"narrator" + 0.120*"monk" + -0.100*"novel" + -0.096*"school" + -0.095*"children"
Topic #5: -0.487*"david" + 0.242*"king" + -0.168*"rosa" + -0.165*"book" + -0.125*"harlan" + 0.120*"he" + -0.114*"she" + -0.109*"gould" + 0.108*"anita" + -0.106*"would"
Topic 