# Topic modelling
__Definition:__ A statistical model for discovering the abstract "topics" that occur in a collection of documents. 

For instance, we could go through all the articles published by a newspaper and find out what its top 5 topics are. 

_Spoiler: We won't do that._

Instead, we will try to extract the main topics of the King James Bible and see what they are.

Please download and extract the .csv file from [here](https://www.kaggle.com/phyred23/bibleverses/downloads/bible_data_set.csv) and place it in the root of the project. 

In [7]:
import re
from nltk.corpus import stopwords
# Archaic words that are very frequent in this text but are not part of
# NLTK's English stop word list.
otherWords = 'shall unto yee ye lord god thou thy thee said upon shalt saith'.split()

# Full stop word set: NLTK's English list plus the extra words above.
stopWordsSet = set(stopwords.words('english'))
stopWordsSet.update(otherWords)
# Compiled once at definition time; raw string so that `\w` is not treated
# as a (invalid) string escape sequence.
_WORD_PATTERN = re.compile(r'\w+')


def getWords(text, stopWords=None):
    """Return the words of *text* that are not stop words.

    Words are the maximal runs of `\\w` characters found in *text*, in order
    of appearance. No case folding is done here, so callers that want
    case-insensitive filtering must lower-case *text* themselves.

    Args:
        text: The string to tokenize.
        stopWords: Optional container of words to drop. When omitted, the
            module-level ``stopWordsSet`` is used.

    Returns:
        A list of the remaining words (duplicates are kept).
    """
    if stopWords is None:
        stopWords = stopWordsSet
    return [word for word in _WORD_PATTERN.findall(text) if word not in stopWords]

# Read the CSV line by line, strip the trailing newline, split every line on
# commas and skip the first (header) line. Lines without any comma are ignored.
print(2)  # leftover progress marker; kept so the cell output does not change
# The context manager makes sure the file handle is closed after reading.
with open('bible_data_set.csv', 'r') as csvFile:
    lines = [x.strip('\n').split(',') for x in csvFile if ',' in x][1:]

# Keep the first column as the document key (presumably the book name --
# confirm against the CSV header) and everything from the fifth column on as
# the text. The plain comma split above also cuts the text at its own commas,
# so the pieces are glued back together with spaces.
filteredLines = [[x[0], ' '.join(x[4:])] for x in lines]

# Collect the lower-cased, stop-word-filtered words of every row under its
# document key, so that each key ends up with one long list of words.
documentDictionary = {}
for key, text in filteredLines:
    documentDictionary.setdefault(key, []).extend(getWords(text.lower()))
        
from gensim import corpora, models
# Build the word <-> id mapping from all documents and store it on disk.
dictionary = corpora.Dictionary(documentDictionary.values())
dictionary.save('bible.dict')

# Turn every document (a list of words) into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, documentDictionary.values()))

# Fit a 10-topic LSI model on the bag-of-words corpus.
lsi = models.lsimodel.LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
)

# Show, for each of the ten topics, the words with the largest weights
# (weights can be positive or negative).
lsi.print_topics(10)


2


[(0,
  '0.552*"ye" + 0.245*"king" + 0.233*"israel" + 0.193*"man" + 0.190*"son" + 0.163*"hath" + 0.154*"house" + 0.153*"people" + 0.143*"come" + 0.135*"children"'),
 (1,
  '-0.777*"ye" + 0.321*"king" + 0.297*"son" + 0.177*"israel" + 0.144*"came" + 0.135*"house" + 0.131*"man" + 0.092*"people" + 0.087*"children" + 0.080*"david"'),
 (2,
  '-0.879*"son" + -0.189*"ye" + 0.153*"hath" + 0.131*"people" + 0.120*"israel" + 0.116*"one" + 0.102*"let" + 0.100*"us" + 0.086*"come" + 0.081*"every"'),
 (3,
  '-0.669*"king" + 0.468*"man" + 0.319*"hath" + 0.219*"son" + -0.170*"israel" + 0.130*"every" + -0.129*"ye" + 0.116*"us" + 0.108*"one" + 0.088*"let"'),
 (4,
  '-0.563*"israel" + -0.551*"children" + 0.440*"king" + 0.244*"hath" + 0.162*"man" + -0.118*"land" + 0.112*"let" + 0.083*"us" + -0.075*"moses" + -0.066*"son"'),
 (5,
  '-0.767*"hath" + 0.489*"man" + 0.195*"one" + 0.165*"every" + 0.140*"came" + -0.108*"us" + -0.093*"israel" + -0.084*"children" + -0.073*"son" + 0.069*"men"'),
 (6,
  '0.564*"man" + 0

In [8]:
# Fit a 10-topic LDA model on the same bag-of-words corpus.
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
# This model used to be bound to the name `lsi`; keep that binding so any code
# that still refers to `lsi` after this cell sees the same object as before.
lsi = lda

# print the highest-weighted words for each of the ten topics
# (the weights shown for this model are all non-negative)
lda.print_topics(10)

[(0,
  '0.021*"ye" + 0.021*"moses" + 0.018*"eat" + 0.016*"saying" + 0.015*"day" + 0.013*"spake" + 0.012*"came" + 0.012*"holy" + 0.011*"bread" + 0.011*"people"'),
 (1,
  '0.023*"sons" + 0.016*"son" + 0.015*"children" + 0.013*"speak" + 0.012*"ye" + 0.010*"israel" + 0.009*"man" + 0.009*"daughters" + 0.007*"wisdom" + 0.007*"men"'),
 (2,
  '0.024*"one" + 0.013*"every" + 0.013*"daughter" + 0.012*"man" + 0.011*"sin" + 0.011*"day" + 0.010*"young" + 0.009*"saul" + 0.009*"month" + 0.008*"beast"'),
 (3,
  '0.042*"king" + 0.031*"israel" + 0.027*"land" + 0.023*"came" + 0.019*"children" + 0.016*"david" + 0.016*"people" + 0.016*"men" + 0.014*"judah" + 0.013*"went"'),
 (4,
  '0.037*"ye" + 0.018*"jesus" + 0.016*"father" + 0.014*"christ" + 0.012*"hath" + 0.012*"us" + 0.012*"come" + 0.011*"man" + 0.011*"also" + 0.010*"things"'),
 (5,
  '0.018*"let" + 0.018*"hath" + 0.016*"us" + 0.015*"thine" + 0.015*"man" + 0.012*"go" + 0.011*"hast" + 0.010*"may" + 0.010*"people" + 0.010*"hand"'),
 (6,
  '0.017*"voice" +