In [12]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy
import pandas as pd
import re
import csv
from tqdm import tqdm

from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from spacy.lang.en.stop_words import STOP_WORDS
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [13]:
# Read and process data from the CSV file
data = []
with open("booksummaries.txt", "r") as f:
    reader = csv.reader(f, dialect="excel-tab")
    for row in tqdm(reader):
        data.append(row)

16559it [00:01, 10094.81it/s]


In [14]:
# Extract relevant information from the data and create a DataFrame
book_index = []
book_id = []
book_author = []
book_name = []
summary = []
genre = []
a = 1
for i in tqdm(data):
    book_index.append(a)
    a = a + 1
    book_id.append(i[0])
    book_name.append(i[2])
    book_author.append(i[3])
    genre.append(i[5])
    summary.append(i[6])

book_df = pd.DataFrame(
    {
        "Index": book_index,
        "ID": book_id,
        "BookTitle": book_name,
        "Author": book_author,
        "Genre": genre,
        "Summary": summary,
    }
)
book_df.head()

100%|█████████████████████████████████| 16559/16559 [00:00<00:00, 168431.28it/s]


Unnamed: 0,Index,ID,BookTitle,Author,Genre,Summary
0,1,620,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,2,843,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,3,986,The Plague,Albert Camus,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,4,1756,An Enquiry Concerning Human Understanding,David Hume,,The argument of the Enquiry proceeds by a ser...
4,5,2080,A Fire Upon the Deep,Vernor Vinge,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...


In [15]:
# Load the SpaCy English model
nlp = spacy.load("en_core_web_sm")

In [16]:
data = book_df['Summary'].values.tolist()

In [17]:
#removing punctuations and others characters
def preprocess(string):
    return re.sub('[^\w_\s-]', ' ',str(string))

data = list(map(preprocess,data))  

In [20]:
#data cleaning and lemmatization
lemma_doc = []
for datum in data:
    sent = nlp(str(datum).lower())
    text = []
    for w in sent:
        if not w.is_stop and not w.is_punct and not w.like_num and (len(str(w)) > 4):
            #adding the lemmatized version of the words
            text.append(w.lemma_)
    lemma_doc.append(text)
    

In [21]:
lemma_doc[0]

['major',
 'manor',
 'call',
 'animal',
 'meeting',
 'compare',
 'human',
 'parasite',
 'teach',
 'animal',
 'revolutionary',
 'beast',
 'england',
 'major',
 'young',
 'snowball',
 'napoleon',
 'assume',
 'command',
 'dream',
 'philosophy',
 'animal',
 'revolt',
 'drive',
 'drunken',
 'irresponsible',
 'jones',
 'rename',
 'animal',
 'adopt',
 'commandment',
 'animal',
 'important',
 'animal',
 'equal',
 'snowball',
 'attempt',
 'teach',
 'animal',
 'read',
 'write',
 'plentiful',
 'smoothly',
 'elevate',
 'position',
 'leadership',
 'aside',
 'special',
 'item',
 'ostensibly',
 'personal',
 'health',
 'napoleon',
 'take',
 'train',
 'privately',
 'napoleon',
 'snowball',
 'struggle',
 'leadership',
 'snowball',
 'announce',
 'plan',
 'build',
 'windmill',
 'napoleon',
 'chase',
 'snowball',
 'declare',
 'leader',
 'napoleon',
 'enact',
 'change',
 'governance',
 'structure',
 'replace',
 'meeting',
 'committee',
 'young',
 'name',
 'squealer',
 'mouthpiece',
 'napoleon',
 'claim',
 '

In [22]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(lemma_doc, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[lemma_doc], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[lemma_doc[0]]])

['major', 'manor', 'call', 'animal', 'meeting', 'compare', 'human', 'parasite', 'teach', 'animal', 'revolutionary', 'beast', 'england', 'major', 'young', 'snowball', 'napoleon', 'assume', 'command', 'dream', 'philosophy', 'animal', 'revolt', 'drive', 'drunken', 'irresponsible', 'jones', 'rename', 'animal', 'adopt', 'commandment', 'animal', 'important', 'animal', 'equal', 'snowball', 'attempt', 'teach', 'animal', 'read', 'write', 'plentiful', 'smoothly', 'elevate', 'position', 'leadership', 'aside', 'special', 'item', 'ostensibly', 'personal', 'health', 'napoleon', 'take', 'train', 'privately', 'napoleon', 'snowball', 'struggle', 'leadership', 'snowball', 'announce', 'plan', 'build', 'windmill', 'napoleon', 'chase', 'snowball', 'declare', 'leader', 'napoleon', 'enact', 'change', 'governance', 'structure', 'replace', 'meeting', 'committee', 'young', 'name', 'squealer', 'mouthpiece', 'napoleon', 'claim', 'credit', 'windmill', 'animal', 'hard', 'promise', 'easy', 'life', 'windmill', 'viole

In [23]:
#Creates Word to IDs mapping
word2id = corpora.Dictionary(lemma_doc)

In [24]:
# Creates bag of words and a corpus
documents = lemma_doc
corpus = [word2id.doc2bow(doc) for doc in documents]

print('Corpus sample')
sample = corpus[0]
for i in range(len(sample)):
    print('Word', sample[i][0], ':', word2id[sample[i][0]], ' || Number of occurences:', sample[i][1])

Corpus sample
Word 0 : abolish  || Number of occurences: 1
Word 1 : abuse  || Number of occurences: 2
Word 2 : account  || Number of occurences: 1
Word 3 : accusation  || Number of occurences: 1
Word 4 : accuse  || Number of occurences: 1
Word 5 : actual  || Number of occurences: 1
Word 6 : adapt  || Number of occurences: 1
Word 7 : add  || Number of occurences: 1
Word 8 : adopt  || Number of occurences: 2
Word 9 : alcohol  || Number of occurences: 4
Word 10 : alliance  || Number of occurences: 1
Word 11 : allusion  || Number of occurences: 1
Word 12 : alter  || Number of occurences: 1
Word 13 : alteration  || Number of occurences: 1
Word 14 : animal  || Number of occurences: 37
Word 15 : animalism  || Number of occurences: 2
Word 16 : annihilate  || Number of occurences: 1
Word 17 : announce  || Number of occurences: 2
Word 18 : anthem  || Number of occurences: 1
Word 19 : appear  || Number of occurences: 1
Word 20 : append  || Number of occurences: 1
Word 21 : argument  || Number of 

In [25]:
hdp = models.HdpModel(corpus,word2id)

In [26]:
hdp_topics = hdp.print_topics()
for topic in hdp_topics:
    print(topic)

(0, '0.004*find + 0.004*take + 0.004*return + 0.004*story + 0.003*father + 0.003*leave + 0.003*begin + 0.003*friend + 0.003*family + 0.003*tell')
(1, '0.004*take + 0.004*find + 0.004*return + 0.004*father + 0.004*story + 0.003*leave + 0.003*friend + 0.003*tell + 0.003*begin + 0.003*family')
(2, '0.004*take + 0.004*return + 0.004*father + 0.004*find + 0.003*leave + 0.003*friend + 0.003*story + 0.003*young + 0.003*begin + 0.003*house')
(3, '0.004*take + 0.004*return + 0.003*find + 0.003*leave + 0.003*kill + 0.003*story + 0.003*world + 0.003*begin + 0.003*tell + 0.002*child')
(4, '0.003*return + 0.003*family + 0.003*leave + 0.003*find + 0.003*story + 0.003*begin + 0.003*take + 0.003*father + 0.002*child + 0.002*tell')
(5, '0.003*eragon + 0.003*kill + 0.003*father + 0.003*return + 0.003*take + 0.002*dragon + 0.002*begin + 0.002*death + 0.002*leave + 0.002*force')
(6, '0.003*marlow + 0.003*kurtz + 0.003*letter + 0.002*chapter + 0.002*death + 0.002*house + 0.001*story + 0.001*find + 0.001*re

In [27]:
print('HDP model created: '+str(len(hdp_topics))+' Topics')

HDP model created: 20 Topics


In [28]:
lda_model = LdaModel(corpus=corpus, id2word=word2id, num_topics=5, random_state=42, update_every=1, chunksize=100, 
                     passes=10, alpha='auto', per_word_topics=True)

In [32]:
#Book - Topic Distribution for first Book
def get_article_topic_distribution(article):
    return lda.get_document_topics(article)
#Returns a list containing a list of tuple
#Each inner list corresponds to an article and each tuple refers to topicID and its corresponding probability  
map(get_article_topic_distribution, corpus)

<map at 0x13fda69b0>

In [33]:
lda_model.print_topics()

[(0,
  '0.010*"father" + 0.009*"family" + 0.009*"mother" + 0.008*"friend" + 0.007*"child" + 0.007*"tell" + 0.007*"young" + 0.006*"leave" + 0.006*"story" + 0.006*"house"'),
 (1,
  '0.008*"attack" + 0.008*"kill" + 0.007*"return" + 0.007*"escape" + 0.006*"force" + 0.006*"take" + 0.006*"leave" + 0.005*"find" + 0.005*"battle" + 0.005*"group"'),
 (2,
  '0.009*"world" + 0.008*"novel" + 0.007*"story" + 0.006*"people" + 0.005*"human" + 0.004*"character" + 0.004*"include" + 0.004*"chapter" + 0.004*"state" + 0.003*"earth"'),
 (3,
  '0.021*"artemis" + 0.015*"ayesha" + 0.011*"roman" + 0.009*"julius" + 0.008*"torak" + 0.006*"marcus" + 0.006*"caesar" + 0.005*"wraith" + 0.005*"nephilim" + 0.004*"eater"'),
 (4,
  '0.013*"murder" + 0.010*"police" + 0.009*"kill" + 0.008*"find" + 0.006*"harry" + 0.005*"vampire" + 0.005*"david" + 0.004*"discover" + 0.004*"crime" + 0.004*"death"')]

In [34]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model,corpus,word2id)