**Introduction:** The provided code performs topic modeling on a collection of book summaries using Latent Dirichlet Allocation (LDA) with Gensim. 

**1. Import necessary libraries**

In [47]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy
import pandas as pd
import re
import csv
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
from spacy.lang.en.stop_words import STOP_WORDS
import pyLDAvis.gensim

import os, re, operator, warnings
warnings.filterwarnings('ignore')
%matplotlib inline

2. **Data Acquisition:** For our book recommendation system, we used the CMU Book Summary Dataset from Kaggle (https://www.kaggle.com/datasets/ymaricar/cmu-book-summary-dataset). It contains plot summaries for 16 559 books extracted from Wikipedia, along with their metadata.

In [48]:
# Read and process data from the file
data = []
with open("booksummaries.txt", "r") as f:
    reader = csv.reader(f, dialect="excel-tab")
    for row in tqdm(reader):
        data.append(row)

16559it [00:03, 4497.43it/s]


In [49]:
# Extract relevant information from the data and create a DataFrame
book_index = []
book_id = []
book_author = []
book_name = []
summary = []
genre = []
a = 1
for i in tqdm(data):
    book_index.append(a)
    a = a + 1
    book_id.append(i[0])
    book_name.append(i[2])
    book_author.append(i[3])
    genre.append(i[5])
    summary.append(i[6])

book_df = pd.DataFrame(
    {
        "Index": book_index,
        "ID": book_id,
        "BookTitle": book_name,
        "Author": book_author,
        "Genre": genre,
        "Summary": summary,
    }
)
book_df.head()

100%|█████████████████████████████████| 16559/16559 [00:00<00:00, 148650.08it/s]


Unnamed: 0,Index,ID,BookTitle,Author,Genre,Summary
0,1,620,Animal Farm,George Orwell,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca..."
1,2,843,A Clockwork Orange,Anthony Burgess,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan..."
2,3,986,The Plague,Albert Camus,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...
3,4,1756,An Enquiry Concerning Human Understanding,David Hume,,The argument of the Enquiry proceeds by a ser...
4,5,2080,A Fire Upon the Deep,Vernor Vinge,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...


**3. Text Processing**

In [50]:
# Load the SpaCy English model
nlp = spacy.load("en_core_web_sm")

In [51]:
data = book_df['Summary'].values.tolist()

In [52]:
#removing punctuations and other characters
def preprocess(string):
    return re.sub('[^\w_\s-]', ' ',str(string))

data = list(map(preprocess,data))  

In [53]:
#data cleaning and lemmatization
lemma_doc = []
for datum in data:
    sent = nlp(str(datum).lower())
    text = []
    for w in sent:
        if not w.is_stop and not w.is_punct and not w.like_num and (len(str(w)) > 4):
            #adding the lemmatized version of the words
            text.append(w.lemma_)
    lemma_doc.append(text)
    

In [54]:
lemma_doc[0]

['major',
 'manor',
 'call',
 'animal',
 'meeting',
 'compare',
 'human',
 'parasite',
 'teach',
 'animal',
 'revolutionary',
 'beast',
 'england',
 'major',
 'young',
 'snowball',
 'napoleon',
 'assume',
 'command',
 'dream',
 'philosophy',
 'animal',
 'revolt',
 'drive',
 'drunken',
 'irresponsible',
 'jones',
 'rename',
 'animal',
 'adopt',
 'commandment',
 'animal',
 'important',
 'animal',
 'equal',
 'snowball',
 'attempt',
 'teach',
 'animal',
 'read',
 'write',
 'plentiful',
 'smoothly',
 'elevate',
 'position',
 'leadership',
 'aside',
 'special',
 'item',
 'ostensibly',
 'personal',
 'health',
 'napoleon',
 'take',
 'train',
 'privately',
 'napoleon',
 'snowball',
 'struggle',
 'leadership',
 'snowball',
 'announce',
 'plan',
 'build',
 'windmill',
 'napoleon',
 'chase',
 'snowball',
 'declare',
 'leader',
 'napoleon',
 'enact',
 'change',
 'governance',
 'structure',
 'replace',
 'meeting',
 'committee',
 'young',
 'name',
 'squealer',
 'mouthpiece',
 'napoleon',
 'claim',
 '

In [55]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(lemma_doc, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[lemma_doc], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[lemma_doc[0]]])

['major', 'manor', 'call', 'animal', 'meeting', 'compare', 'human', 'parasite', 'teach', 'animal', 'revolutionary', 'beast', 'england', 'major', 'young', 'snowball', 'napoleon', 'assume', 'command', 'dream', 'philosophy', 'animal', 'revolt', 'drive', 'drunken', 'irresponsible', 'jones', 'rename', 'animal', 'adopt', 'commandment', 'animal', 'important', 'animal', 'equal', 'snowball', 'attempt', 'teach', 'animal', 'read', 'write', 'plentiful', 'smoothly', 'elevate', 'position', 'leadership', 'aside', 'special', 'item', 'ostensibly', 'personal', 'health', 'napoleon', 'take', 'train', 'privately', 'napoleon', 'snowball', 'struggle', 'leadership', 'snowball', 'announce', 'plan', 'build', 'windmill', 'napoleon', 'chase', 'snowball', 'declare', 'leader', 'napoleon', 'enact', 'change', 'governance', 'structure', 'replace', 'meeting', 'committee', 'young', 'name', 'squealer', 'mouthpiece', 'napoleon', 'claim', 'credit', 'windmill', 'animal', 'hard', 'promise', 'easy', 'life', 'windmill', 'viole

**4. Create the Dictionary and Corpus needed for Topic Modeling**

In [56]:
#Creates Word to IDs mapping
word2id = corpora.Dictionary(lemma_doc)

In [57]:
# Creates bag of words and a corpus
documents = lemma_doc
corpus = [word2id.doc2bow(doc) for doc in documents]

print('Corpus sample')
sample = corpus[0]
for i in range(len(sample)):
    print('Word', sample[i][0], ':', word2id[sample[i][0]], ' || Number of occurences:', sample[i][1])

Corpus sample
Word 0 : abolish  || Number of occurences: 1
Word 1 : abuse  || Number of occurences: 2
Word 2 : account  || Number of occurences: 1
Word 3 : accusation  || Number of occurences: 1
Word 4 : accuse  || Number of occurences: 1
Word 5 : actual  || Number of occurences: 1
Word 6 : adapt  || Number of occurences: 1
Word 7 : add  || Number of occurences: 1
Word 8 : adopt  || Number of occurences: 2
Word 9 : alcohol  || Number of occurences: 4
Word 10 : alliance  || Number of occurences: 1
Word 11 : allusion  || Number of occurences: 1
Word 12 : alter  || Number of occurences: 1
Word 13 : alteration  || Number of occurences: 1
Word 14 : animal  || Number of occurences: 37
Word 15 : animalism  || Number of occurences: 2
Word 16 : annihilate  || Number of occurences: 1
Word 17 : announce  || Number of occurences: 2
Word 18 : anthem  || Number of occurences: 1
Word 19 : appear  || Number of occurences: 1
Word 20 : append  || Number of occurences: 1
Word 21 : argument  || Number of 

**5. Hierarchical Dirichlet Processing**

In [58]:
hdp = models.HdpModel(corpus,word2id)

In [59]:
hdp_topics = hdp.print_topics()
for topic in hdp_topics:
    print(topic)

(0, '0.004*find + 0.004*take + 0.004*return + 0.004*story + 0.003*father + 0.003*leave + 0.003*begin + 0.003*friend + 0.003*tell + 0.003*family')
(1, '0.004*find + 0.004*take + 0.004*return + 0.004*father + 0.003*story + 0.003*leave + 0.003*begin + 0.003*family + 0.003*friend + 0.003*tell')
(2, '0.005*harry + 0.004*return + 0.003*take + 0.003*find + 0.003*kill + 0.003*leave + 0.003*tell + 0.003*story + 0.003*father + 0.003*later')
(3, '0.004*story + 0.004*return + 0.003*friend + 0.003*father + 0.003*young + 0.003*mother + 0.003*family + 0.003*find + 0.003*take + 0.003*leave')
(4, '0.003*return + 0.003*find + 0.003*take + 0.003*leave + 0.003*family + 0.003*friend + 0.003*tell + 0.003*father + 0.002*kellen + 0.002*story')
(5, '0.002*take + 0.002*kill + 0.002*father + 0.002*carrie + 0.002*leave + 0.002*oedipus + 0.002*story + 0.002*year + 0.002*leama + 0.002*world')
(6, '0.003*candide + 0.003*siegfrie + 0.002*world + 0.002*animal + 0.002*gunther + 0.002*story + 0.002*kriemhild + 0.002*ish

In [60]:
print('HDP model created: '+str(len(hdp_topics))+' Topics')

HDP model created: 20 Topics


**6. Latent Dirichlet Allocation Model**

In [62]:
lda_model = LdaModel(corpus=corpus, id2word=word2id, num_topics=10, random_state=42, update_every=1, chunksize=100, 
                     passes=10, alpha='auto', per_word_topics=True)

In [63]:
lda_model.print_topics()

[(0,
  '0.018*"world" + 0.013*"power" + 0.009*"people" + 0.009*"human" + 0.008*"travel" + 0.008*"great" + 0.006*"dragon" + 0.006*"begin" + 0.006*"magic" + 0.006*"journey"'),
 (1,
  '0.019*"tell" + 0.016*"find" + 0.014*"leave" + 0.009*"night" + 0.009*"try" + 0.009*"take" + 0.008*"house" + 0.008*"arrive" + 0.008*"call" + 0.008*"decide"'),
 (2,
  '0.014*"human" + 0.014*"earth" + 0.010*"planet" + 0.008*"system" + 0.008*"space" + 0.006*"alien" + 0.006*"destroy" + 0.006*"control" + 0.005*"doctor" + 0.005*"world"'),
 (3,
  '0.122*"harry" + 0.018*"dresden" + 0.018*"undead" + 0.014*"werewolf" + 0.010*"burrow" + 0.008*"bosch" + 0.008*"ministry" + 0.007*"snowman" + 0.007*"tommy" + 0.006*"godmother"'),
 (4,
  '0.027*"danny" + 0.026*"malcolm" + 0.025*"carter" + 0.023*"terry" + 0.018*"barley" + 0.017*"spenser" + 0.013*"assignment" + 0.012*"cammie" + 0.012*"ravenpaw" + 0.011*"willie"'),
 (5,
  '0.016*"murder" + 0.010*"police" + 0.008*"london" + 0.007*"find" + 0.007*"death" + 0.007*"henry" + 0.006*"ca

**7. Vizualization of the topics**

In [64]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model,corpus,word2id)

In [65]:
len(corpus)

16559

In [66]:
def get_lda_vector(id):
    document_topics = lda_model.get_document_topics(
        corpus[id], minimum_probability=0
    )  # minimum_probability=0 is needed because otherwise the length of the returned vector is not always the same
    return [v for _, v in document_topics]

In [67]:
def get_nearest(title, top_n=3):
    # get index from book dataframe
    df = book_df.set_index("BookTitle")
    try:
        idx = np.where(np.array(df.index) == title)[0][0]
    except:
        print(f"Book {title} not found. Try again :)")
        return None
    topic_prob_title = get_lda_vector(idx)
    # select row of pairwise distances
    b = np.array(
        [
            -cosine_similarity([topic_prob_title], [get_lda_vector(i)])[0][0]
            for i in range(0, len(df))
        ]
    )
    # sort indices and return top n (note: 0 is the book itself)
    result_indices = b.argsort()[0 : top_n + 1]
    result_indices_filtered = [x for x in result_indices if x != idx]
    # print("You might want to read:")
    for book_title in df.index[result_indices_filtered].values:
        print(f"- {book_title}")

In [68]:
get_nearest("Dune")

- Heretics of Dune
- Sten Adventures Book 7: Vortex
- A Princess of Mars


In [69]:
get_nearest("Into the Wild")

- We
- Facial Justice
- Los Premios


In [70]:
get_nearest("Harry Potter and the Philosopher's Stone")

- Harry Potter and the Half-Blood Prince
- Ghost Story
- Harry Potter and the Goblet of Fire


In [71]:
get_nearest("James Bond: The Authorised Biography of 007")

- Firewall
- Windfall
- Walter


In [72]:
get_nearest("Harry Potter and the Half-Blood Prince")

- Harry Potter and the Philosopher's Stone
- Harry Potter and the Chamber of Secrets
- Ghost Story


In [73]:
get_nearest("War and Peace")

- Les Illusions perdues
- Soll und Haben
- Tread Softly in this Place


In [74]:
get_nearest("iWoz: Computer Geek to Cult Icon - How I Invented the Personal Computer, Co-Founded Apple, and Had Fun Doing It")

- White Noise
- Waking the Dead
- Tom Swift and His Air Scout


In [77]:
get_nearest("Children of Dune")

- Winds of Fury
- Strands of Starlight
- The Captives of Kaag
