### Topic Modeling with LDA after removing Named Entities (using TF-IDF)

In [19]:
from gensim import corpora, models
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
import pyLDAvis
import pyLDAvis.gensim_models

In [1]:
import pickle

# Load the processed corpus from the .pkl file
with open("../processed_corpus.pkl", "rb") as f:
    processed_corpus = pickle.load(f)

In [2]:
with open("../unique_named_entities.pkl", "rb") as f:
    unique_named_entities = pickle.load(f)

In [3]:
#filtered_tokens_lists = []          # we want a list of tokens per tragedy
#for tragedy in processed corpus:
  #  filtered_tokens = []       # initialize a list for the tragedy being iterated over
  #  for token in tragedy["tokens"]:
        #if token.lower() not in unique_named_entities:
          #  filtered_tokens.append(token)
   # filtered_token_lists.append(filtered_tokens)

# Can also be written as
# filtered_tokens_lists = []
# for tragedy in processed_corpus:
    # filtered = [token for token in tragedy["tokens"] if token.lower() not in unique_named_entities]  # make the if a list comprehension
    # filtered_tokens_lists.append(filtered)


#list comprehension within a list comprehension

filtered_token_lists = [
    [token for token in tragedy["tokens"] if token.lower() not in unique_named_entities]
    for tragedy in processed_corpus
]

In [5]:
# Extract token lists from processed_corpus
token_lists = [tragedy["tokens"] for tragedy in processed_corpus]

for i in range(11):
    print(len(token_lists[i])) #check length of tokens for each play before removing the named entities

2638
4695
5190
4030
5462
3915
3613
4321
4241
7543
2422


In [4]:
for i in range(11):
    print(len(filtered_token_lists[i])) #check length of tokens for each play

1948
3282
3966
2994
4075
2855
2627
3307
3209
5313
1734


In [32]:
# Step 1: Create dictionary and BoW corpus from processed tokens

# Create a dictionary from the tokenized texts
dictionary = corpora.Dictionary(filtered_token_lists)

# Filter out extreme tokens (optional)
dictionary.filter_extremes(no_below=2) #no_below=2 removes words that appear in fewer than 2 plays #no_above=0.5 removes words that appear in more than 50% of the plays

# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(tokens) for tokens in filtered_token_lists]

In [33]:
#pickle.dump(corpus, open('ner_corpus.pkl', 'wb'))
#dictionary.save('ner_dictionary.gensim')

In [34]:
# Step 2: Apply TF-IDF model to BoW corpus
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

# Step 3: Run LDA on the TF-IDF weighted corpus
lda_tfidf_model = models.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=3,
    passes=20
)

In [35]:
lda_tfidf_model.show_topics()

[(0,
  '0.002*"princeps" + 0.001*"temere" + 0.001*"lis" + 0.001*"mensa" + 0.001*"classis" + 0.001*"actus" + 0.001*"plebs" + 0.001*"lacero" + 0.001*"epulae" + 0.001*"forum"'),
 (1,
  '0.001*"uoltus" + 0.001*"eruo" + 0.001*"aspectus" + 0.001*"maereo" + 0.001*"bustum" + 0.001*"classis" + 0.001*"decem" + 0.001*"uolucer" + 0.001*"colus" + 0.001*"fragor"'),
 (2,
  '0.001*"peruro" + 0.001*"paeniteo" + 0.001*"profugio" + 0.001*"furiosus" + 0.001*"carina" + 0.001*"nubo" + 0.001*"mortifer" + 0.001*"uere" + 0.001*"angustus" + 0.001*"ultio"')]

In [36]:
for idx, topic in lda_tfidf_model.print_topics(num_words=10):
    print(f"Topic #{idx + 1}: {topic}")

Topic #1: 0.002*"princeps" + 0.001*"temere" + 0.001*"lis" + 0.001*"mensa" + 0.001*"classis" + 0.001*"actus" + 0.001*"plebs" + 0.001*"lacero" + 0.001*"epulae" + 0.001*"forum"
Topic #2: 0.001*"uoltus" + 0.001*"eruo" + 0.001*"aspectus" + 0.001*"maereo" + 0.001*"bustum" + 0.001*"classis" + 0.001*"decem" + 0.001*"uolucer" + 0.001*"colus" + 0.001*"fragor"
Topic #3: 0.001*"peruro" + 0.001*"paeniteo" + 0.001*"profugio" + 0.001*"furiosus" + 0.001*"carina" + 0.001*"nubo" + 0.001*"mortifer" + 0.001*"uere" + 0.001*"angustus" + 0.001*"ultio"


In [42]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_tfidf_model, corpus_tfidf, dictionary)

In [43]:
document_topics = []
for i, bow in enumerate(corpus_tfidf):
    topic_dist = lda_tfidf_model.get_document_topics(bow)
    document_topics.append((processed_corpus[i]["title"], topic_dist))

for title, topics in document_topics:
    # Get topic with highest probability
    top_topic = max(topics, key=lambda x: x[1])
    print(f"{title:<20} → Topic #{top_topic[0] + 1} (weight: {top_topic[1]:.3f})")
   
#get topic distribution for each play
for title, topics in document_topics:
    print(f"\n{title}")
    for topic_id, weight in sorted(topics, key=lambda x: -x[1]):
        print(f"  Topic #{topic_id + 1}: {weight:.3f}")

Phoenissae           → Topic #2 (weight: 0.955)
Troades              → Topic #2 (weight: 0.960)
Phaedra              → Topic #1 (weight: 0.967)
Agamemnon            → Topic #1 (weight: 0.962)
Hercules Furens      → Topic #2 (weight: 0.965)
Medea                → Topic #3 (weight: 0.960)
Octavia              → Topic #1 (weight: 0.940)
Oedipus              → Topic #2 (weight: 0.965)
Thyestes             → Topic #1 (weight: 0.961)
Hercules Oetaeus     → Topic #2 (weight: 0.958)
Ecerinis             → Topic #1 (weight: 0.951)

Phoenissae
  Topic #2: 0.955
  Topic #1: 0.023
  Topic #3: 0.022

Troades
  Topic #2: 0.960
  Topic #1: 0.020
  Topic #3: 0.019

Phaedra
  Topic #1: 0.967
  Topic #2: 0.017
  Topic #3: 0.016

Agamemnon
  Topic #1: 0.962
  Topic #2: 0.019
  Topic #3: 0.019

Hercules Furens
  Topic #2: 0.965
  Topic #1: 0.018
  Topic #3: 0.017

Medea
  Topic #3: 0.960
  Topic #2: 0.020
  Topic #1: 0.020

Octavia
  Topic #1: 0.940
  Topic #2: 0.031
  Topic #3: 0.029

Oedipus
  Topic #2:

In [54]:
hdp_model = models.HdpModel(corpus=corpus, id2word=dictionary)
hdp_model.show_topics()[:5]

[(0,
  '0.004*mensa + 0.004*miseria + 0.004*numquid + 0.003*cibus + 0.003*aenus + 0.003*peruro + 0.003*amputo + 0.003*asper + 0.003*uoluptas + 0.003*festum + 0.003*ultrix + 0.003*attingo + 0.003*punio + 0.003*carina + 0.003*paeniteo + 0.003*uindico + 0.003*egomet + 0.003*profugio + 0.003*nubo + 0.003*uere'),
 (1,
  '0.013*uoltus + 0.007*axis + 0.006*uolucer + 0.006*maereo + 0.005*colus + 0.005*inquio + 0.005*uindico + 0.005*triumphus + 0.005*salto + 0.004*palla + 0.004*abrumpo + 0.004*deficio + 0.004*inuado + 0.004*lasso + 0.004*tepidus + 0.004*centum + 0.004*siccus + 0.004*quotus + 0.004*caelestis + 0.003*discutio'),
 (2,
  '0.004*saltus + 0.003*perpetuus + 0.003*remeo + 0.003*abscondo + 0.003*sanus + 0.003*reditus + 0.003*restituo + 0.003*uindico + 0.003*ratio + 0.003*ritus + 0.003*immitis + 0.003*pons + 0.003*uicina + 0.003*antrum + 0.003*torreo + 0.003*filum + 0.003*colligo + 0.003*caelebs + 0.003*sospes + 0.003*descendo'),
 (3,
  '0.004*aspectus + 0.004*uolucer + 0.004*aer + 0.004

In [55]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(hdp_model, corpus, dictionary)