### Topic Modeling with LDA after removing Named Entities (using TF-IDF)

In [1]:
from gensim import corpora, models
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
import pyLDAvis
import pyLDAvis.gensim_models

In [2]:
import pickle

# Load the processed corpus from the .pkl file
with open("../processed_corpus.pkl", "rb") as f:
    processed_corpus = pickle.load(f)

In [3]:
with open("../unique_named_entities.pkl", "rb") as f:
    unique_named_entities = pickle.load(f)

In [4]:
#filtered_tokens_lists = []          # we want a list of tokens per tragedy
#for tragedy in processed corpus:
  #  filtered_tokens = []       # initialize a list for the tragedy being iterated over
  #  for token in tragedy["tokens"]:
        #if token.lower() not in unique_named_entities:
          #  filtered_tokens.append(token)
   # filtered_token_lists.append(filtered_tokens)

# Can also be written as
# filtered_tokens_lists = []
# for tragedy in processed_corpus:
    # filtered = [token for token in tragedy["tokens"] if token.lower() not in unique_named_entities]  # make the if a list comprehension
    # filtered_tokens_lists.append(filtered)


#list comprehension within a list comprehension

filtered_token_lists = [
    [token for token in tragedy["tokens"] if token.lower() not in unique_named_entities]
    for tragedy in processed_corpus
]

In [5]:
# Extract token lists from processed_corpus
token_lists = [tragedy["tokens"] for tragedy in processed_corpus]

for i in range(11):
    print(len(token_lists[i])) #check length of tokens for each play before removing the named entities

2548
4502
5065
3918
5314
3784
3526
4197
4094
7280
2330


In [6]:
for i in range(11):
    print(len(filtered_token_lists[i])) #check length of tokens for each play

1881
3130
3866
2908
3965
2756
2552
3212
3100
5138
1660


In [7]:
# Step 1: Create dictionary and BoW corpus from processed tokens

# Create a dictionary from the tokenized texts
dictionary = corpora.Dictionary(filtered_token_lists)

# Filter out extreme tokens (optional)
dictionary.filter_extremes(no_below=2) #no_below=2 removes words that appear in fewer than 2 plays #no_above=0.5 removes words that appear in more than 50% of the plays

# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(tokens) for tokens in filtered_token_lists]

In [8]:
pickle.dump(corpus, open('ner_corpus.pkl', 'wb'))
dictionary.save('ner_dictionary.gensim')

In [9]:
# Step 2: Apply TF-IDF model to BoW corpus
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]

# Step 3: Run LDA on the TF-IDF weighted corpus
lda_tfidf_model = models.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=3,
    passes=20
)

In [10]:
lda_tfidf_model.show_topics()

[(0,
  '0.001*"uoltus" + 0.001*"remeo" + 0.001*"curuus" + 0.001*"uirus" + 0.001*"periclum" + 0.001*"classis" + 0.001*"paeniteo" + 0.001*"altrix" + 0.001*"mortifer" + 0.001*"uindico"'),
 (1,
  '0.001*"lis" + 0.001*"mensa" + 0.001*"actus" + 0.001*"forum" + 0.001*"uinum" + 0.001*"cibus" + 0.001*"plebs" + 0.001*"triplex" + 0.001*"rabies" + 0.001*"excludo"'),
 (2,
  '0.002*"princeps" + 0.001*"aspectus" + 0.001*"eruo" + 0.001*"classis" + 0.001*"bustum" + 0.001*"uolucer" + 0.001*"ferax" + 0.001*"pestifer" + 0.001*"maereo" + 0.001*"animosus"')]

In [11]:
for idx, topic in lda_tfidf_model.print_topics(num_words=10):
    print(f"Topic #{idx + 1}: {topic}")

Topic #1: 0.001*"uoltus" + 0.001*"remeo" + 0.001*"curuus" + 0.001*"uirus" + 0.001*"periclum" + 0.001*"classis" + 0.001*"paeniteo" + 0.001*"altrix" + 0.001*"mortifer" + 0.001*"uindico"
Topic #2: 0.001*"lis" + 0.001*"mensa" + 0.001*"actus" + 0.001*"forum" + 0.001*"uinum" + 0.001*"cibus" + 0.001*"plebs" + 0.001*"triplex" + 0.001*"rabies" + 0.001*"excludo"
Topic #3: 0.002*"princeps" + 0.001*"aspectus" + 0.001*"eruo" + 0.001*"classis" + 0.001*"bustum" + 0.001*"uolucer" + 0.001*"ferax" + 0.001*"pestifer" + 0.001*"maereo" + 0.001*"animosus"


In [12]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_tfidf_model, corpus_tfidf, dictionary)

In [13]:
document_topics = []
for i, bow in enumerate(corpus_tfidf):
    topic_dist = lda_tfidf_model.get_document_topics(bow)
    document_topics.append((processed_corpus[i]["title"], topic_dist))

for title, topics in document_topics:
    # Get topic with highest probability
    top_topic = max(topics, key=lambda x: x[1])
    print(f"{title:<20} → Topic #{top_topic[0] + 1} (weight: {top_topic[1]:.3f})")
   
#get topic distribution for each play
for title, topics in document_topics:
    print(f"\n{title}")
    for topic_id, weight in sorted(topics, key=lambda x: -x[1]):
        print(f"  Topic #{topic_id + 1}: {weight:.3f}")

Phoenissae           → Topic #3 (weight: 0.953)
Troades              → Topic #3 (weight: 0.959)
Phaedra              → Topic #1 (weight: 0.967)
Agamemnon            → Topic #1 (weight: 0.962)
Hercules Furens      → Topic #3 (weight: 0.963)
Medea                → Topic #1 (weight: 0.961)
Octavia              → Topic #3 (weight: 0.938)
Oedipus              → Topic #1 (weight: 0.964)
Thyestes             → Topic #2 (weight: 0.959)
Hercules Oetaeus     → Topic #1 (weight: 0.956)
Ecerinis             → Topic #2 (weight: 0.949)

Phoenissae
  Topic #3: 0.953
  Topic #1: 0.024
  Topic #2: 0.023

Troades
  Topic #3: 0.959
  Topic #1: 0.021
  Topic #2: 0.020

Phaedra
  Topic #1: 0.967
  Topic #3: 0.017
  Topic #2: 0.016

Agamemnon
  Topic #1: 0.962
  Topic #3: 0.019
  Topic #2: 0.019

Hercules Furens
  Topic #3: 0.963
  Topic #1: 0.019
  Topic #2: 0.018

Medea
  Topic #1: 0.961
  Topic #3: 0.019
  Topic #2: 0.019

Octavia
  Topic #3: 0.938
  Topic #1: 0.031
  Topic #2: 0.030

Oedipus
  Topic #1:

In [14]:
hdp_model = models.HdpModel(corpus=corpus, id2word=dictionary)
hdp_model.show_topics()[:5]

[(0,
  '0.003*cognatus + 0.003*mensa + 0.003*miseria + 0.003*curuus + 0.003*siccus + 0.002*discutio + 0.002*aenus + 0.002*paeniteo + 0.002*limes + 0.002*perfidus + 0.002*fungor + 0.002*mortifer + 0.002*lingua + 0.002*asper + 0.002*laurus + 0.002*cohors + 0.002*bos + 0.002*rictus + 0.002*fibra + 0.002*amputo'),
 (1,
  '0.014*uoltus + 0.007*axis + 0.007*uolucer + 0.007*maereo + 0.006*uindico + 0.006*colus + 0.005*triumphus + 0.004*inuado + 0.004*abrumpo + 0.004*deficio + 0.004*palla + 0.004*femineus + 0.004*intendo + 0.004*tepidus + 0.004*discutio + 0.004*caelestis + 0.004*siccus + 0.004*lasso + 0.004*vinco + 0.004*tingo'),
 (2,
  '0.004*saltus + 0.004*caelebs + 0.003*immitis + 0.003*peruius + 0.003*perpetuus + 0.003*uicina + 0.003*abscondo + 0.003*ritus + 0.003*ratio + 0.003*restituo + 0.003*sanus + 0.003*remeo + 0.003*reditus + 0.003*uindico + 0.003*pons + 0.003*filum + 0.003*asper + 0.003*acuo + 0.003*promitto + 0.003*vastus'),
 (3,
  '0.004*aspectus + 0.004*uolucer + 0.004*ferax + 0.