In [1]:
import numpy as np
import pandas as pd
from gensim import matutils, corpora
import pickle

In [2]:
def _read_pickle(path):
    """Unpickle and return the single object stored in the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Inputs produced by earlier steps of the pipeline: the trained LDA models,
# the bag-of-words corpus, and the per-act records.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that this project wrote itself.
acts_filtered_LDA_models = _read_pickle('acts_filtered_LDA_models.pkl')
acts_filtered_corpus = _read_pickle('acts_filtered_corpus.pkl')
processed_acts = _read_pickle('../../Acts/processed_acts.pkl')

# The gensim dictionary has its own on-disk format and loader.
acts_filtered_dictionary = corpora.Dictionary.load('acts_filtered_dictionary.gensim')

In [3]:
def _token_lookup(id2word):
    """Return a callable mapping an integer term id to its token (or None if unknown)."""
    token2id = getattr(id2word, "token2id", None)
    if token2id:
        # Invert token2id rather than reading id2token: gensim's Dictionary only
        # fills id2token lazily, so on a freshly built/loaded dictionary that
        # cache can be empty and every lookup would come back as None.
        id_to_token = {int(idx): tok for tok, idx in token2id.items()}
        return id_to_token.get

    def lookup(term_id):
        # Plain dict or list-like id -> token mapping.
        try:
            return id2word[term_id]
        except (KeyError, IndexError):
            return None

    return lookup


def relevance_keywords(model, corpus, dictionary=None, topn=10, lam=0.6, return_scores=False):
    """Rank each topic's terms by relevance and return the best ones per topic.

    Relevance is ``log(phi_kw) - (1 - lam) * log(p_w)``, which is algebraically
    ``lam * log(phi_kw) + (1 - lam) * log(phi_kw / p_w)``: ``lam=1`` ranks purely
    by the topic-term probability, lower values reward terms that are rare in
    the corpus overall.

    Parameters
    ----------
    model : object exposing ``get_topics()`` (a ``(K, V)`` array of topic-term
        probabilities) and ``id2word`` (an id -> token mapping).
    corpus : iterable of documents, each an iterable of ``(term_id, count)`` pairs.
        Used to estimate the marginal term probability ``p_w``.
    dictionary : ignored; kept for backward compatibility. Tokens are always
        resolved through ``model.id2word`` so ids and tokens cannot disagree.
    topn : maximum number of terms per topic. ``None`` returns every ranked
        term; zero or a negative value returns empty lists.
    lam : relevance weight (the lambda in the formula above).
    return_scores : if True, return ``(token, relevance)`` pairs instead of tokens.

    Returns
    -------
    dict mapping topic index ``k`` (0-based) to a list of tokens, or to a list
    of ``(token, score)`` pairs when ``return_scores`` is True.

    Raises
    ------
    ValueError
        If the corpus contains a term id outside ``[0, V)``.
    """
    phi = np.asarray(model.get_topics(), dtype=float)  # (K, V)
    num_topics, vocab_size = phi.shape

    # Corpus-wide term counts over the model's own vocabulary.
    term_freq = np.zeros(vocab_size, dtype=float)
    for bow in corpus:
        for term_id, count in bow:
            term_id = int(term_id)
            if not 0 <= term_id < vocab_size:
                raise ValueError(
                    f"corpus term id {term_id} is outside the model vocabulary (size {vocab_size})"
                )
            term_freq[term_id] += count

    # Terms that never occur in the corpus have no meaningful p(w). Substituting a
    # tiny epsilon would give them an enormous lift and float them to the top of
    # every topic, so they are excluded from the ranking instead.
    observed = term_freq > 0.0
    log_pw = np.zeros(vocab_size, dtype=float)
    if observed.any():
        log_pw[observed] = np.log(term_freq[observed] / term_freq.sum())

    log_phi = np.log(np.clip(phi, 1e-12, None))  # clip avoids log(0)
    relevance = log_phi - (1.0 - lam) * log_pw[np.newaxis, :]
    relevance[:, ~observed] = -np.inf

    get_tok = _token_lookup(model.id2word)
    limit = None if topn is None else int(topn)

    out = {}
    for k in range(num_topics):
        ranked = []
        if limit is None or limit > 0:
            # Stable sort keeps tied terms in vocabulary order (deterministic output).
            for i in np.argsort(-relevance[k], kind="stable"):
                i = int(i)
                if not observed[i]:
                    break  # excluded terms sort last; nothing rankable remains
                tok = get_tok(i)
                if tok is None:
                    continue
                ranked.append((tok, float(relevance[k, i])))
                if len(ranked) == limit:
                    break
        out[k] = ranked if return_scores else [tok for tok, _ in ranked]
    return out

In [4]:
# Analysis settings: which trained model to use and how to rank its keywords.
acts_filtered_ideal_topic_num = 21
acts_filtered_model = acts_filtered_LDA_models[acts_filtered_ideal_topic_num]
num_keywords = 10
lam = 0.6

# Keywords per topic; pass return_scores=True instead to inspect the scores.
rel_kws = relevance_keywords(
    model=acts_filtered_model,
    corpus=acts_filtered_corpus,
    dictionary=acts_filtered_dictionary,
    topn=num_keywords,
    lam=lam,
    return_scores=False,
)

In [5]:
# One line per topic, labelled 1..K for readability (rel_kws keys are 0-based).
print(f"Top {num_keywords} keywords per topic (lambda={lam}):")
for topic_id in rel_kws:
    keywords = rel_kws[topic_id]
    print(f"Topic {topic_id + 1}: {', '.join(keywords)}")

Top 10 keywords per topic (lambda=0.6):
Topic 1: lis, homo, tyrannus, subicio, finis, error, uis, servo, exacuo, omnipotens
Topic 2: suscito, abstineo, princeps, uiolentum, udus, iactatus, concremo, illucesco, consido, deditus
Topic 3: sentio, insuetus, coerceo, biga, insulto, remeo, suscipio, nereis, recolligo, relevo
Topic 4: spiro, iudex, classis, leuo, uinum, tuba, sors, fumus, lamento, ide
Topic 5: nupta, nurus, altrix, cognatus, neo, famula, amarus, cornu, amictus, gremium
Topic 6: accerso, grauida, uetitus, introitus, exerceo, dissideo, furialis, averto, labrum, absolo
Topic 7: puer, nepos, reueho, carina, adicio, extremus, extimesco, murus, aurum, exul
Topic 8: astrum, uoltus, ferio, orbis, pestis, sono, arcus, gemo, dextera, laus
Topic 9: afflico, coruscus, profusus, inauspicatus, cunctor, retento, congredior, occultum, discordo, perdendus
Topic 10: currus, fletus, uotum, sidus, fessus, condo, recipio, colo, somnus, excipio
Topic 11: grandis, exagito, detego, luo, forum, scand

In [6]:
# Must run BEFORE building topic_to_acts: document i of the corpus is later
# paired with processed_acts[i], so the two sequences have to be the same length.
assert len(acts_filtered_corpus) == len(processed_acts), (
    f"Mismatch: corpus={len(acts_filtered_corpus)} vs processed_acts={len(processed_acts)}"
)

K = acts_filtered_model.num_topics
print(f"[INFO] num_topics K={K}, docs={len(acts_filtered_corpus)}")

# Peek at the topic distribution of the first document, requesting every topic.
first_bow = acts_filtered_corpus[0]
tmp = acts_filtered_model.get_document_topics(first_bow, minimum_probability=0.0)
print(f"[INFO] first doc dense dist len={len(tmp)} (expect {K})")


[INFO] num_topics K=21, docs=56
[INFO] first doc dense dist len=21 (expect 21)


In [7]:
# --- Top 5 acts per topic -------------------------
K = acts_filtered_model.num_topics  # number of topics in the trained LDA model

# Score every act against every topic.
# get_document_topics returns a list of (topic_id, weight) pairs. gensim clamps
# minimum_probability to a tiny positive floor, so even when 0.0 is requested a
# topic whose weight falls below that floor can be missing from the list.
# Missing topics are therefore recorded explicitly with weight 0.0, which
# guarantees that every topic list holds exactly one entry per act.
topic_to_acts = {k: [] for k in range(K)}
for i, bow in enumerate(acts_filtered_corpus):
    full_dist = acts_filtered_model.get_document_topics(bow, minimum_probability=0.0)
    weight_by_topic = dict(full_dist)
    act_id = processed_acts[i]["id"]
    for k in range(K):
        topic_to_acts[k].append((act_id, float(weight_by_topic.get(k, 0.0))))

# Keep only the 5 highest-weighted acts per topic. Each entry is (act_id, weight);
# the sort is stable, so acts with equal weight stay in corpus order.
top5_by_topic = {
    k: sorted(v, key=lambda t: t[1], reverse=True)[:5]
    for k, v in topic_to_acts.items()
}


In [8]:
# Dump the ranked (act_id, weight) pairs for each topic (0-based topic index).
for k in range(K):
    ranked_acts = top5_by_topic[k]
    print(f"[TOP5] topic {k}: {ranked_acts}")

[TOP5] topic 0: [('seneca-octavia_act2', 0.9985973834991455), ('mussato-ecerinis_act2', 0.9971780180931091), ('mussato-ecerinis_act3', 0.2429867833852768), ('seneca-troades_act2', 0.01885152794420719), ('seneca-octavia_act4', 0.0004982870304957032)]
[TOP5] topic 1: [('seneca-phoenissae_act2', 0.9931109547615051), ('seneca-octavia_act4', 0.9880447387695312), ('seneca-hercules-furens_act3', 0.010152732022106647), ('seneca-agamemnon_act3', 0.0035550412721931934), ('seneca-octavia_act6', 0.0004027545510325581)]
[TOP5] topic 2: [('seneca-phaedra_act1', 0.9989547729492188), ('seneca-agamemnon_act4', 0.9950188994407654), ('seneca-octavia_act5', 0.611312210559845), ('seneca-thyestes_act1', 0.047320447862148285), ('seneca-oedipus_act2', 0.0381777286529541)]
[TOP5] topic 3: [('seneca-troades_act4', 0.9983115792274475), ('mussato-ecerinis_act1', 0.6200554370880127), ('seneca-thyestes_act4', 0.17086420953273773), ('seneca-troades_act1', 0.11220882833003998), ('seneca-agamemnon_act2', 0.07081460952

In [9]:
# Build the spreadsheet rows: one row per (topic, keyword).
# "Topic Number" is the 0-based topic index shifted by +1 so labels run 1..K.
rows = []
for topic_id in range(acts_filtered_ideal_topic_num):
    # Defensive: the model may expose fewer topics than the configured number.
    if topic_id not in rel_kws:
        print(f"[WARN] rel_kws missing topic {topic_id}; skipping")
        continue

    # Pre-format the (up to 5) top-ranked acts of this topic as "act_id (weight)".
    top_acts = top5_by_topic.get(topic_id, [])
    act_labels = [f"{act_id} ({w:.3f})" for act_id, w in top_acts]

    for j, token in enumerate(rel_kws[topic_id]):
        # The j-th keyword row carries the j-th act label; later rows get "".
        play_act = act_labels[j] if j < len(act_labels) else ""
        rows.append({
            "Topic Number": topic_id + 1,
            "Play_Act": play_act,
            "Latin Keywords": token,
            # The remaining columns are filled in by hand afterwards.
            "English Keywords": "",
            "Themes": "",
            "Topic Name": "",
        })

df = pd.DataFrame(
    rows,
    columns=["Topic Number", "Play_Act", "Latin Keywords", "English Keywords", "Themes", "Topic Name"],
)



In [10]:
# Sanity checks on the rows / DataFrame built above.

# Guard against an empty result so the check reports the problem instead of
# failing with an IndexError.
if rows:
    print("Row keys sample:", rows[0].keys())
else:
    print("[WARN] no rows were built; check rel_kws and top5_by_topic")

# Rows that carry an "act_id (weight)" label.
filled = [r for r in rows if r["Play_Act"]]
print(f"[CHECK] rows with Play-Act filled: {len(filled)} of {len(rows)}")
print("[CHECK] sample 5:", filled[:5])

# Same count taken from the DataFrame; it should match len(filled).
nonempty_count = (df["Play_Act"] != "").sum()
# The column name is written literally: reusing the enclosing quote character
# inside an f-string expression only parses on Python 3.12+.
print(f"[CHECK] DataFrame non-empty 'Play_Act': {nonempty_count}")

Row keys sample: dict_keys(['Topic Number', 'Play_Act', 'Latin Keywords', 'English Keywords', 'Themes', 'Topic Name'])
[CHECK] rows with Play-Act filled: 105 of 210
[CHECK] sample 5: [{'Topic Number': 1, 'Play_Act': 'seneca-octavia_act2 (0.999)', 'Latin Keywords': 'lis', 'English Keywords': '', 'Themes': '', 'Topic Name': ''}, {'Topic Number': 1, 'Play_Act': 'mussato-ecerinis_act2 (0.997)', 'Latin Keywords': 'homo', 'English Keywords': '', 'Themes': '', 'Topic Name': ''}, {'Topic Number': 1, 'Play_Act': 'mussato-ecerinis_act3 (0.243)', 'Latin Keywords': 'tyrannus', 'English Keywords': '', 'Themes': '', 'Topic Name': ''}, {'Topic Number': 1, 'Play_Act': 'seneca-troades_act2 (0.019)', 'Latin Keywords': 'subicio', 'English Keywords': '', 'Themes': '', 'Topic Name': ''}, {'Topic Number': 1, 'Play_Act': 'seneca-octavia_act4 (0.000)', 'Latin Keywords': 'finis', 'English Keywords': '', 'Themes': '', 'Topic Name': ''}]
[CHECK] DataFrame non-empty 'Play_Act': 105


In [11]:
# Derive the file name from the configured topic count and lambda so that a run
# with different settings cannot silently overwrite another run's export.
# With 21 topics and lam=0.6 this yields "21acts_topics_keywords_lambda06.csv".
lam_tag = str(lam).replace(".", "")
csv_path = f"../../Acts/acts_csvs/{acts_filtered_ideal_topic_num}acts_topics_keywords_lambda{lam_tag}.csv"
df.to_csv(csv_path, index=False)
print(f"Wrote {csv_path} with {len(df)} rows.")

Wrote ../../Acts/acts_csvs/21acts_topics_keywords_lambda06.csv with 210 rows.


In [12]:
# Pair each act id with its topic distribution: a list of (topic_id, weight)
# tuples as returned by the model with its default probability threshold.
acts_filtered_document_topics = []
for i, bow in enumerate(acts_filtered_corpus):
    topic_dist = acts_filtered_model.get_document_topics(bow)
    acts_filtered_document_topics.append((processed_acts[i]["id"], topic_dist))

# Dominant topic per act (topic labels are shown 1-based).
for title, topics in acts_filtered_document_topics:
    best_id, best_weight = max(topics, key=lambda pair: pair[1])
    print(f"{title:<20} → Filtered Topic #{best_id + 1} (weight: {best_weight:.3f})")

# Three strongest topics per act, heaviest first.
for title, topics in acts_filtered_document_topics:
    print(f"\n{title}")
    top_three = sorted(topics, key=lambda pair: pair[1], reverse=True)[:3]
    for topic_id, weight in top_three:
        print(f"  Filtered Topic #{topic_id + 1}: {weight:.3f}")

seneca-agamemnon_act1 → Filtered Topic #13 (weight: 0.997)
seneca-agamemnon_act2 → Filtered Topic #12 (weight: 0.826)
seneca-agamemnon_act3 → Filtered Topic #12 (weight: 0.337)
seneca-agamemnon_act4 → Filtered Topic #3 (weight: 0.995)
seneca-agamemnon_act5 → Filtered Topic #13 (weight: 0.871)
seneca-hercules-furens_act1 → Filtered Topic #10 (weight: 0.999)
seneca-hercules-furens_act2 → Filtered Topic #12 (weight: 0.507)
seneca-hercules-furens_act3 → Filtered Topic #12 (weight: 0.540)
seneca-hercules-furens_act4 → Filtered Topic #8 (weight: 0.660)
seneca-hercules-furens_act5 → Filtered Topic #8 (weight: 0.415)
seneca-hercules-oetaeus_act1 → Filtered Topic #8 (weight: 0.999)
seneca-hercules-oetaeus_act2 → Filtered Topic #8 (weight: 0.339)
seneca-hercules-oetaeus_act3 → Filtered Topic #8 (weight: 0.571)
seneca-hercules-oetaeus_act4 → Filtered Topic #8 (weight: 0.890)
seneca-hercules-oetaeus_act5 → Filtered Topic #8 (weight: 0.874)
seneca-medea_act1    → Filtered Topic #17 (weight: 0.711)
