# Results

## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import pickle

## Retrieve Model

In [2]:
from pathlib import Path
import os
import sys

# Make the parent of the current working directory importable so that the
# project-local `model` module (imported below) can be found.
# Add or remove `.parent` calls to match how deeply this notebook is nested.
root_path = str(Path.cwd().resolve().parent)
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if root_path not in sys.path:
    sys.path.append(root_path)

import model

Load the trained model from its pickle file.

In [3]:
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load model files that come from a trusted source.
# The path is relative, so it is resolved against the current working directory.
model_path = "trained_model/ablima_model_200_iteration.pkl"
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

## Results

In [4]:
# def print_topic_word_distribution(model):
#     phi = model.calculate_phi_update()  # This gives you the word-topic matrix

#     for topic_idx in range(model.num_topics):
#         print(f"Topic #{topic_idx+1}:\n")
        
#         for word_id in range(model.vocab_size):
#             word_probability = phi[topic_idx, word_id]
#             word = model.id2word[word_id]
#             print(f"{word}: {word_probability:.4f}")
        
#         print("\n\n")  # Print a newline to separate topics

# print_topic_word_distribution(loaded_model)


In [5]:
def print_top_words_per_topic(model, top_n=10):
    """Print the ``top_n`` most probable words of every topic.

    For each topic a header line is printed, followed by one
    ``word: probability`` line per selected word (highest probability first,
    four decimal places) and a blank separator.

    Args:
        model: Object exposing ``calculate_phi_update()``, ``num_topics`` and
            ``id2word``. The matrix returned by ``calculate_phi_update()`` is
            indexed as ``phi[topic_idx, word_id]`` (one row per topic).
        top_n: Number of words to print per topic. ``0`` prints only the topic
            headers; a value larger than the row length prints every word.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    phi = model.calculate_phi_update()  # topic-word matrix: phi[topic, word]

    for topic_idx in range(model.num_topics):
        print(f"<< Topic # {topic_idx+1} >>")

        # Sort descending first, then truncate. The previous form
        # `argsort()[-top_n:][::-1]` selected *every* word when top_n == 0,
        # because `[-0:]` is the full slice.
        top_word_indices = phi[topic_idx].argsort()[::-1][:top_n]

        for word_id in top_word_indices:
            word_probability = phi[topic_idx, word_id]
            word = model.id2word[word_id]
            print(f"{word}: {word_probability:.4f}")

        print("\n")  # Blank lines to separate topics

# Print the top words (default top_n=10) of every topic in the model loaded from the pickle file.
print_top_words_per_topic(loaded_model)


<< Topic # 1 >>
cause: 0.0148
problem: 0.0091
doctor: 0.0084
patient: 0.0084
effect: 0.0082
medical: 0.0076
good: 0.0071
result: 0.0068
disease: 0.0066
medicine: 0.0063


<< Topic # 2 >>
people: 0.0095
fbi: 0.0092
gun: 0.0088
government: 0.0081
koresh: 0.0073
law: 0.0072
child: 0.0069
state: 0.0066
day: 0.0066
evidence: 0.0061


<< Topic # 3 >>
game: 0.0261
play: 0.0143
team: 0.0141
player: 0.0115
win: 0.0090
year: 0.0090
hit: 0.0082
baseball: 0.0079
score: 0.0070
fan: 0.0070


<< Topic # 4 >>
file: 0.0139
image: 0.0113
graphic: 0.0108
format: 0.0106
available: 0.0091
version: 0.0089
ftp: 0.0085
convert: 0.0083
color: 0.0076
look: 0.0073


<< Topic # 5 >>
really: 0.0158
people: 0.0137
thing: 0.0126
good: 0.0123
tell: 0.0095
post: 0.0079
look: 0.0078
let: 0.0075
point: 0.0074
read: 0.0073


<< Topic # 6 >>
hand: 0.0068
away: 0.0066
little: 0.0063
ask: 0.0060
later: 0.0059
start: 0.0058
head: 0.0056
saw: 0.0056
day: 0.0053
leave: 0.0053


<< Topic # 7 >>
problem: 0.0064
small: 0.0058
typ

### Extract the top words for each topic

In [6]:
# Row-normalize the count matrix stored on the loaded model so that each row
# sums to 1. Rows are indexed by topic in the cells below
# (`word_topic_dist[i]` for `i in range(num_topics)`).
# NOTE(review): the attribute is named `word_topic_matrix` but is used with one
# row per topic — confirm the orientation against the model class.
word_topic_matrix = loaded_model.word_topic_matrix
word_topic_sum = word_topic_matrix.sum(axis=1)[:, np.newaxis]

# Divide only where the row sum is non-zero; rows with no counts stay all-zero
# instead of turning into NaN (and emitting a divide-by-zero warning).
word_topic_dist = np.divide(
    word_topic_matrix,
    word_topic_sum,
    out=np.zeros(word_topic_matrix.shape, dtype=float),
    where=word_topic_sum != 0,
)

In [7]:
# Collect, for every topic, its N_TOP_WORDS highest-weighted words
# (highest weight first) as a list of word strings.
N_TOP_WORDS = 50

ALL_TOPIC_WORDS = [
    [
        loaded_model.id2word[word_id]
        # Descending order of weight, truncated to the first N_TOP_WORDS entries.
        for word_id in np.argsort(word_topic_dist[topic_idx])[::-1][:N_TOP_WORDS]
    ]
    for topic_idx in range(loaded_model.num_topics)
]

In [8]:
# Count in how many topic top-word lists each word appears.
word_dic = {}

for topic_words in ALL_TOPIC_WORDS:
    for word in topic_words:
        # Missing words start from 0, so the first sighting stores 1.
        word_dic[word] = word_dic.get(word, 0) + 1

In [9]:
# word_dic

### Visualize the author-topic distribution

In [10]:
# Normalize the author_topic_matrix row by row to get the author-topic
# distribution (row i belongs to loaded_model.authors[i]).

# Row sums as a column vector so the division below broadcasts per row.
# Cast to float first: if the sums had an integer dtype, assigning the tiny
# epsilon below would be truncated to 0 and the zero-sum guard would silently
# do nothing.
author_topic_sum = loaded_model.author_topic_matrix.sum(axis=1)[:, np.newaxis].astype(float)

# Replace zero sums with a small epsilon so rows without any counts divide
# to all zeros instead of NaN.
epsilon = 1e-10
author_topic_sum[author_topic_sum == 0] = epsilon

# Element-wise division; each non-empty row now sums to 1.
author_topic_dist = loaded_model.author_topic_matrix / author_topic_sum

# Collect the indices of the N_TOP_TOPICS highest-weighted topics for each
# author, highest weight first.
N_TOP_TOPICS = 3
top_topics_list = []
for i, author in enumerate(loaded_model.authors):
    top_topics_idx = author_topic_dist[i].argsort()[-N_TOP_TOPICS:][::-1]
    top_topics_list.append(top_topics_idx)

In [11]:
# One row per author: the author string and the array of their top topic indices.
top_topics_of_authors_df = pd.DataFrame(
    dict(authors=loaded_model.authors, topics=top_topics_list)
)
# Preview the first 20 rows.
top_topics_of_authors_df.head(20)

Unnamed: 0,authors,topics
0,lerxst@wam.umd.edu (where's my thing),"[11, 14, 18]"
1,guykuo@carson.u.washington.edu (Guy Kuo),"[7, 4, 10]"
2,twillis@ec.ecn.purdue.edu (Thomas E Willis),"[4, 19, 18]"
3,jgreen@amber (Joe Green),"[13, 4, 5]"
4,jcm@head-cfa.harvard.edu (Jonathan McDowell),"[14, 5, 0]"
5,dfo@vttoulu.tko.vtt.fi (Foxvog Douglas),"[0, 4, 13]"
6,bmdelane@quads.uchicago.edu (brian manning del...,"[1, 14, 16]"
7,bgrubb@dante.nmsu.edu (GRUBB),"[8, 4, 13]"
8,holmes7000@iscsvax.uni.edu,"[19, 6, 1]"
9,kerr@ux1.cso.uiuc.edu (Stan Kerr),"[3, 4, 19]"


In [12]:
# Show the row(s) whose 'authors' value exactly matches this author string.
author_mask = top_topics_of_authors_df['authors'].eq('lmvec@westminster.ac.uk (William Hargreaves)')
top_topics_of_authors_df.loc[author_mask]


Unnamed: 0,authors,topics
2079,lmvec@westminster.ac.uk (William Hargreaves),"[8, 19, 11]"
