# Results

## Importing libraries

In [4]:
import numpy as np
import pandas as pd
import pickle

## Retrieve Model

In [7]:
from pathlib import Path
import os
import sys

# Make the parent of the current working directory importable.
# Add more `.parent` hops if the notebook is nested deeper below the root.
root_path = str(Path(os.getcwd()).resolve().parent)

# Guard the insertion so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if root_path not in sys.path:
    sys.path.append(root_path)

# Imported only after the path tweak above.
# NOTE(review): presumably `model` lives in root_path and defines the class of
# the object unpickled later in this notebook — confirm.
import model

Load the model from a file

In [8]:
# Path is relative to the notebook's working directory.
MODEL_PATH = "trained_model/nips_1935_iteration_100.pkl"

# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(MODEL_PATH, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

## Results

In [8]:
# NOTE: intentionally disabled. This variant loops over the entire vocabulary
# (range(model.vocab_size)) for every topic and prints each word, which floods
# the output; print_top_words_per_topic prints only the top-N words instead.
# phi is indexed here as phi[topic_idx, word_id].

# def print_topic_word_distribution(model):
#     phi = model.calculate_phi_update()  # This gives you the word-topic matrix

#     for topic_idx in range(model.num_topics):
#         print(f"Topic #{topic_idx+1}:\n")
        
#         for word_id in range(model.vocab_size):
#             word_probability = phi[topic_idx, word_id]
#             word = model.id2word[word_id]
#             print(f"{word}: {word_probability:.4f}")
        
#         print("\n\n")  # Print a newline to separate topics

# print_topic_word_distribution(loaded_model)


In [11]:
def print_top_words_per_topic(model, top_n=10):
    """Print the ``top_n`` highest-probability words of every topic.

    Args:
        model: Object exposing ``calculate_phi_update()`` (returns a 2-D array
            that is indexed here as ``phi[topic, word_id]``), ``num_topics``
            and ``id2word`` (maps a word id to its string).
        top_n: How many words to list per topic. ``0`` prints only the topic
            headers.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    phi = model.calculate_phi_update()  # indexed below as phi[topic, word_id]

    for topic_idx in range(model.num_topics):
        print(f"<< Topic # {topic_idx+1} >>")

        # Sort descending first, then truncate. The former `[-top_n:]` slice
        # returned the whole vocabulary when top_n == 0, because -0 == 0.
        top_word_indices = phi[topic_idx].argsort()[::-1][:top_n]

        for word_id in top_word_indices:
            word_probability = phi[topic_idx, word_id]
            word = model.id2word[word_id]
            print(f"{word}: {word_probability:.4f}")

        print("\n")  # Leaves two blank lines between topics

# Show the ten highest-probability words of each topic for the unpickled model.
print_top_words_per_topic(loaded_model, top_n=10)


<< Topic # 1 >>
david: 0.0085
michael: 0.0074
andrew: 0.0063
john: 0.0056
alex: 0.0054
geoffrey: 0.0049
peter: 0.0047
richard: 0.0045
thomas: 0.0041
ilya: 0.0039


<< Topic # 2 >>
node: 0.0043
binary: 0.0039
graph: 0.0038
assign: 0.0038
group: 0.0036
edge: 0.0035
capture: 0.0033
identify: 0.0032
connect: 0.0032
partition: 0.0029


<< Topic # 3 >>
layer: 0.0057
architecture: 0.0055
deep: 0.0054
bengio: 0.0052
hinton: 0.0051
convolutional: 0.0043
sutskever: 0.0041
unit: 0.0039
activation: 0.0035
lecun: 0.0034


<< Topic # 4 >>
bayesian: 0.0038
posterior: 0.0037
likelihood: 0.0036
noise: 0.0031
inference: 0.0030
variance: 0.0030
dynamic: 0.0029
simulation: 0.0027
fit: 0.0024
equation: 0.0024


<< Topic # 5 >>
iid: 0.0040
sense: 0.0034
family: 0.0033
finite: 0.0033
uniform: 0.0031
turn: 0.0031
literature: 0.0029
establish: 0.0029
implies: 0.0029
distance: 0.0028


<< Topic # 6 >>
convex: 0.0076
descent: 0.0062
minimization: 0.0057
norm: 0.0049
regularization: 0.0045
dual: 0.0044
convexity:

### Extract words for each topic

In [12]:
# Row-normalise the model's word_topic_matrix (populated by Gibbs sampling,
# per the original note) so that every row sums to 1.
# NOTE(review): later cells index the result as word_topic_dist[topic_idx], so
# rows are presumably topics despite the attribute's name — confirm.
word_topic_matrix = loaded_model.word_topic_matrix
word_topic_sum = word_topic_matrix.sum(axis=1)[:, np.newaxis]

# Divide by a guarded copy of the row sums: an all-zero row would otherwise
# produce NaN (0 / 0), and NaN sorts last in argsort, i.e. it would be picked
# as a "top" entry downstream. Uses the same epsilon as the author-topic
# normalisation; word_topic_sum itself keeps the raw sums.
word_topic_denominator = np.where(word_topic_sum == 0, 1e-10, word_topic_sum)
word_topic_dist = word_topic_matrix / word_topic_denominator

In [13]:
# Number of highest-valued words to keep per topic.
N_TOP_WORDS = 50

# ALL_TOPIC_WORDS[t] holds the words with the N_TOP_WORDS largest values in
# row t of word_topic_dist, largest first.
# NOTE(review): this treats the rows of word_topic_dist as topics — confirm
# against the shape of loaded_model.word_topic_matrix.
ALL_TOPIC_WORDS = [
    [
        loaded_model.id2word[word_id]
        for word_id in word_topic_dist[topic_idx].argsort()[-N_TOP_WORDS:][::-1]
    ]
    for topic_idx in range(loaded_model.num_topics)
]

In [14]:
# Count how many times each word occurs across the per-topic top-word lists.
word_dic = {}

for words_of_topic in ALL_TOPIC_WORDS:
    for token in words_of_topic:
        # First sighting starts from 0, so the stored count becomes 1.
        word_dic[token] = word_dic.get(token, 0) + 1

In [17]:
# word_dic

### Visualize the author-topic distribution

In [15]:
# Normalize the author_topic_matrix to get the author-topic distribution.

# Row totals, kept as a column so they broadcast against the matrix.
author_topic_sum = loaded_model.author_topic_matrix.sum(axis=1)[:, np.newaxis]

# Swap zero totals for a small epsilon so all-zero rows divide to 0, not NaN.
# np.where builds a floating-point result when the sums are integers; the
# previous in-place assignment would silently truncate epsilon to 0 on an
# integer array and leave the zero divisors in place.
epsilon = 1e-10
author_topic_sum = np.where(author_topic_sum == 0, epsilon, author_topic_sum)

# Perform element-wise division
author_topic_dist = loaded_model.author_topic_matrix / author_topic_sum

# Collect the ids of the N_TOP_TOPICS highest-valued topics per author, best
# first. One entry is produced per element of loaded_model.authors.
# NOTE: a row that is entirely zero has no meaningful ranking; the ids returned
# for such a row are only an artifact of how argsort orders ties.
N_TOP_TOPICS = 3
top_topics_list = [
    author_topic_dist[author_idx].argsort()[-N_TOP_TOPICS:][::-1]
    for author_idx, _ in enumerate(loaded_model.authors)
]

In [17]:
# One row per author, pairing each name with its array of top topic ids.
top_topics_of_authors_df = pd.DataFrame(
    {
        'authors': loaded_model.authors,
        'topics': top_topics_list,
    }
)

# Preview the first 20 authors.
top_topics_of_authors_df.head(20)

Unnamed: 0,authors,topics
0,Sebastian Stober,"[2, 3, 1]"
1,Daniel J. Cameron,"[9, 8, 7]"
2,Jessica A. Grahn,"[9, 8, 7]"
3,Aurel A. Lazar,"[3, 4, 9]"
4,Yevgeniy Slutskiy,"[9, 8, 7]"
5,Chen-Yu Wei,"[8, 4, 5]"
6,Yi-Te Hong,"[9, 8, 7]"
7,Chi-Jen Lu,"[9, 8, 7]"
8,Katherine A. Heller,"[2, 3, 5]"
9,David B. Dunson,"[3, 4, 9]"


In [16]:
# Look up the top topics recorded for a single author.
is_target_author = top_topics_of_authors_df['authors'] == 'Hongyuan Zha'
top_topics_of_authors_df[is_target_author]


Unnamed: 0,authors,topics
655,Hongyuan Zha,"[9, 6, 0]"
