# ADMAGD - Results

## Importing libraries

In [14]:
import numpy as np
import pandas as pd

## Retrieve Model

In [15]:
# Base name (no directory, no extension) of the serialized model to load.
model_file_name = "tfidf_train_extra_stopwords_admagd_model"

In [16]:
# Relative path to the serialized model, without a file extension; the
# loading cell appends the extension itself.
# NOTE(review): the directory name "trained_ model" contains a space —
# confirm this matches the folder name on disk.
model_path = f"trained_ model/{model_file_name}"

### Pickle

In [17]:
import pickle

Load the model from a file

In [18]:
# Deserialize the trained model object from "<model_path>.pkl".
# NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
with open(f"{model_path}.pkl", 'rb') as f:
    loaded_model = pickle.load(f)

### joblib

In [19]:
# from joblib import load

Load the model from a file

In [20]:
# loaded_model_joblib = load(f"{model_path}.joblib")

## Results

In [21]:
# def print_topic_word_distribution(model):
#     phi = model.calculate_phi_update()  # This gives you the word-topic matrix

#     for topic_idx in range(model.num_topics):
#         print(f"Topic #{topic_idx+1}:\n")
        
#         for word_id in range(model.vocab_size):
#             word_probability = phi[topic_idx, word_id]
#             word = model.id2word[word_id]
#             print(f"{word}: {word_probability:.4f}")
        
#         print("\n\n")  # Print a newline to separate topics

# print_topic_word_distribution(loaded_model)


In [22]:
def print_top_words_per_topic(model, top_n=10):
    """Print the ``top_n`` highest-weighted words of every topic.

    Words are ranked per topic by their value in the matrix returned by
    ``model.calculate_phi_update()``, highest first.

    Args:
        model: Object exposing ``calculate_phi_update()`` (its result is
            indexed as ``phi[topic_idx, word_id]``), ``num_topics`` and
            ``id2word`` (looked up by word id to obtain the word).
        top_n: Number of words to list per topic. ``0`` prints only the
            topic headers.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    phi = model.calculate_phi_update()  # topic-word matrix: one row per topic

    for topic_idx in range(model.num_topics):
        print(f"<< Topic # {topic_idx+1} >>")

        # Sort descending first, then truncate. Slicing with [-top_n:] would
        # return *every* word for top_n == 0, because -0 == 0.
        top_word_indices = phi[topic_idx].argsort()[::-1][:top_n]

        for word_id in top_word_indices:
            word_probability = phi[topic_idx, word_id]
            word = model.id2word[word_id]
            print(f"{word}: {word_probability:.4f}")

        print("\n")  # Blank lines to separate topics

# After running your model...
# print_top_words_per_topic(loaded_model, 25)


### Extract words for each topic

In [23]:
# After you've run Gibbs sampling
# Row-normalise the matrix stored on the fitted model so that every row sums
# to 1. Rows are later indexed by topic (word_topic_dist[i] for each topic i).
# assumes word_topic_matrix is a NumPy ndarray — TODO confirm
word_topic_matrix = loaded_model.word_topic_matrix
word_topic_sum = word_topic_matrix.sum(axis=1)[:, np.newaxis]

# Guard against all-zero rows: 0/0 would produce NaN, which makes any
# argsort-based ranking of that row meaningless. np.where builds a new float
# array, so the epsilon is not truncated even when the matrix holds integers.
word_topic_sum = np.where(word_topic_sum == 0, 1e-10, word_topic_sum)
word_topic_dist = word_topic_matrix / word_topic_sum

In [24]:
# List the N highest-weighted words of every topic, best first
N_TOP_WORDS = 50

ALL_TOPIC_WORDS = []
for i in range(loaded_model.num_topics):
    # Full descending ranking of word ids for this topic, then keep the head
    ranked_idx = word_topic_dist[i].argsort()[::-1]
    top_words_idx = ranked_idx[:N_TOP_WORDS]

    # Translate word ids into their string form
    top_words = [loaded_model.id2word[idx] for idx in top_words_idx]
    ALL_TOPIC_WORDS.append(top_words)

    print(f"Topic {i + 1}: {', '.join(top_words)} \n")

Topic 1: look, car, good, year, new, buy, really, work, thing, way, lot, old, let, people, right, little, drive, post, tell, price, probably, problem, try, include, great, pay, number, read, ask, bike, point, run, big, money, sure, help, offer, deal, light, wrong, sell, kind, course, leave, email, sale, question, long, pretty, far 

Topic 2: work, problem, window, card, drive, run, driver, email, try, program, file, disk, look, good, windows, new, memory, software, support, set, version, video, thing, machine, mode, computer, post, help, buy, bit, monitor, ram, color, interested, tell, way, graphic, include, sure, read, bus, question, follow, different, write, change, ide, screen, controller, instal 

Topic 3: work, problem, run, try, thing, good, way, new, program, help, look, really, chip, bit, number, window, question, available, people, line, high, speed, set, application, follow, sure, change, able, place, sell, post, send, include, email, write, card, list, right, user, best, pos

### Visualize the author-topic distribution

In [25]:
# Normalize the author_topic_matrix row-wise to get each author's topic distribution

# Row sums as a column vector. Cast to float so that the epsilon substitution
# below is not silently truncated to 0 if the matrix has an integer dtype.
author_topic_sum = loaded_model.author_topic_matrix.sum(axis=1)[:, np.newaxis].astype(float)

# Replace zero sums with a small epsilon value to avoid division by zero
epsilon = 1e-10
author_topic_sum[author_topic_sum == 0] = epsilon

# Perform element-wise division (the column vector broadcasts across columns)
author_topic_dist = loaded_model.author_topic_matrix / author_topic_sum

# Collect the indices of the top N topics for each author, highest weight first
N_TOP_TOPICS = 2
top_topics_list = []
for i, author in enumerate(loaded_model.authors):
    # Reverse, then truncate: unlike [-N:][::-1] this yields an empty
    # selection (rather than all topics) when N_TOP_TOPICS is 0.
    top_topics_idx = author_topic_dist[i].argsort()[::-1][:N_TOP_TOPICS]
    top_topics_list.append(top_topics_idx)
    # print(f"Author {i+1} => {author} : Topic IDs {top_topics_idx} \n")

In [26]:
# One row per author, paired with that author's top topic indices
top_topics_of_authors_df = pd.DataFrame(
    {
        'authors': loaded_model.authors,
        'topics': top_topics_list,
    }
)
top_topics_of_authors_df

Unnamed: 0,authors,topics
0,lerxst@wam.umd.edu (where's my thing),"[9, 11]"
1,guykuo@carson.u.washington.edu (Guy Kuo),"[4, 1]"
2,twillis@ec.ecn.purdue.edu (Thomas E Willis),"[9, 15]"
3,jgreen@amber (Joe Green),"[16, 19]"
4,jcm@head-cfa.harvard.edu (Jonathan McDowell),"[3, 1]"
...,...,...
5764,bchuang@css.itd.umich.edu (Ben Chuang),"[5, 13]"
5765,shaig@composer.think.com (Shai Guday),"[15, 18]"
5766,mrj@cs.su.oz.au (Mark James),"[19, 18]"
5767,ebodin@pearl.tufts.edu,"[1, 19]"
