# ADMAGD - Results

## Importing libraries

In [17]:
import numpy as np
import pandas as pd

## Retrieve Model

In [26]:
# Base name of the serialized model; later cells build the on-disk path from it.
model_file_name = "tfidf_train_extra_stopwords_200_iteration_admagd_model"

In [27]:
# Extension-less path to the saved model; the loader cells append ".pkl" / ".joblib".
# NOTE(review): the directory name "trained_ model" contains a space after the
# underscore — confirm it matches the folder on disk.
model_path = f"trained_ model/{model_file_name}"

### Pickle

In [28]:
import pickle

Load the model from a file

In [29]:
# Unpickling executes arbitrary code embedded in the file, so only load
# model files that were produced by a trusted source.
pickle_path = f"{model_path}.pkl"
with open(pickle_path, mode='rb') as model_file:
    loaded_model = pickle.load(model_file)

### joblib

In [6]:
# from joblib import load

Load the model from a file

In [7]:
# loaded_model_joblib = load(f"{model_path}.joblib")

## Results

In [8]:
# def print_topic_word_distribution(model):
#     phi = model.calculate_phi_update()  # This gives you the word-topic matrix

#     for topic_idx in range(model.num_topics):
#         print(f"Topic #{topic_idx+1}:\n")
        
#         for word_id in range(model.vocab_size):
#             word_probability = phi[topic_idx, word_id]
#             word = model.id2word[word_id]
#             print(f"{word}: {word_probability:.4f}")
        
#         print("\n\n")  # Print a newline to separate topics

# print_topic_word_distribution(loaded_model)


In [9]:
def print_top_words_per_topic(model, top_n=10):
    """Print the ``top_n`` highest-probability words of every topic.

    For each topic a header line is printed, followed by one
    ``word: probability`` line per selected word (most probable first) and a
    blank separator.

    Args:
        model: Object exposing ``calculate_phi_update()`` (its result is
            indexed here as ``phi[topic_idx, word_id]``), ``num_topics`` and
            an ``id2word`` lookup from word id to word.
        top_n: Number of words to list per topic. ``0`` lists no words.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Indexed below as phi[topic, word], i.e. one row per topic.
    phi = model.calculate_phi_update()

    for topic_idx in range(model.num_topics):
        print(f"<< Topic # {topic_idx+1} >>")

        topic_row = phi[topic_idx]

        # Reverse the ascending argsort first and truncate afterwards.
        # Slicing with [-top_n:] would select *every* word when top_n == 0,
        # because -0 == 0.
        top_word_indices = topic_row.argsort()[::-1][:top_n]

        for word_id in top_word_indices:
            word_probability = topic_row[word_id]
            word = model.id2word[word_id]
            print(f"{word}: {word_probability:.4f}")

        print("\n")  # Print a newline to separate topics

# After running your model...
# print_top_words_per_topic(loaded_model, 25)


### Extract words for each topic

In [10]:
# After you've run Gibbs sampling: turn each row of the model's
# word_topic_matrix into proportions by dividing it by its row total.
word_topic_matrix = loaded_model.word_topic_matrix

# Row totals as a float column vector so they broadcast across each row.
word_topic_sum = word_topic_matrix.sum(axis=1)[:, np.newaxis].astype(float)

# A row that sums to zero would otherwise produce NaNs (0 / 0), and argsort
# ranks NaN as the largest value. Dividing by 1 instead leaves such a row
# unchanged (all zeros when the entries are non-negative counts).
word_topic_sum[word_topic_sum == 0] = 1.0

word_topic_dist = word_topic_matrix / word_topic_sum

In [11]:
# Collect and print the N_TOP_WORDS highest-weighted words of every topic.
N_TOP_WORDS = 50

ALL_TOPIC_WORDS = []
for topic_number in range(1, loaded_model.num_topics + 1):
    # Descending order of weight, truncated to the first N_TOP_WORDS ids.
    ranked_word_ids = word_topic_dist[topic_number - 1].argsort()[::-1][:N_TOP_WORDS]

    topic_vocabulary = []
    for word_id in ranked_word_ids:
        topic_vocabulary.append(loaded_model.id2word[word_id])
    ALL_TOPIC_WORDS.append(topic_vocabulary)

    joined_words = ', '.join(topic_vocabulary)
    print(f"Topic {topic_number}: {joined_words} \n")

Topic 1: window, run, file, problem, work, program, try, look, help, write, set, windows, way, thing, good, error, new, line, server, question, follow, code, application, version, color, machine, start, driver, manager, image, let, change, case, display, type, screen, appreciate, directory, create, user, tell, bit, client, support, different, people, number, example, command, email 

Topic 2: year, space, new, research, development, science, information, world, program, work, report, provide, cost, base, right, center, public, high, earth, place, send, available, number, people, date, plan, sure, start, orbit, international, follow, second, contact, money, project, april, end, include, limit, begin, day, write, press, national, material, technology, reach, increase, long, analysis 

Topic 3: good, look, car, year, thing, really, way, right, bike, work, try, new, drive, people, sure, turn, little, tell, point, line, problem, big, lot, day, far, dod, leave, power, probably, bit, bad, rid

In [12]:
# Count, for every word, in how many topics' top-word lists it appears.
# Kept as a plain dict so keys stay in first-seen order.
word_dic = {}

for topic_words in ALL_TOPIC_WORDS:
    for word in topic_words:
        word_dic[word] = word_dic.get(word, 0) + 1

In [13]:
# Display the word -> number-of-topics counts built in the previous cell.
word_dic

{'window': 4,
 'run': 11,
 'file': 5,
 'problem': 14,
 'work': 17,
 'program': 8,
 'try': 17,
 'look': 16,
 'help': 11,
 'write': 7,
 'set': 7,
 'windows': 3,
 'way': 16,
 'thing': 16,
 'good': 16,
 'error': 1,
 'new': 18,
 'line': 4,
 'server': 1,
 'question': 10,
 'follow': 9,
 'code': 1,
 'application': 3,
 'version': 4,
 'color': 2,
 'machine': 4,
 'start': 8,
 'driver': 3,
 'manager': 1,
 'image': 2,
 'let': 8,
 'change': 4,
 'case': 10,
 'display': 1,
 'type': 1,
 'screen': 1,
 'appreciate': 3,
 'directory': 1,
 'create': 1,
 'user': 1,
 'tell': 12,
 'bit': 8,
 'client': 1,
 'support': 3,
 'different': 2,
 'people': 14,
 'number': 12,
 'example': 3,
 'command': 1,
 'email': 9,
 'year': 15,
 'space': 2,
 'research': 1,
 'development': 1,
 'science': 1,
 'information': 5,
 'world': 4,
 'report': 2,
 'provide': 3,
 'cost': 3,
 'base': 3,
 'right': 15,
 'center': 1,
 'public': 3,
 'high': 3,
 'earth': 1,
 'place': 3,
 'send': 3,
 'available': 5,
 'date': 1,
 'plan': 1,
 'sure': 8,
 '

### Visualize the author-topic distribution

In [14]:
# Normalize the author_topic_matrix to get the author-topic distribution

# Compute the sum of each row of author_topic_matrix as a column vector.
# The sums are cast to float on purpose: if the matrix holds integer counts
# (dtype not visible here), writing epsilon into an integer array would be
# truncated to 0 and the zero-division guard below would do nothing.
author_topic_sum = (
    loaded_model.author_topic_matrix.sum(axis=1)[:, np.newaxis].astype(float)
)

# Replace zero sums with a small epsilon value so the division is defined
epsilon = 1e-10
author_topic_sum[author_topic_sum == 0] = epsilon

# Perform element-wise division (row sums broadcast across each row)
author_topic_dist = loaded_model.author_topic_matrix / author_topic_sum

# Collect the top N topic indices for each author, highest weight first
N_TOP_TOPICS = 3
top_topics_list = []
for i, author in enumerate(loaded_model.authors):
    top_topics_idx = author_topic_dist[i].argsort()[-N_TOP_TOPICS:][::-1]
    top_topics_list.append(top_topics_idx)
    # print(f"Author {i+1} => {author} : Topic IDs {top_topics_idx} \n")

In [15]:
# One row per author: the author label and that author's top topic indices.
author_topic_columns = {
    'authors': loaded_model.authors,
    'topics': top_topics_list,
}
top_topics_of_authors_df = pd.DataFrame(author_topic_columns)
top_topics_of_authors_df

Unnamed: 0,authors,topics
0,lerxst@wam.umd.edu (where's my thing),"[14, 19, 8]"
1,guykuo@carson.u.washington.edu (Guy Kuo),"[8, 5, 10]"
2,twillis@ec.ecn.purdue.edu (Thomas E Willis),"[16, 11, 19]"
3,jgreen@amber (Joe Green),"[6, 19, 18]"
4,jcm@head-cfa.harvard.edu (Jonathan McDowell),"[14, 19, 8]"
...,...,...
5764,bchuang@css.itd.umich.edu (Ben Chuang),"[7, 19, 18]"
5765,shaig@composer.think.com (Shai Guday),"[4, 12, 19]"
5766,mrj@cs.su.oz.au (Mark James),"[19, 18, 1]"
5767,ebodin@pearl.tufts.edu,"[5, 19, 18]"


In [16]:
# Show only the row(s) whose author label matches this exact string.
is_target_author = top_topics_of_authors_df['authors'] == 'lmvec@westminster.ac.uk (William Hargreaves)'
top_topics_of_authors_df[is_target_author]


Unnamed: 0,authors,topics
2079,lmvec@westminster.ac.uk (William Hargreaves),"[18, 8, 19]"
