# ADMAGD - 20 news dataset

## Importing libraries

In [2]:
import numpy as np
from gensim import corpora
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [1]:
from model.admagd import ADMAGD

In [3]:
# Fetch data
categories = ['comp.graphics', 'rec.motorcycles', 'talk.politics.guns']
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('footers', 'quotes'))

# Vectorized the data
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english', max_features=1000)
X = vectorizer.fit_transform(newsgroups.data)

# Convert matrix to list of tuples (document, word count)
corpus = [list(zip(row.indices, row.data)) for row in X]

# id2word mapping
id2word = {v: k for k, v in vectorizer.vocabulary_.items()}

# Mock author2doc mapping (assuming each document has a single author)
authors = {f"author_{i}": [i] for i in range(len(corpus))}

In [4]:
authors

{'author_0': [0],
 'author_1': [1],
 'author_2': [2],
 'author_3': [3],
 'author_4': [4],
 'author_5': [5],
 'author_6': [6],
 'author_7': [7],
 'author_8': [8],
 'author_9': [9],
 'author_10': [10],
 'author_11': [11],
 'author_12': [12],
 'author_13': [13],
 'author_14': [14],
 'author_15': [15],
 'author_16': [16],
 'author_17': [17],
 'author_18': [18],
 'author_19': [19],
 'author_20': [20],
 'author_21': [21],
 'author_22': [22],
 'author_23': [23],
 'author_24': [24],
 'author_25': [25],
 'author_26': [26],
 'author_27': [27],
 'author_28': [28],
 'author_29': [29],
 'author_30': [30],
 'author_31': [31],
 'author_32': [32],
 'author_33': [33],
 'author_34': [34],
 'author_35': [35],
 'author_36': [36],
 'author_37': [37],
 'author_38': [38],
 'author_39': [39],
 'author_40': [40],
 'author_41': [41],
 'author_42': [42],
 'author_43': [43],
 'author_44': [44],
 'author_45': [45],
 'author_46': [46],
 'author_47': [47],
 'author_48': [48],
 'author_49': [49],
 'author_50': [50],


In [5]:
# Initialize alpha, beta, a, and b if different from the defaults
alpha_init = 0.1
beta_init = 0.1
a_init = 0.1
b_init = 0.1

num_topics=10

In [6]:
# Instantiate the ADMAGD model
model = ADMAGD(corpus=corpus, num_topics=num_topics, id2word=id2word, authors=authors, alpha_init=alpha_init, beta_init=beta_init, a_init=a_init, b_init=b_init)

In [7]:
# Run Gibbs sampling
model.gibbs_sampling(iterations=5)

iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4


In [8]:
# After you've run Gibbs sampling
word_topic_matrix = model.word_topic_matrix
word_topic_sum = word_topic_matrix.sum(axis=1)[:, np.newaxis]
word_topic_dist = word_topic_matrix / word_topic_sum

In [10]:
# Visualize the top N words for each topic
N = 10
for i in range(model.num_topics):
    top_words_idx = word_topic_dist[i].argsort()[-N:][::-1]
    top_words = [model.id2word[idx] for idx in top_words_idx]
    print(f"Topic {i + 1}: {', '.join(top_words)}")


Topic 1: posting, host, nntp, edu, university, distribution, com, computer, does, version
Topic 2: edu, posting, host, nntp, com, university, just, don, distribution, time
Topic 3: nntp, posting, com, host, edu, distribution, university, computer, like, time
Topic 4: nntp, host, posting, edu, university, com, usa, know, don, distribution
Topic 5: nntp, posting, host, edu, university, like, distribution, com, think, know
Topic 6: posting, host, com, nntp, university, reply, just, time, edu, world
Topic 7: com, nntp, posting, host, edu, university, think, reply, graphics, just
Topic 8: host, edu, com, nntp, like, posting, university, distribution, know, world
Topic 9: nntp, posting, host, edu, university, com, like, reply, computer, don
Topic 10: posting, host, edu, nntp, university, com, distribution, good, want, just


Visualize the author-topic distribution

In [11]:
# Normalize the author_topic_matrix to get author-topic distribution
author_topic_sum = model.author_topic_matrix.sum(axis=1)[:, np.newaxis]
author_topic_dist = model.author_topic_matrix / author_topic_sum

# Visualize the top N topics for each author
N = 3
for i, author in enumerate(model.authors):
    top_topics_idx = author_topic_dist[i].argsort()[-N:][::-1]
    print(f"Author {author}: Topic IDs {top_topics_idx}")

Author author_0: Topic IDs [4 9 1]
Author author_1: Topic IDs [8 4 9]
Author author_2: Topic IDs [4 9 8]
Author author_3: Topic IDs [5 0 9]
Author author_4: Topic IDs [9 3 5]
Author author_5: Topic IDs [1 5 7]
Author author_6: Topic IDs [5 0 9]
Author author_7: Topic IDs [4 2 5]
Author author_8: Topic IDs [9 0 8]
Author author_9: Topic IDs [0 3 9]
Author author_10: Topic IDs [1 8 9]
Author author_11: Topic IDs [4 7 2]
Author author_12: Topic IDs [9 7 4]
Author author_13: Topic IDs [6 8 9]
Author author_14: Topic IDs [9 5 2]
Author author_15: Topic IDs [4 3 9]
Author author_16: Topic IDs [0 2 3]
Author author_17: Topic IDs [4 7 9]
Author author_18: Topic IDs [7 5 2]
Author author_19: Topic IDs [1 3 9]
Author author_20: Topic IDs [5 0 1]
Author author_21: Topic IDs [8 4 9]
Author author_22: Topic IDs [0 6 9]
Author author_23: Topic IDs [8 9 7]
Author author_24: Topic IDs [4 0 9]
Author author_25: Topic IDs [4 7 1]
Author author_26: Topic IDs [4 2 9]
Author author_27: Topic IDs [0 6 9]
Au