# DMA topic model on the 20 Newsgroups dataset

## Importing libraries

In [19]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

from model.dma import DMA

## Fetch news data

In [7]:
# Three newsgroups used as the working corpus for this notebook.
categories = [
    'comp.graphics',
    'rec.motorcycles',
    'talk.politics.guns',
]

In [9]:
# Load only the training split, restricted to the selected categories.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
)

In [12]:
# Category names reported by the loader for the fetched subset.
newsgroups_train.target_names

['comp.graphics', 'rec.motorcycles', 'talk.politics.guns']

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words counts: English stop words are dropped and the vocabulary is
# capped at 5000 terms to keep the document-term matrix manageable.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=5000,
)
X = vectorizer.fit_transform(newsgroups_train.data)

In [34]:
# The vocabulary lives on the fitted vectorizer, not on the sparse matrix that
# fit_transform returns (calling it on X raises AttributeError). Preview only
# the first entries instead of dumping the whole vocabulary.
vectorizer.get_feature_names_out()[:20]

In [17]:
# DMA settings:
#   K     - number of topics
#   alpha - hyperparameter for the document-topic distribution
#   beta  - hyperparameter for the topic-word distribution
K, alpha, beta = 20, 0.5, 0.5

In [21]:
# Convert the sparse document-term count matrix to a dense numpy array.
# NOTE(review): presumably DMA cannot consume scipy sparse input - confirm;
# the dense copy holds one row per document and one column per vocabulary term.
X_array = X.toarray()

In [22]:
# Build the DMA model with K topics and the alpha/beta hyperparameters set
# above, then fit it on the dense count matrix.
# NOTE(review): if DMA's fitting is stochastic, a seed should be fixed before
# this cell so the topics below are reproducible - confirm against model.dma.
dma = DMA(K, alpha, beta)
dma.fit(X_array)

In [24]:
# Use the fitted DMA model to predict the topic distribution for each document.
# The same matrix used for fitting is scored here; p_z is indexed per document
# in the next cell, where each entry prints as a vector of K values.
p_z = dma.predict(X_array)

In [25]:
# Show the predicted topic distribution of the first five documents.
for doc_idx in range(5):
    topic_probs = p_z[doc_idx]
    print(f"Document {doc_idx + 1} - Predicted Topic Distribution: {topic_probs}")

Document 1 - Predicted Topic Distribution: [9.99999835e-01 1.64903223e-07 8.26606473e-50 1.10090313e-29
 1.28872832e-15 1.00087592e-45 3.16587680e-28 9.60996682e-30
 1.98654119e-35 1.66990311e-27 4.04659886e-37 9.77501145e-23
 2.70045974e-51 1.62902721e-28 5.67945129e-30 8.75881243e-25
 9.19999168e-29 4.72925592e-18 6.95236925e-25 4.32850524e-18]
Document 2 - Predicted Topic Distribution: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Document 3 - Predicted Topic Distribution: [2.01953835e-32 1.78010683e-38 8.78369441e-20 3.64176554e-33
 9.13066229e-29 9.57760838e-38 2.01209483e-40 2.37994321e-42
 5.12212849e-44 5.24925693e-35 1.00000000e+00 3.16672476e-38
 4.52304833e-56 1.48698308e-37 5.03584143e-33 8.59708325e-37
 4.73985897e-29 7.72880299e-26 1.50763909e-37 8.39057899e-36]
Document 4 - Predicted Topic Distribution: [1.00000000e+00 6.36778158e-42 3.92789436e-93 2.34120190e-56
 1.67481421e-53 1.74061229e-76 3.53507614e-68 4.59420559e-64
 3.39872326e-81 4.63704021e-62 8

In [26]:
def display_topics(model, vocab, n_top_words):
    """Print the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model : object
        Any object exposing ``phi``, an iterable with one weight vector per
        topic; each vector is indexed in the same order as ``vocab``.
    vocab : sequence of str
        Vocabulary terms, position-aligned with the entries of each row of
        ``model.phi``.
    n_top_words : int
        How many terms to list per topic. If it exceeds the vocabulary size,
        every term is listed.

    Raises
    ------
    ValueError
        If ``n_top_words`` is negative (a negative count would otherwise be
        silently turned into an unrelated slice of the vocabulary).

    Notes
    -----
    Output goes to stdout, one ``Topic <n>: w1 w2 ...`` line per topic, with
    terms ordered from highest to lowest weight. Nothing is returned.
    """
    if n_top_words < 0:
        raise ValueError(f"n_top_words must be non-negative, got {n_top_words}")

    # Convert the vocabulary once; it does not change between topics.
    vocab_arr = np.asarray(vocab)

    for topic_no, topic_dist in enumerate(model.phi, start=1):
        # Ascending argsort reversed, then truncated: the same ordering
        # (including ties) as slicing the sorted vocabulary from the end.
        top_idx = np.argsort(topic_dist)[::-1][:n_top_words]
        words = " ".join(vocab_arr[top_idx])
        print(f'Topic {topic_no}: {words}')


In [38]:
# List the ten highest-weighted vocabulary terms of every fitted topic.
display_topics(
    dma,
    vectorizer.get_feature_names_out(),
    n_top_words=10,
)

Topic 1: com sun ca writes helmet lines article subject like east
Topic 2: com bike dod edu organization subject article lines new writes
Topic 3: file image jpeg files images color format gif bit program
Topic 4: edu subject posting lines nntp organization host university article ca
Topic 5: edu group graphics umich ch den p2 p3 ti newsgroup
Topic 6: gun militia state law people firearms right control amendment weapons
Topic 7: crime com study house handgun seattle vancouver att congress handguns
Topic 8: 000 com gun colorado manes rate uk weaver year homicide
Topic 9: 1993 apr 18 mil 00 run division 20 93 23
Topic 10: edu police don gun know like semi safety cops auto
Topic 11: edu lines subject organization graphics university 3d thanks version know
Topic 12: edu fbi batf people com waco compound lines subject article
Topic 13: data graphics ftp available pub use package sgi com processing
Topic 14: edu com stratus writes cdt sw article lines organization subject
Topic 15: edu david