# DMA - 20 news dataset

## Importing libraries

In [12]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from model.dma import DMA

## Fetch news data

In [13]:
categories = ['comp.graphics', 'rec.motorcycles', 'talk.politics.guns']

In [19]:
# Load 20 Newsgroups data
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

In [20]:
newsgroups_train.target_names

['comp.graphics', 'rec.motorcycles', 'talk.politics.guns']

In [22]:
# Preprocess and convert documents to BoW representation
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(newsgroups_train.data)

In [23]:
# Convert sparse matrix to numpy array
X_array = X.toarray()

In [24]:
K = 20  # number of topics
alpha = 0.5  # hyperparameter for the document-topic distribution
beta = 0.5  # hyperparameter for the topic-word distribution

In [25]:
dma = DMA(K, alpha, beta)
dma.fit(X_array)

In [26]:
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

X_test = vectorizer.fit_transform(newsgroups_test.data)

X_test_array = X_test.toarray()

In [27]:
# Use the DMA model to predict the topic distribution for each document
p_z = dma.predict(X_test_array)

In [28]:
# Print the predicted topic distribution for each document
for i in range(5):
    print(f"Document {i+1} - Predicted Topic Distribution: {p_z[i]}")

Document 1 - Predicted Topic Distribution: [4.73716037e-08 9.21917109e-14 8.66493314e-09 2.26567772e-03
 6.48150632e-12 2.93040603e-04 4.25761813e-14 2.05599113e-12
 1.13364233e-11 4.37270564e-01 2.14973847e-07 9.61023794e-05
 6.08348688e-11 3.78674305e-07 4.67115553e-06 2.30218878e-05
 8.53931992e-05 1.32929590e-08 3.55761753e-10 5.59960866e-01]
Document 2 - Predicted Topic Distribution: [1.54423580e-02 9.99166468e-08 5.25416142e-04 1.73460251e-01
 2.57384518e-06 2.22969493e-03 1.68949209e-08 3.64062203e-10
 2.38281398e-05 8.02865286e-01 5.69197118e-06 1.66427661e-03
 1.47480641e-06 1.10983340e-05 3.65675619e-05 6.05341211e-05
 1.84304462e-03 9.17443543e-05 3.10003206e-06 1.73294285e-03]
Document 3 - Predicted Topic Distribution: [3.62195611e-04 3.47178282e-10 6.06138862e-07 1.92066083e-03
 2.88328994e-07 5.38469495e-10 6.70243425e-16 1.91898910e-13
 7.23044183e-08 9.03752087e-02 2.27337418e-06 1.53911188e-05
 1.45440890e-15 2.73664482e-04 7.02985474e-06 3.87751899e-06
 2.05557678e-04

In [29]:
def display_topics(model, vocab, n_top_words):
    for i, topic_dist in enumerate(model.phi):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
        print(f'Topic {i+1}: {" ".join(topic_words)}')


In [30]:
display_topics(dma, vectorizer.get_feature_names_out(), n_top_words=10)

Topic 1: path patches devices second paste richard rates averages sale umich
Topic 2: bindings kicked ridiculous locators green histograms wet east eddy nancy
Topic 3: tom powerful pronounced lack machine vast login pounds holds alias
Topic 4: mistake criminals thier riding lies enhancing haven street nancy lady
Topic 5: stills risc stopped modeling leaves american pica screeching van white
Topic 6: flow foot fool idea 212 ones 21 sect 205 line
Topic 7: imported kbytes prove fucking improvements comparable van gov awful flying
Topic 8: eddy pica locators kicked lack topic trained vice green martial
Topic 9: gunners solved decisions 400 pay battle sessions uuencoded mar property
Topic 10: tmp vice supplies tom court 000 siggraph distributed desert pavement
Topic 11: bindings nurbs variable bios exchange model green recommend terry mac
Topic 12: henry 22 6045 xx motorcyclists described 24th 144 49931 needless
Topic 13: happen d8 happens coreldraw 000 searching foot pp hawk leaves
Topic 1