# DMA - Scratch Implementation

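This notebook implements the DMA topic model from scratch with NumPy: a Gibbs-style sampler for fitting, a posterior topic prediction for new documents, and a small synthetic-data demo.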

## Importing libraries

In [1]:
import numpy as np

## Model Implementation

In [1]:
class DMA:
    """Topic model over a document-word count matrix, fitted by Gibbs-style sampling.

    One topic label is kept per (document, word) cell with a positive count,
    so every occurrence of a word inside a document is moved between topics
    together.

    Attributes:
        K: number of topics.
        alpha: symmetric hyperparameter added to the document-topic counts.
        beta: symmetric hyperparameter added to the topic-word counts.
        phi: topic-word distribution of shape (K, V), set by ``fit``; every
            row sums to 1.
    """

    def __init__(self, K, alpha, beta):
        """Store the hyperparameters.

        Raises:
            ValueError: if ``K`` is smaller than 1 or ``alpha``/``beta`` is
                not strictly positive (the sampling weights and the
                logarithms used for prediction are undefined otherwise).
        """
        if K < 1:
            raise ValueError(f"K must be at least 1, got {K!r}")
        if alpha <= 0 or beta <= 0:
            raise ValueError(
                f"alpha and beta must be positive, got alpha={alpha!r}, beta={beta!r}"
            )
        self.K = K  # number of topics
        self.alpha = alpha  # hyperparameter for the document-topic distribution
        self.beta = beta  # hyperparameter for the topic-word distribution

    def fit(self, X, max_iter=100, random_state=None):
        """Sample topic assignments for ``X`` and estimate ``self.phi``.

        Args:
            X: 2-D array of shape (N, V) holding word counts per document.
                Only cells with a count greater than zero take part.
            max_iter: number of full sweeps over the positive cells.
            random_state: optional seed. When ``None`` (the default) the
                global ``np.random`` state is used, so ``np.random.seed``
                keeps controlling the result; otherwise a private
                ``np.random.RandomState(random_state)`` is used and the
                global state is left untouched.

        Returns:
            The fitted model (``self``).

        Raises:
            ValueError: if ``X`` is not two-dimensional.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"X must be a 2-D count matrix, got {X.ndim} dimension(s)")
        N, V = X.shape

        # The np.random module and a RandomState instance both expose
        # ``choice`` with the same signature, so either can serve as ``rng``.
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Visit only the positive cells. np.nonzero yields them in row-major
        # order, i.e. the same order as a nested loop over documents and words.
        rows, cols = np.nonzero(X > 0)
        entries = list(zip(rows.tolist(), cols.tolist(), X[rows, cols].tolist()))

        n_zw = np.zeros((self.K, V), dtype=int)  # count of each word in each topic
        n_zd = np.zeros((self.K, N), dtype=int)  # count of each topic in each document
        n_z_sum = np.zeros(self.K, dtype=int)  # total count assigned to each topic
        beta_total = self.beta * V  # smoothing mass in the topic-word denominator

        # Initialize topic assignments randomly
        topics = []
        for i, j, count in entries:
            k = rng.choice(self.K)
            topics.append(k)
            n_zw[k, j] += count
            n_zd[k, i] += count
            n_z_sum[k] += count

        # Iterate over the data and update topic assignments
        for _ in range(max_iter):
            for idx, (i, j, count) in enumerate(entries):
                # Remove the cell from its current topic before scoring
                k = topics[idx]
                n_zw[k, j] -= count
                n_zd[k, i] -= count
                n_z_sum[k] -= count

                # Conditional weight of each topic for this cell
                p_z = (
                    (n_zw[:, j] + self.beta)
                    / (n_z_sum + beta_total)
                    * (n_zd[:, i] + self.alpha)
                )
                p_z /= p_z.sum()

                # Sample a new topic and add the cell back under it
                k = rng.choice(self.K, p=p_z)
                topics[idx] = k
                n_zw[k, j] += count
                n_zd[k, i] += count
                n_z_sum[k] += count

        # Smoothed topic-word distribution
        self.phi = (n_zw + self.beta) / (n_z_sum[:, np.newaxis] + beta_total)
        return self

    def predict(self, X):
        """Return per-document topic probabilities under the fitted ``phi``.

        Each document is scored as if all of its words came from a single
        topic: the log-likelihood of topic ``k`` is the count-weighted sum of
        ``log(phi[k, word])`` over the words with a positive count. The
        symmetric ``alpha`` prior would add the same constant to every topic
        and cancel in the normalisation, so it is not applied.

        Args:
            X: 2-D array of shape (N, V) of word counts, with the same
                vocabulary size V as the data passed to ``fit``.

        Returns:
            Array of shape (N, K); each row sums to 1. A document without
            positive counts gets the uniform distribution.

        Raises:
            AttributeError: if ``phi`` has not been set (``fit`` not called).
            ValueError: if ``X`` is not 2-D or its vocabulary size differs
                from that of ``phi``.
        """
        phi = getattr(self, "phi", None)
        if phi is None:
            raise AttributeError("DMA has no 'phi' yet; call fit() before predict()")
        phi = np.asarray(phi)

        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"X must be a 2-D count matrix, got {X.ndim} dimension(s)")
        if X.shape[1] != phi.shape[1]:
            raise ValueError(
                f"X has {X.shape[1]} vocabulary columns but the model was "
                f"fitted with {phi.shape[1]}"
            )

        log_phi = np.log(phi)
        rows, cols = np.nonzero(X > 0)

        # Accumulate count * log(phi) per document. np.add.at is unbuffered,
        # so repeated document indices are summed rather than overwritten,
        # and zero-count cells never touch log_phi.
        log_p = np.zeros((X.shape[0], self.K))
        np.add.at(log_p, rows, X[rows, cols][:, np.newaxis] * log_phi[:, cols].T)

        # Subtract the row maximum before exponentiating to avoid underflow
        log_p -= log_p.max(axis=1, keepdims=True)
        p_z = np.exp(log_p)
        p_z /= p_z.sum(axis=1, keepdims=True)

        return p_z


In [2]:
# Synthetic corpus: every document draws a single topic from its own topic
# mixture and then 100 words from that topic's word distribution.
np.random.seed(42)
N, V, K = 100, 50, 5  # documents, vocabulary size, topics

# Per-document topic mixtures, one row per document
theta = np.random.dirichlet(alpha=np.full(K, 0.5), size=N)

# Per-topic word distributions, one row per topic
phi = np.random.dirichlet(alpha=np.full(V, 0.5), size=K)

# Word-count matrix, filled one document at a time
X = np.zeros(shape=(N, V), dtype=int)
for i, doc_theta in enumerate(theta):
    z = np.random.choice(K, p=doc_theta)
    X[i] = np.random.multinomial(n=100, pvals=phi[z])

# Hyperparameters used when fitting the DMA model
alpha, beta = 0.5, 0.5

In [3]:
# Build the model from the chosen hyperparameters and train it on the corpus
dma = DMA(K=K, alpha=alpha, beta=beta)
dma.fit(X=X)

In [4]:
# Generate five new documents, reusing the topic mixtures of the first five
# training documents and drawing 100 words for each
X_new = np.zeros(shape=(5, V), dtype=int)
for i, doc_theta in enumerate(theta[:5]):
    z = np.random.choice(K, p=doc_theta)
    X_new[i] = np.random.multinomial(n=100, pvals=phi[z])

In [5]:
# Topic probabilities for every row of X_new under the fitted model
predicted_topics = dma.predict(X=X_new)

In [6]:
# Report the first five predicted distributions, numbering documents from 1
for i, topic_dist in enumerate(predicted_topics[:5]):
    print(f"Document {i + 1} - Predicted Topic Distribution: {topic_dist}")

Document 1 - Predicted Topic Distribution: [1.32269992e-175 1.56879766e-147 1.00000000e+000 2.65316137e-135
 1.48498789e-115]
Document 2 - Predicted Topic Distribution: [4.88275435e-078 1.83287752e-078 3.98570932e-086 4.70088113e-106
 1.00000000e+000]
Document 3 - Predicted Topic Distribution: [3.21519914e-169 2.17509076e-124 1.00000000e+000 9.11490836e-149
 1.18314102e-091]
Document 4 - Predicted Topic Distribution: [1.06264695e-085 3.47572747e-089 1.82573773e-085 1.16590233e-101
 1.00000000e+000]
Document 5 - Predicted Topic Distribution: [1.89812934e-151 1.09980934e-127 1.00000000e+000 8.97912522e-145
 1.35943884e-089]
