In [1]:
import numpy as np

### Initial data

- `BOW`: Un diccionario que representa un "Bag of Words" (bolsa de palabras), que asocia palabras con sus respectivas frecuencias en un corpus.
- `ULM`: Un diccionario que asigna a cada palabra su probabilidad bajo el modelo de fondo (background). Estas probabilidades suman 1, representan cómo se distribuyen las palabras en general y permanecen fijas durante el algoritmo EM (solo se reestima la distribución del tema).

In [2]:
# Bag of words: word -> number of occurrences.
BOW = dict(the=4, paper=2, text=4, mining=2)

# Background model: word -> probability (the four values sum to 1).
ULM = dict(the=0.5, paper=0.3, text=0.1, mining=0.1)

In [3]:
# Show the values of `ULM` as a NumPy array (the form used in later calculations).
np.array([*ULM.values()])

array([0.5, 0.3, 0.1, 0.1])

In [4]:
# Draw one random 4-component probability vector from a flat Dirichlet
# distribution (all concentration parameters equal to 1). A seeded generator
# is used so the draw is identical after every "Restart Kernel -> Run All".
rng = np.random.default_rng(42)
weight_initialization = rng.dirichlet(np.ones(4), size=1)  # shape (1, 4)

# Sanity check: the components of a Dirichlet sample sum to 1 (up to
# floating-point rounding); also show the sampled values as a plain list.
weight_initialization.sum(), weight_initialization.flatten().tolist()

(0.9999999999999999,
 [0.2764594829885556,
  0.393871557703625,
  0.1997951116104737,
  0.12987384769734567])

In [5]:
def em_algorithm(ulm, bow, p_topic_1, p_background, alpha):
    """Estimate a topic word distribution with EM for a topic/background mixture.

    Every word occurrence is modelled as coming from the topic distribution
    with weight ``p_topic_1`` or from the background distribution with weight
    ``p_background``. The background distribution and both weights stay
    fixed; only the topic word distribution is re-estimated, starting from a
    uniform distribution over the words of ``bow``.

    Parameters
    ----------
    ulm : dict
        Maps a word to its background probability. Must contain every word
        of ``bow``; words that are not in ``bow`` are ignored. Words are
        matched by key, so the two dicts may be ordered differently.
    bow : dict
        Maps a word to its count. Must not be empty.
    p_topic_1 : float
        Mixture weight of the topic distribution.
    p_background : float
        Mixture weight of the background distribution.
    alpha : float
        Convergence threshold. Iteration stops as soon as the log-likelihood
        gain of an EM pass is below ``alpha``. Should be positive.

    Returns
    -------
    p_w_topic : list
        Estimated topic probability of each word, in the key order of ``bow``.
    p_z_word : numpy.ndarray
        For each word, the posterior probability that an occurrence was
        generated by the topic (E-step result for the returned ``p_w_topic``).
        These are per-word posteriors, so they do not sum to 1 across words.

    Raises
    ------
    ValueError
        If ``bow`` is empty or contains a word that is missing from ``ulm``.

    Notes
    -----
    Prints the iteration counter (it starts at 1 and is incremented after
    every EM pass) followed by the last log-likelihood gain.
    """
    if not bow:
        raise ValueError("bow must contain at least one word")
    missing = [word for word in bow if word not in ulm]
    if missing:
        raise ValueError(f"ulm has no background probability for: {missing}")

    # Align both inputs on the words of `bow` instead of relying on the two
    # dicts happening to share the same insertion order.
    words = list(bow)
    word_counts = np.array([bow[word] for word in words], dtype=float)
    p_w_back = np.array([ulm[word] for word in words], dtype=float)
    # Words with a zero count contribute nothing to the likelihood; skipping
    # them avoids evaluating 0 * log(0).
    observed = word_counts > 0

    def e_step(p_w_topic):
        """Posterior probability that each word was generated by the topic."""
        topic_mass = p_topic_1 * p_w_topic
        return topic_mass / (topic_mass + p_background * p_w_back)

    def log_likelihood(p_w_topic):
        """Log-likelihood of the counts, summed in log space.

        Multiplying the per-word likelihoods first underflows to 0 for
        realistic counts, which turns the convergence test into NaN.
        """
        p_word = p_topic_1 * p_w_topic + p_background * p_w_back
        return np.sum(word_counts[observed] * np.log(p_word[observed]))

    # Uniform start, sized to the vocabulary rather than fixed at 4 words.
    p_w_topic = np.full(len(words), 1.0 / len(words))
    p_z_word = e_step(p_w_topic)
    log_like = log_likelihood(p_w_topic)

    lk = 99  # sentinel gain so the loop body normally runs at least once
    it = 1
    while lk >= alpha:
        # M-step: re-estimate the topic distribution from the expected
        # number of topic-generated occurrences of each word.
        expected_topic_counts = word_counts * p_z_word
        p_w_topic = expected_topic_counts / expected_topic_counts.sum()

        # E-step for the updated topic distribution.
        p_z_word = e_step(p_w_topic)

        # Gain of this pass (new minus old), valid whatever the sign of the
        # log-likelihood values.
        new_log_like = log_likelihood(p_w_topic)
        lk = new_log_like - log_like
        log_like = new_log_like
        it += 1

    print(it, lk)
    return list(p_w_topic), p_z_word

In [85]:
# Run EM with equal topic/background weights and a 1e-5 convergence threshold.
p_w_topic, p_z_word = em_algorithm(
    ulm=ULM,
    bow=BOW,
    p_topic_1=0.5,
    p_background=0.5,
    alpha=1e-5,
)

24 9.818380137360805e-06


In [87]:
# Total of the topic word probabilities and total of the per-word posteriors.
tuple(sum(values) for values in (p_w_topic, p_z_word))

(1.0, 1.9033115393103048)