In [1]:
import pickle
from time import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA, NMF

In [2]:
# Read the pickled story collection from disk into `stories`.
# NOTE: pickle.load can execute arbitrary code while deserializing, so this
# must only ever be pointed at a trusted, locally produced file.
with open('data/out/clean_stories.pickle', 'rb') as f:
    stories = pickle.load(f)

In [3]:
# Number of story records loaded from the pickle file.
len(stories)

98

In [4]:
# Build the corpus: every value of a story's 'texts' mapping becomes its own
# document, so one story contributes several samples.
# (An earlier variant, kept for reference, used a single document per story:
#  [story['text'] for story in stories].)
data = [
    fragment
    for entry in stories
    for fragment in entry['texts'].values()
]

In [5]:
# Total number of text fragments gathered across all stories.
len(data)

5294

In [6]:
# How many of the highest-weighted words to list for each extracted topic.
n_top_words = 10

# Corpus handed to the vectorizers, and its size for the progress messages.
data_samples = data
n_samples = len(data)

In [7]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated one row per topic;
            each row must provide ``argsort()``.
        feature_names: Sequence indexed by the positions ``argsort()`` yields.
        n_top_words: How many words to show for each topic.

    Prints one ``Topic #<idx>: w1 w2 ...`` line per topic, words ordered from
    the largest weight to the smallest, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it before taking the leading slice.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[position] for position in ranked]
        print("Topic #%d: " % topic_idx + " ".join(words))
    print()

In [8]:
# (vocabulary size, number of topics) pairs tried by the NMF and LDA loops.
nf_nc_values = [
    (50, 5),
    (100, 10),
    (150, 15),
    (200, 20),
]

# NMF

In [9]:
# NOTE(review): NMF's `alpha` argument and the vectorizer's
# `get_feature_names()` were deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older release — confirm the pinned version.

# The two NMF objectives fitted for every (n_features, n_components) pair.
# Each entry holds the "fitting" message template, the heading printed above
# the topics, and the keyword arguments added to the shared NMF settings.
nmf_variants = [
    ("Fitting the NMF model (Frobenius norm) with tf-idf features, "
     "n_samples=%d, n_features=%d, and n_topics=%d.",
     "\nTopics in NMF model (Frobenius norm):",
     dict()),
    ("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
     "tf-idf features, n_samples=%d, n_features=%d, and n_topics=%d.",
     "\nTopics in NMF model (generalized Kullback-Leibler divergence):",
     dict(beta_loss='kullback-leibler', solver='mu', max_iter=1000)),
]

for n_features, n_components in nf_nc_values:
    print(n_features, n_components)

    # NMF is run on tf-idf weights; the vocabulary is capped at n_features.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_features=n_features,
                                       min_df=10, max_df=0.95)
    t0 = time()
    tfidf = tfidf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs.\n" % (time() - t0))

    # Fit and report each objective in turn on the same tf-idf matrix.
    for fit_message, topics_heading, extra_params in nmf_variants:
        print(fit_message % (n_samples, n_features, n_components))
        t0 = time()
        nmf = NMF(n_components=n_components, random_state=1,
                  alpha=.1, l1_ratio=.5, **extra_params).fit(tfidf)
        print("done in %0.3fs." % (time() - t0))

        print(topics_heading)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()
        print_top_words(nmf, tfidf_feature_names, n_top_words)

50 5
Extracting tf-idf features for NMF...
done in 1.539s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=5294, n_features=50, and n_topics=5.
done in 0.402s.

Topics in NMF model (Frobenius norm):
Topic #0: ojos mano manos cabeza puerta cuerpo mirada labios sonrisa brazos
Topic #1: padre madre casa hermano puerta habitacion años vida tiempo noche
Topic #2: sangre vampiro vampiros vida tiempo cuerpo años mujer ojos dolor
Topic #3: gracias tiempo sonrisa chico corazon beso casa vida chica años
Topic #4: lobo hombre suelo dolor casa miedo chica ojos forma fuerte

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=5294, n_features=50, and n_topics=5.
done in 2.625s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: ojos cabeza voz mano brazos sonrisa manos puerta mirada rostro
Topic #1: casa padre años madre vida hermano noche pequeña paso pregunta
Topic #2: vampiro vampiros sangre vida años pala

# LDA

In [10]:
# NOTE(review): the vectorizer's `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2 — confirm the pinned version.

# Settings that stay the same for every (n_features, n_components) pair.
count_params = dict(max_df=0.95, min_df=10)
lda_params = dict(max_iter=10,
                  learning_method='online',
                  learning_offset=50.,
                  random_state=0)

for n_features, n_components in nf_nc_values:
    print(n_features, n_components)

    # LDA is run on raw term counts, hence CountVectorizer rather than tf-idf.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_features=n_features, **count_params)
    t0 = time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))
    print()

    # Build the estimator first so that only the fit itself is timed.
    print("Fitting LDA models with tf features, "
          "n_samples=%d, n_features=%d, and n_topics=%d."
          % (n_samples, n_features, n_components))
    lda = LDA(n_components=n_components, **lda_params)
    t0 = time()
    lda.fit(tf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)

50 5
Extracting tf features for LDA...
done in 1.448s.

Fitting LDA models with tf features, n_samples=5294, n_features=50, and n_topics=5.
done in 19.004s.

Topics in LDA model:
Topic #0: puerta casa habitacion cama ojos chico chica tiempo gracias cabeza
Topic #1: padre madre casa ojos mirada tiempo pregunta voz sonrisa años
Topic #2: lobo hombre mujer ojos vida dolor años cuerpo voz corazon
Topic #3: ojos manos mano cuerpo cabeza mirada labios rostro sonrisa brazos
Topic #4: sangre vampiro vampiros ojos cabeza mano cuerpo manos rostro cuello

100 10
Extracting tf features for LDA...
done in 1.461s.

Fitting LDA models with tf features, n_samples=5294, n_features=100, and n_topics=10.
done in 26.881s.

Topics in LDA model:
Topic #0: casa auto puerta padres mañana cara hora noche gracias camino
Topic #1: ojos mano manos cabeza suelo brazos cuello cuerpo sangre labios
Topic #2: hombre mujer hermano años hermana lobo vida familia bosque puerta
Topic #3: luna manada lobo bosque ojos casa 

El algoritmo NMF (Non-negative Matrix Factorization) parte de la idea de que un conjunto grande de documentos (donde cada documento está formado por un conjunto de palabras con sus respectivas frecuencias en el documento) se puede descomponer y reducir a un conjunto mucho menor de documentos arquetípicos. Aplicado a la extracción de tópicos, estos arquetipos representan los tópicos del conjunto total de documentos. [2][3][4]

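LDA (Latent Dirichlet Allocation) [5][6][7] es un modelo probabilístico generativo en el que cada documento se describe como una mezcla de tópicos y cada tópico como una distribución de probabilidad sobre las palabras. (Descripción completa y más referencias en el documento de Word adjunto.)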

**References**

[2] Lee, Daniel D., and H. Sebastian Seung. 1999. “Learning the Parts of Objects by Non-Negative Matrix Factorization.” *Nature* 401 (6755): 788. https://doi.org/10.1038/44565.

[3] Cichocki, Andrzej, and Anh-Huy Phan. 2009. “Fast Local Algorithms for Large Scale Nonnegative Matrix and Tensor Factorizations.” *IEICE Transactions on Fundamentals of Electronics, Communications and Computer Sciences* 92 (3): 708–721.

[4] Févotte, Cédric, and Jérôme Idier. 2011. “Algorithms for Nonnegative Matrix Factorization with the β-Divergence.” *Neural Computation* 23 (9): 2421–2456.

[5] Blei, David M., Andrew Y. Ng, and Michael I. Jordan. 2003. “Latent Dirichlet Allocation.” *Journal of Machine Learning Research* 3 (March): 993–1022.

[6] Hoffman, Matthew, Francis R. Bach, and David M. Blei. 2010. “Online Learning for Latent Dirichlet Allocation.” In *Advances in Neural Information Processing Systems 23*, edited by J. D. Lafferty, C. K. I. Williams, J. Shawe-Taylor, R. S. Zemel, and A. Culotta, 856–864. Curran Associates, Inc. http://papers.nips.cc/paper/3902-online-learning-for-latent-dirichlet-allocation.pdf.

[7] Hoffman, Matt, David M. Blei, Chong Wang, and John Paisley. 2013. “Stochastic Variational Inference.” *ArXiv:1206.7051 [Cs, Stat]*, June. http://arxiv.org/abs/1206.7051.