In [1]:
""" Get the 20 news groups data """
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import numpy as np

# Fetch both splits with identical options: shuffled with a fixed seed, and
# with headers, footers and quotes removed from every post.
fetch_options = dict(shuffle=True, random_state=1,
                     remove=("headers", "footers", "quotes"))
newsgroups_train = fetch_20newsgroups(subset="train", **fetch_options)
newsgroups_test = fetch_20newsgroups(subset="test", **fetch_options)

# Prepare input for sklearn (counts)
n_features = 3000
vectorizer = CountVectorizer(max_features=n_features, stop_words="english")

# Word counts per document matrix (input for sklearn).
# The vectorizer is fitted on the training split only; the test split is
# transformed with that same fitted vectorizer.
W_train = vectorizer.fit_transform(newsgroups_train.data)
W_test = vectorizer.transform(newsgroups_test.data)

# Keep track of vocabulary to visualize top words of each topic.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides and
# normalise to a plain list so downstream indexing is unchanged.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

In [2]:
from sklearn.decomposition import LatentDirichletAllocation

def visualizeTopics(model, id2word, n_top_words=12):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of 1-D arrays (one per
        topic) that support ``argsort()``.
    id2word : sequence
        Maps a column index of ``components_`` to the corresponding word.
    n_top_words : int, optional
        How many words to print per topic (default 12).

    Output is written to stdout: a ``Topic <k>:`` header (1-based) followed by
    one line with the words separated by single spaces, heaviest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Single-argument print(...) behaves identically on Python 2 and 3,
        # unlike the bare print statement.
        print("Topic {}:".format(topic_idx + 1))
        # argsort() is ascending, so walk it backwards from the end to get
        # the n_top_words largest weights in descending order.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        print(" ".join([id2word[j] for j in top_indices]))

        
# Sweep over the number of topics (2, 4, ..., 28) and record the perplexity
# of each fitted model on both the training and the test count matrices.
topic_numbers = range(2, 30, 2)
train_perplexity = []
test_perplexity = []

for n_topics in topic_numbers:
    # Progress indicator; print(...) with one argument works on Python 2 and 3.
    print(n_topics)

    # The topic count is passed positionally because its keyword was renamed
    # from n_topics to n_components across scikit-learn releases; the first
    # positional parameter is the topic count in both.
    # random_state is fixed so the perplexity curve is reproducible.
    lda = LatentDirichletAllocation(n_topics, learning_method="batch",
                                    random_state=1)

    # Train on training set
    lda.fit(W_train)

    # Compute training and test perplexities
    train_perplexity.append(lda.perplexity(W_train))
    test_perplexity.append(lda.perplexity(W_test))


train_perplexity = np.array(train_perplexity)
test_perplexity = np.array(test_perplexity)

# Persist the curves so they can be reloaded without refitting.
np.savetxt("perp_train_sklearn.txt", train_perplexity)
np.savetxt("perp_test_sklearn.txt", test_perplexity)

# `lda` now holds the model fitted for the last value in topic_numbers;
# its topics can be inspected with: visualizeTopics(lda, vocabulary)

2




4




6




8




10




12




14




16




18




20




22




24




26




28




NameError: name 'np' is not defined

In [None]:
import os

from matplotlib import pyplot as plt

# Plot training and test perplexity against the number of topics and save
# the figure to disk before showing it.
plot_path = "plots/perplexity_as_a_function_of_number_of_topics"

# savefig does not create missing directories, so make sure the target
# folder exists first (written without exist_ok to stay Python 2 compatible).
plot_dir = os.path.dirname(plot_path)
if plot_dir and not os.path.isdir(plot_dir):
    os.makedirs(plot_dir)

fig, ax = plt.subplots()
ax.plot(topic_numbers, train_perplexity, label="Training")
ax.plot(topic_numbers, test_perplexity, label="Testing")
ax.set_title("Perplexity as a function of the number of topics")
ax.set_xlabel("Number of topics")
ax.set_ylabel("Perplexity")
ax.legend()
fig.savefig(plot_path)
plt.show()

In [None]:
# Visualize learnt topics for the model with the best test set perplexity.
# NOTE(review): 12 is hard-coded as the chosen topic count; re-check it
# against the perplexity curve whenever the sweep is re-run.
best_n_topics = 12

# The topic count is passed positionally because its keyword was renamed from
# n_topics to n_components across scikit-learn releases. random_state is
# fixed so the printed topics are the same on every run.
lda = LatentDirichletAllocation(best_n_topics, learning_method="batch",
                                random_state=1)
lda.fit(W_train)
visualizeTopics(lda, vocabulary)