topics

A topic modeling suite. Below is an example usage; loading a corpus, discovering a topic model for it and using that model to infer the topics of a new document.

import topics # Topic modelling library
import sys
from os import listdir
from os.path import isfile, join
import numpy as np


# Create a list of TextDocument based on the files in a directory given at the command line
fpaths = [ join(sys.argv[1], f) for f in listdir(sys.argv[1]) if isfile(join(sys.argv[1], f)) ]
docs = []
for f in fpaths:
	fobj = open(f)
	# Create the document with the contents of the file, lowercasing it
	docs.append(topics.TextDocument(fobj.read(), to_lower=True))
	fobj.close


# Create a corpus based on the list of documents, filtering the 25 most common
# words, words that appear less than 3 times, and some specific words.
filters = {	'filter_high':	(topics.Corpus.FILTER_FREQUENCY, 25),
			'filter_low': 	(topics.Corpus.FILTER_COUNT, 3),
			'filter_set': 	set(["nevertheless", "insofar", "etc"]) }
corp = topics.Corpus(docs, **filters)


# Learn a topic model on the corpus. 10 topics are learned.
NUM_TOPICS = 10
model = topics.gibbs_lda_learn(corp, NUM_TOPICS, num_iterations=1000)


# Show the list of words that contribute to each topic
for topic in range(NUM_TOPICS):
	print model.describe_topic(topic)

# Show the main topic for each document
for doc_idx in range(len(corp)):
	# A list of how each topic contributes to the document, in percent
	topic_percentages = model.describe_document(doc_idx)
	
	main_topic_percent = max(topic_percentages)
	main_topic = topic_percentages.index(main_topic_percent)
	print "Document %d consists mostly of topic %d" % (doc_idx, main_topic)

# Using the model, infer the topic distribution on a new document whose path was
# obtained at the command line. To filter this document, we just remove common
# and uncommon words (removing stop words would be good here too)
fobj = open(sys.argv[2])
new_doc = topics.TextDocument(fobj.read(), to_lower=True)
fobj.close()
new_model = topics.gibbs_lda_infer(new_doc, model, 
							filter_low=(topics.Corpus.FILTER_COUNT, 2),
							filter_high=(topics.Corpus.FILTER_COUNT, 10))
# new_model contains only our new document


# Show how each topic contributes to the new document
print new_model.describe_document(0)

Name		Name	Last commit message	Last commit date
Latest commit History 20 Commits
src		src
.gitignore		.gitignore
README.md		README.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

topics

About

Releases

Packages

Languages

networkdynamics/topics

Folders and files

Latest commit

History

Repository files navigation

topics

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages