topics

A topic modeling suite. Below is an example usage: loading a corpus, learning a topic model for it, and using that model to infer the topics of a new document.

import topics # Topic modelling library
import sys
from os import listdir
from os.path import isfile, join
import numpy as np
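
# Usage (assuming this example is saved as, say, example.py, a placeholder name):
#     python example.py <corpus_directory> <new_document>
# The first argument is a directory of text files to build the corpus from; the
# second is the path of a new document whose topics will be inferred.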


# Create a list of TextDocument objects based on the files in a directory given at the command line
fpaths = [ join(sys.argv[1], f) for f in listdir(sys.argv[1]) if isfile(join(sys.argv[1], f)) ]
docs = []
for f in fpaths:
	# Create the document with the contents of the file, lowercasing it
	with open(f) as fobj:
		docs.append(topics.TextDocument(fobj.read(), to_lower=True))
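
# If the corpus files are not plain ASCII, they may need to be read with an
# explicit encoding, e.g. io.open(f, encoding="utf-8"), before being passed to
# TextDocument.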


# Create a corpus based on the list of documents, filtering out the 25 most common
# words, words that appear fewer than 3 times, and a specific set of words.
filters = {	'filter_high':	(topics.Corpus.FILTER_FREQUENCY, 25),
			'filter_low': 	(topics.Corpus.FILTER_COUNT, 3),
			'filter_set': 	set(["nevertheless", "insofar", "etc"]) }
corp = topics.Corpus(docs, **filters)


# Learn a topic model on the corpus. 10 topics are learned.
NUM_TOPICS = 10
model = topics.gibbs_lda_learn(corp, NUM_TOPICS, num_iterations=1000)


# Show the list of words that contribute to each topic
for topic in range(NUM_TOPICS):
	print(model.describe_topic(topic))

# Show the main topic for each document
for doc_idx in range(len(corp)):
	# A list of how each topic contributes to the document, in percent
	topic_percentages = model.describe_document(doc_idx)
	
	main_topic_percent = max(topic_percentages)
	main_topic = topic_percentages.index(main_topic_percent)
	print("Document %d consists mostly of topic %d" % (doc_idx, main_topic))
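
# The same main-topic lookup can also be written with np.argmax; this sketch
# assumes describe_document returns a plain list or NumPy array of percentages.
main_topics = [int(np.argmax(model.describe_document(d))) for d in range(len(corp))]
print("Main topic per document: %s" % (main_topics,))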

# Using the model, infer the topic distribution on a new document whose path was
# obtained at the command line. To filter this document, we just remove common
# and uncommon words (removing stop words would be good here too)
with open(sys.argv[2]) as fobj:
	new_doc = topics.TextDocument(fobj.read(), to_lower=True)
new_model = topics.gibbs_lda_infer(new_doc, model, 
							filter_low=(topics.Corpus.FILTER_COUNT, 2),
							filter_high=(topics.Corpus.FILTER_COUNT, 10))
# new_model contains only our new document


# Show how each topic contributes to the new document
print(new_model.describe_document(0))
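
The same calls can be combined in other ways. For example, to rank the corpus documents by how strongly a given topic contributes to them, a minimal sketch (assuming, as above, that describe_document returns a list of per-topic percentages, and using a hypothetical topic index):

TOPIC_OF_INTEREST = 3
ranking = sorted(range(len(corp)),
		key=lambda d: model.describe_document(d)[TOPIC_OF_INTEREST],
		reverse=True)
print("Documents most strongly about topic %d: %s" % (TOPIC_OF_INTEREST, ranking[:5]))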
