Skip to content


Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

executable file 102 lines (80 sloc) 3.057 kb
# Directory prefix prepended to each input filename when writing the
# annotated copy.
TARGET_DIR = 'nlp/'

# File of whitespace-separated names that are excluded from the index.
STOPLIST = '../../tools/nltk_term_index.stoplist'

# Chapter files to scan: ch00.xml through ch12.xml.
FILENAMES = ['ch{0:02d}.xml'.format(chapter) for chapter in range(13)]
# Alternative single-file input:
#FILENAMES = ['../doc/book/ll.xml']
import re, sys
import nltk
import epydoc.docbuilder, epydoc.cli
from epydoc import log
# Console logger shared by the whole script.  Its `_verbosity` attribute is
# set directly after construction; find_all_names() temporarily lowers it
# while epydoc builds its documentation index.
logger = epydoc.cli.ConsoleLogger(0)
logger._verbosity = 5
# Attach the logger to epydoc's logging hub so that the log.info() and
# log.warning() calls made by this script have somewhere to go.
# NOTE(review): this registration was absent from the copy under review --
# confirm it matches the upstream script.
log.register_logger(logger)
def find_all_names(stoplist):
    """Build an index from dotted-name suffixes to the nltk objects they name.

    epydoc is run over the ``nltk`` package.  For every reachable value
    whose canonical name is known and starts with ``nltk``, each trailing
    slice of that name (``name[i:]``) is converted to a string and used as
    a key.  Keys that are exactly one character long, and keys listed in
    *stoplist*, are skipped.

    :param stoplist: iterable of key strings to leave out of the index.
    :return: an ``nltk.defaultdict(list)`` mapping each key string to the
        list of epydoc value docs whose canonical name ends with that key.
    """
    root = ['nltk']
    # Membership is tested once per name suffix, so do it against a set
    # rather than rescanning the caller's list every time.
    excluded = frozenset(stoplist)

    # Lower the console logger's verbosity while epydoc runs, and restore it
    # even if epydoc raises.
    logger._verbosity = 0
    try:
        docindex = epydoc.docbuilder.build_doc_index(root, add_submodules=True)
        # NOTE(review): the filter arguments of this call were lost in the
        # copy under review (only the commented-out line survived).  The
        # imports/private filters below are a reconstruction -- confirm
        # against the upstream script.
        valdocs = sorted(docindex.reachable_valdocs(
            imports=False,
            #packages=False, bases=False, submodules=False,
            private=False))
    finally:
        logger._verbosity = 5

    names = nltk.defaultdict(list)
    n = 0  # number of value docs that contributed to the index
    for valdoc in valdocs:
        name = valdoc.canonical_name
        if (name is not epydoc.apidoc.UNKNOWN and
            name is not None and name[0] == 'nltk'):
            n += 1
            # Register every suffix of the dotted name, from the full name
            # down to its last component.
            for i in range(len(name)):
                key = str(name[i:])
                if len(key) == 1:
                    continue
                if key in excluded:
                    continue
                names[key].append(valdoc)

    log.info('Found %s names from %s objects' % (len(names), n))
    return names
# Regions of the XML that are scanned for nltk names: the full text of
# <programlisting> and <literal> elements (non-greedy, spanning newlines).
# Raw strings keep the backslashes literal instead of relying on Python
# passing unknown escape sequences through unchanged.
SCAN_RE1 = r"<programlisting>[\s\S]*?</programlisting>"
SCAN_RE2 = r"<literal>[\s\S]*?</literal>"
SCAN_RE = re.compile("(%s)|(%s)" % (SCAN_RE1, SCAN_RE2))
# A candidate name: a run of word characters and dots.
TOKEN_RE = re.compile(r'[\w\.]+')
# One line of a scanned region ('.' does not match a newline).
LINE_RE = re.compile('.*')
# Markup appended for each recognised name; %s receives the matched token.
INDEXTERM = '<indexterm type="nltk"><primary>%s</primary></indexterm>'
def scan_xml(filenames, names):
    """Add nltk index terms to XML files and print a per-name usage summary.

    Each file is read, and inside every region matched by ``SCAN_RE`` each
    line gets one ``INDEXTERM`` appended for every token (``TOKEN_RE``)
    that is a key of *names*.  The result is written to
    ``TARGET_DIR + filename``.  A warning is logged for tokens that map to
    more than one object.  Finally one line is printed per indexed token,
    giving its count, the token, and the canonical name (minus its last
    component) of the first object registered for it.

    :param filenames: iterable of XML file paths to process.
    :param names: mapping from name string to a list of epydoc value docs,
        as returned by ``find_all_names``.
    """
    fdist = nltk.FreqDist()

    def linesub(match):
        """Return the matched line with index terms appended."""
        line = match.group()
        for token in TOKEN_RE.findall(line):
            if token in names:
                targets = names[token]
                # NOTE(review): the copy under review never added anything
                # to fdist, which left the summary below empty.  Counting
                # each indexed occurrence here is a reconstruction --
                # confirm against the upstream script.
                fdist[token] += 1
                if len(targets) > 1:
                    log.warning('%s is ambiguous: %s' % (
                        token, ', '.join(str(v.canonical_name)
                                         for v in targets)))
                line += INDEXTERM % token
                #line += INDEXTERM % names[token][0].canonical_name
        return line

    def scansub(match):
        """Apply linesub to every line of one scanned region."""
        return LINE_RE.sub(linesub, match.group())

    for filename in filenames:
        log.info(' %s' % filename)
        with open(filename, 'rb') as infile:
            src = infile.read()
        src = SCAN_RE.sub(scansub, src)
        # out = open(filename[:-4]+'.li.xml', 'wb')
        # Write the annotated text and close the file deterministically.
        with open(TARGET_DIR + filename, 'wb') as out:
            out.write(src)

    for word in fdist:
        namestr = ('\n'+38*' ').join([str(v.canonical_name[:-1])
                                      for v in names[word][:1]])
        # Single-argument parenthesised form works as both the Python 2
        # print statement and the Python 3 print function.
        print('[%3d] %-30s %s' % (fdist[word], word, namestr))
def main():
    """Load the stoplist, build the nltk name index, and annotate the XML.

    Reads the whitespace-separated stoplist from ``STOPLIST``, passes it to
    ``find_all_names``, then runs ``scan_xml`` over ``FILENAMES`` with the
    resulting index.  Progress is reported through epydoc's ``log``.
    """
    log.info('Loading stoplist...')
    with open(STOPLIST) as stopfile:
        stoplist = stopfile.read().split()
    log.info(' Stoplist contains %d words' % len(stoplist))

    log.info('Running epydoc to build a name index...')
    names = find_all_names(stoplist)

    log.info('Scanning xml files...')
    scan_xml(FILENAMES, names)


# Run the indexer only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Jump to Line
Something went wrong with that request. Please try again.