Implementation of search index generation #60

Closed · wants to merge 4 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ htmlcov/
 mkdocs.egg-info/
 *.pyc
 .coverage
+.idea
Member commented: As with the other comment, let's keep these out, as they're really user-specific.

13 changes: 12 additions & 1 deletion mkdocs/build.py
@@ -1,6 +1,6 @@
 #coding: utf-8

-from mkdocs import nav, toc, utils
+from mkdocs import nav, toc, utils, search
 from urlparse import urljoin
 import jinja2
 import markdown
@@ -134,6 +134,7 @@ def build_pages(config):
     site_navigation = nav.SiteNavigation(config['pages'])
     loader = jinja2.FileSystemLoader(config['theme_dir'])
     env = jinja2.Environment(loader=loader)
+    search_index = search.SearchIndex()

     for page in site_navigation.walk_pages():
         # Read the input file
@@ -162,6 +163,16 @@
         output_path = os.path.join(config['site_dir'], page.output_path)
         utils.write_file(output_content.encode('utf-8'), output_path)

+        #add search entry
+        search_index.addEntryFromContext(
+            page, html_content, site_navigation,
+            table_of_contents, meta, config
+        )
+
+    #save search index to disk
+    output_content = search_index.generate_search_index()
+    output_path = os.path.join(config['site_dir'], 'search_content.json')
+    utils.write_file(output_content.encode('utf-8'), output_path)

 def build(config):
     """
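A quick note for anyone trying this branch: build_pages() now writes the index into site_dir as search_content.json, serialized from SearchIndex.pages. A minimal sketch of how it could be inspected after a build (Python 2 to match the codebase; the 'site' directory name assumes the default site_dir, and the loop is illustrative only):

import json

# Load the index that build_pages() writes into site_dir ('site' by default).
with open('site/search_content.json') as f:
    index = json.load(f)

# generate_search_index() serializes the SearchIndex object itself, so the
# top level is an object with a 'pages' list of entries: title, text, tags, loc.
for entry in index['pages']:
    print entry['loc'], '->', entry['title']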
129 changes: 129 additions & 0 deletions mkdocs/search.py
@@ -0,0 +1,129 @@
from pprint import pprint
from HTMLParser import HTMLParser
import json
import re

#dataholder for index
class SearchIndex(object):
    def __init__(self):
        self.pages = []

    #add entry based on predetermined properties
    def addEntryFromContext(self, page, content, nav, toc, meta, config):

        #create parser for analysing content
        #we parse the content since the toc doesn't have the data
        #and we need to use toc urls
        parser = ContentParser()
        parser.feed(content)

        #create entry for page
        self.createEntry(
            title=page.title,
            text=self.strip_tags(content).rstrip('\n'),
            tags="",
            loc=page.abs_url
        )

        #check all found sections against toc, match on id
Member commented: Use "# ", with a space between the hash and the text.
        for section in parser.data:
            #toc h1
            for toc_item in toc:
                #don't check sub sections if found
                if toc_item.url[1:] == section.id and len(section.text) > 0:
                    #create entry
                    self.createEntry(
                        title=toc_item.title,
                        text=" ".join(section.text),
                        tags="",
                        loc=page.abs_url[:-1] + toc_item.url
                    )
                #not fund, check h2
Member commented: typo: 'fund'
                else:
                    #toc h2
                    for toc_sub_item in toc_item.children:
                        if toc_sub_item.url[1:] == section.id and len(section.text) > 0:
                            #create entry
                            self.createEntry(
                                title=toc_sub_item.title,
                                text=" ".join(section.text),
                                tags="",
                                loc=page.abs_url[:-1] + toc_sub_item.url
                            )

    #create an index entry
    def createEntry(self, title, text, tags, loc):
        entry = SearchEntry(
            title=title,
            text=text.strip().encode('utf-8'),
            tags=tags,
            loc=loc
        )
        self.pages.append(entry)

    #python to JSON conversion
    def generate_search_index(self):
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)

    #strip html tags from data
    def strip_tags(self, html):
        s = MLStripper()
        s.feed(html)
        return s.get_data()

#data container for an index entry
class SearchEntry(object):
    def __init__(self, title, text, tags, loc):
        self.title = title
        self.text = text
        self.tags = tags
        self.loc = loc

#class for stripping html tags
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

#class for parsing html-sections
class ContentParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)

        self.data = []
        self.section = ""
        self.is_header_tag = False

    #hook - tag start
    def handle_starttag(self, tag, attrs):
        if tag in ("h1", "h2"):
            self.is_header_tag = True
            self.section = ContentSection()
            for attr in attrs:
                if attr[0] == "id":
                    self.section.id = attr[1]

    #hook - tag end
    def handle_endtag(self, tag):
        if tag in ("h1", "h2"):
            self.is_header_tag = False
            self.data.append(self.section)

    #hook - data
    def handle_data(self, data):
        if self.is_header_tag:
            self.section.title = data
        else:
            self.section.text.append(data.rstrip('\n'))

#content-holder for html-sections
class ContentSection():
    def __init__(self):
        self.text = []
        self.id = ""
        self.title = ""
Member commented: Missing a newline at the end here.