Permalink
Browse files

Implemented search for the website

  • Loading branch information...
mitsuhiko committed Sep 10, 2011
1 parent 37767dc commit 33c0a28cc0d339245d524d3eac67a121c48b6969
View
@@ -1,6 +1,8 @@
.DS_Store
*.pyc
*.pyo
+*.whoosh
+*.db
env
dist
_mailinglist/*
@@ -1,11 +1,8 @@
from flask import Flask, session, g, render_template
from flaskext.openid import OpenID
-import websiteconfig as config
-
app = Flask(__name__)
-app.debug = config.DEBUG
-app.secret_key = config.SECRET_KEY
+app.config.from_object('websiteconfig')
from flask_website.openid_auth import DatabaseOpenIDStore
oid = OpenID(app, store_factory=DatabaseOpenIDStore)
@@ -26,6 +23,7 @@ def load_current_user():
def remove_db_session(exception):
db_session.remove()
+
app.add_url_rule('/docs/', endpoint='docs.index', build_only=True)
app.add_url_rule('/docs/<path:page>/', endpoint='docs.show',
build_only=True)
View
@@ -1,16 +1,17 @@
from datetime import datetime
from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
- ForeignKey
+ ForeignKey, event
from sqlalchemy.orm import scoped_session, sessionmaker, backref, relation
from sqlalchemy.ext.declarative import declarative_base
from werkzeug import cached_property, http_date
-from flask import url_for
-from flask_website import config
+from flask import url_for, Markup
+from flask_website import app, search
-engine = create_engine(config.DATABASE_URI, convert_unicode=True,
- **config.DATABASE_CONNECT_OPTIONS)
+engine = create_engine(app.config['DATABASE_URI'],
+ convert_unicode=True,
+ **app.config['DATABASE_CONNECT_OPTIONS'])
db_session = scoped_session(sessionmaker(autocommit=False,
autoflush=False,
bind=engine))
@@ -38,7 +39,7 @@ def to_json(self):
@property
def is_admin(self):
- return self.openid in config.ADMINS
+ return self.openid in app.config['ADMINS']
def __eq__(self, other):
return type(self) is type(other) and self.id == other.id
@@ -69,7 +70,7 @@ def url(self):
return url_for('snippets.category', slug=self.slug)
-class Snippet(Model):
+class Snippet(Model, search.Indexable):
__tablename__ = 'snippets'
id = Column('snippet_id', Integer, primary_key=True)
author_id = Column(Integer, ForeignKey('users.user_id'))
@@ -81,6 +82,8 @@ class Snippet(Model):
author = relation(User, backref=backref('snippets', lazy='dynamic'))
category = relation(Category, backref=backref('snippets', lazy='dynamic'))
+ search_document_kind = 'snippet'
+
def __init__(self, author, title, body, category):
self.author = author
self.title = title
@@ -96,6 +99,21 @@ def to_json(self):
author=self.author.to_json(),
category=self.category.slug)
+ def get_search_document(self):
+ return dict(
+ id=unicode(self.id),
+ title=self.title,
+ keywords=[self.category.name],
+ content=self.body
+ )
+
+ @classmethod
+ def describe_search_result(cls, result):
+ obj = cls.query.get(int(result['id']))
+ if obj is not None:
+ text = obj.rendered_body.striptags()
+ return Markup(result.highlights('content', text=text)) or None
+
@property
def url(self):
return url_for('snippets.show', id=self.id)
@@ -154,3 +172,6 @@ class OpenIDUserNonce(Model):
server_url = Column(String(1024))
timestamp = Column(Integer)
salt = Column(String(40))
+
+
+event.listen(db_session, 'after_flush', search.update_model_based_indexes)
View
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+import os
+import re
+from flask import url_for, Markup
+from flask_website import app
+from flask_website.search import Indexable
+
+
_doc_body_re = re.compile(r'''(?smx)
    <title>(.*?)</title>.*?
    <div\s+class="body">(.*?)<div\s+class="sphinxsidebar">
''')


class DocumentationPage(Indexable):
    """A rendered Sphinx documentation page, loaded from the HTML
    tree on disk so it can be fed into the search index.
    """
    search_document_kind = 'documentation'

    def __init__(self, slug):
        self.slug = slug
        fn = os.path.join(app.config['DOCUMENTATION_PATH'],
                          slug, 'index.html')
        with open(fn) as f:
            contents = f.read().decode('utf-8')
        title, text = _doc_body_re.search(contents).groups()
        # Sphinx titles look like "Page Title \u2014 Project"; keep only
        # the part before the em dash.  Bug fix: the non-ASCII separator
        # characters were lost in transit -- ``split(u'')`` raises
        # ValueError -- so the em dash is restored here as an escape.
        self.title = Markup(title).striptags().split(u'\u2014')[0].strip()
        # Strip the pilcrow permalink markers Sphinx appends to every
        # heading (also restored from a lost non-ASCII character).
        self.text = Markup(text).striptags().strip().replace(u'\u00b6', u'')

    def get_search_document(self):
        """Fields stored in the whoosh index for this page."""
        return dict(
            id=unicode(self.slug),
            title=self.title,
            keywords=[],
            content=self.text
        )

    @property
    def url(self):
        """URL of the live documentation page."""
        return url_for('docs.show', page=self.slug)

    @classmethod
    def describe_search_result(cls, result):
        """Highlighted excerpt for a search hit, or ``None`` when the
        highlighter produced nothing.
        """
        rv = cls(result['id'])
        return Markup(result.highlights('content', text=rv.text)) or None

    @classmethod
    def iter_pages(cls):
        """Yield a :class:`DocumentationPage` for every rendered page
        below ``DOCUMENTATION_PATH`` (the top-level index is skipped).
        """
        base_folder = os.path.abspath(app.config['DOCUMENTATION_PATH'])
        for dirpath, dirnames, filenames in os.walk(base_folder):
            if 'index.html' in filenames:
                slug = dirpath[len(base_folder) + 1:]
                # skip the index page. useless
                if slug:
                    yield DocumentationPage(slug)
@@ -0,0 +1,83 @@
+from hashlib import md5
+from flask import Markup, url_for, json
+from werkzeug import parse_date, http_date
+from jinja2.utils import urlize
+from flask_website import app
+from flask_website.utils import split_lines_wrapping
+
+
class Mail(object):
    """One message of a mailing list thread, parsed from the JSON
    archive.

    ``d`` is a dict with the keys ``msgid``, ``author`` (a
    ``(name, address)`` pair), ``date`` (an HTTP date string),
    ``subject``, ``children`` (nested message dicts) and ``text``.
    """

    def __init__(self, d):
        self.msgid = d['msgid']
        self.author_name, self.author_addr = d['author']
        self.date = parse_date(d['date'])
        self.subject = d['subject']
        # replies are recursively wrapped as Mail objects
        self.children = [Mail(x) for x in d['children']]
        self.text = d['text']

    def rendered_text(self):
        """Return the message body as HTML markup.

        Signature lines (below the ``-- `` marker) and quoted lines
        (starting with ``>``) are wrapped in styled spans; URLs are
        linkified.
        """
        result = []
        in_sig = False
        for line in split_lines_wrapping(self.text):
            if line == u'-- ':
                in_sig = True
            # the extra space at the end is a simple workaround for
            # urlize not to consume the </span> as part of the URL
            if in_sig:
                line = Markup(u'<span class=sig>%s </span>') % line
            elif line.startswith('>'):
                line = Markup(u'<span class=quote>%s </span>') % line
            result.append(urlize(line))
        return Markup(u'\n'.join(result))

    def to_json(self):
        """Return a JSON-serializable dict without the author address."""
        rv = vars(self).copy()
        # Bug fix: the attribute set in __init__ is ``author_addr``, so
        # popping only 'author_email' was a no-op and the author's
        # address leaked into the JSON export.  Drop both spellings.
        rv.pop('author_addr', None)
        rv.pop('author_email', None)
        rv['date'] = http_date(rv['date'])
        rv['children'] = [c.to_json() for c in rv['children']]
        return rv

    @property
    def id(self):
        """Stable hex identifier derived from the message id."""
        return md5(self.msgid.encode('utf-8')).hexdigest()
+
+
class Thread(object):
    """A mailing list thread loaded from the on-disk JSON archive.

    ``d`` carries ``slug``, ``title``, ``reply_count``, ``author``
    (a ``(name, email)`` pair), ``date`` and optionally ``root``,
    the thread's first message.
    """

    def __init__(self, d):
        # only the last path component of the archive slug is kept
        self.slug = d['slug'].rsplit('/', 1)[-1]
        self.title = d['title']
        self.reply_count = d['reply_count']
        self.author_name, self.author_email = d['author']
        self.date = parse_date(d['date'])
        if 'root' in d:
            self.root = Mail(d['root'])

    @staticmethod
    def get(year, month, day, slug):
        """Load a single thread by date and slug; ``None`` if missing."""
        path = '%s/threads/%s-%02d-%02d/%s' % (
            app.config['MAILINGLIST_PATH'], year, month, day, slug)
        try:
            with open(path) as fh:
                return Thread(json.load(fh))
        except IOError:
            return None

    @staticmethod
    def get_list():
        """Load the full thread listing from the archive."""
        listing = '%s/threads/threadlist' % app.config['MAILINGLIST_PATH']
        with open(listing) as fh:
            return [Thread(entry) for entry in json.load(fh)]

    @property
    def url(self):
        """URL of the thread detail page."""
        return url_for('mailinglist.show_thread', year=self.date.year,
                       month=self.date.month, day=self.date.day,
                       slug=self.slug)

    def to_json(self):
        """Return a JSON-serializable dict of this thread."""
        payload = vars(self).copy()
        payload['date'] = http_date(payload['date'])
        if 'root' in payload:
            payload['root'] = payload['root'].to_json()
        return payload
View
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+import os
+from whoosh import highlight, analysis, qparser
+from whoosh.support.charset import accent_map
+from flask import Markup
+from flask_website import app
+from werkzeug import import_string
+
+
def open_index():
    """Open the whoosh search index configured by ``WHOOSH_INDEX``,
    creating the directory and schema on first use.
    """
    from whoosh import index, fields as f
    index_path = app.config['WHOOSH_INDEX']
    if os.path.isdir(index_path):
        return index.open_dir(index_path)
    # first run: create the directory and build the schema
    os.mkdir(index_path)
    analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map)
    schema = f.Schema(
        url=f.ID(stored=True, unique=True),
        id=f.ID(stored=True),
        title=f.TEXT(stored=True, field_boost=2.0, analyzer=analyzer),
        type=f.ID(stored=True),
        keywords=f.KEYWORD(commas=True),
        content=f.TEXT(analyzer=analyzer)
    )
    return index.create_in(index_path, schema)


# module-level singleton, opened once at import time
index = open_index()
+
+
class Indexable(object):
    """Mixin for objects that can be written into the search index.

    Subclasses set :attr:`search_document_kind` and implement
    :meth:`get_search_document`; the indexed document is keyed by the
    object's ``url``.
    """

    # short human-readable kind, overridden by subclasses
    search_document_kind = None

    @property
    def search_document_type(self):
        """Dotted import path of the concrete class, stored in the
        index so the hit can be re-imported later.
        """
        klass = type(self)
        return klass.__module__ + u'.' + klass.__name__

    def get_search_document(self):
        """Return the dict of fields to index; must be overridden."""
        raise NotImplementedError()

    def add_to_search_index(self, writer):
        """Write this object as a fresh document into the index."""
        writer.add_document(url=unicode(self.url),
                            type=self.search_document_type,
                            **self.get_search_document())

    def remove_from_search_index(self, writer):
        """Delete any previously indexed document for this URL."""
        writer.delete_by_term('url', unicode(self.url))

    @classmethod
    def describe_search_result(cls, result):
        """Hook for a nicer result description; default is none."""
        return None
+
+
def highlight_all(result, field):
    """Return *field* of *result* with every query-term hit marked up.

    Uses a whole-text fragmenter so the complete stored value comes
    back (not just an excerpt); falls back to the plain stored text
    when the highlighter produces nothing.
    """
    text = result[field]
    highlighter = highlight.Highlighter(
        fragmenter=highlight.WholeFragmenter(),
        formatter=result.results.highlighter.formatter)
    marked = highlighter.highlight_hit(result, field, text=text)
    return Markup(marked) or text
+
+
class SearchResult(object):
    """View model wrapping one whoosh hit for the templates.

    Resolves the stored dotted ``type`` back to its class to obtain
    the result kind and a highlighted description.
    """

    def __init__(self, result):
        self.url = result['url']
        self.title_text = result['title']
        self.title = highlight_all(result, 'title')
        result_cls = import_string(result['type'])
        self.kind = result_cls.search_document_kind
        self.description = result_cls.describe_search_result(result)
+
+
class SearchResultPage(object):
    """One page of search results plus pagination metadata.

    ``results`` may be ``None`` (e.g. for an empty query), in which
    case the page behaves like an empty single page.
    """

    def __init__(self, results, page):
        self.page = page
        if results is None:
            self.results, self.pages, self.total = [], 1, 0
            return
        self.results = [SearchResult(hit) for hit in results]
        self.pages = results.pagecount
        self.total = results.total

    def __iter__(self):
        return iter(self.results)
+
+
def search(query, page=1, per_page=20):
    """Run *query* against the title and content fields and return a
    :class:`SearchResultPage`.

    An out-of-range page yields an empty first page or ``None`` for
    later pages; hit excerpts are configured for short HTML snippets.
    """
    with index.searcher() as searcher:
        parser = qparser.MultifieldParser(['title', 'content'], index.schema)
        parsed = parser.parse(unicode(query))
        try:
            result_page = searcher.search_page(parsed, page, pagelen=per_page)
        except ValueError:
            # whoosh raises ValueError for an invalid page number
            return SearchResultPage(None, page) if page == 1 else None
        hits = result_page.results
        hits.highlighter.fragmenter.maxchars = 512
        hits.highlighter.fragmenter.surround = 40
        hits.highlighter.formatter = highlight.HtmlFormatter('em',
            classname='search-match', termclass='search-term',
            between=u'<span class=ellipsis> \u2026 </span>')
        return SearchResultPage(result_page, page)
+
+
def update_model_based_indexes(session, flush_context):
    """Called by a session event, updates the model based documents.

    Runs on SQLAlchemy's ``after_flush``: newly created ``Indexable``
    models are added to the search index, dirty ones are re-indexed
    (delete then add, keyed by URL), and deleted ones are removed.
    """
    to_delete = []
    to_add = []

    for model in session.new:
        if isinstance(model, Indexable):
            to_add.append(model)

    for model in session.dirty:
        if isinstance(model, Indexable):
            # re-index changed models: drop the old document, add anew
            to_delete.append(model)
            to_add.append(model)

    # Bug fix: this loop iterated ``session.dirty`` a second time,
    # which duplicated every dirty model in ``to_delete`` and -- worse
    # -- never removed deleted models from the index.  It must walk
    # ``session.deleted``.
    for model in session.deleted:
        if isinstance(model, Indexable):
            to_delete.append(model)

    if not (to_delete or to_add):
        return

    writer = index.writer()
    for model in to_delete:
        model.remove_from_search_index(writer)
    for model in to_add:
        model.add_to_search_index(writer)
    writer.commit()
+
+
def update_documentation_index():
    """Re-index every documentation page found on disk."""
    from flask_website.docs import DocumentationPage
    writer = index.writer()
    for doc_page in DocumentationPage.iter_pages():
        # delete-then-add keeps the unique url field consistent
        doc_page.remove_from_search_index(writer)
        doc_page.add_to_search_index(writer)
    writer.commit()
+
+
def reindex_snippets():
    """Rebuild the index entries for every snippet in the database."""
    from flask_website.database import Snippet
    writer = index.writer()
    for snippet in Snippet.query.all():
        # delete-then-add keeps the unique url field consistent
        snippet.remove_from_search_index(writer)
        snippet.add_to_search_index(writer)
    writer.commit()
Binary file not shown.
Oops, something went wrong.

0 comments on commit 33c0a28

Please sign in to comment.