Skip to content

Commit

Permalink
Implemented search for the website
Browse files Browse the repository at this point in the history
  • Loading branch information
mitsuhiko committed Sep 10, 2011
1 parent 37767dc commit 33c0a28
Show file tree
Hide file tree
Showing 15 changed files with 413 additions and 102 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,6 +1,8 @@
.DS_Store
*.pyc
*.pyo
*.whoosh
*.db
env
dist
_mailinglist/*
Expand Down
6 changes: 2 additions & 4 deletions flask_website/__init__.py
@@ -1,11 +1,8 @@
from flask import Flask, session, g, render_template
from flaskext.openid import OpenID

import websiteconfig as config

app = Flask(__name__)
app.debug = config.DEBUG
app.secret_key = config.SECRET_KEY
app.config.from_object('websiteconfig')

from flask_website.openid_auth import DatabaseOpenIDStore
oid = OpenID(app, store_factory=DatabaseOpenIDStore)
Expand All @@ -26,6 +23,7 @@ def load_current_user():
def remove_db_session(exception):
    """Return the scoped SQLAlchemy session to the pool after each request.

    Presumably registered as an app teardown handler via a decorator just
    above this hunk — confirm against the full file.
    """
    db_session.remove()


app.add_url_rule('/docs/', endpoint='docs.index', build_only=True)
app.add_url_rule('/docs/<path:page>/', endpoint='docs.show',
build_only=True)
Expand Down
35 changes: 28 additions & 7 deletions flask_website/database.py
@@ -1,16 +1,17 @@
from datetime import datetime
from sqlalchemy import create_engine, Column, Integer, String, DateTime, \
ForeignKey
ForeignKey, event
from sqlalchemy.orm import scoped_session, sessionmaker, backref, relation
from sqlalchemy.ext.declarative import declarative_base

from werkzeug import cached_property, http_date

from flask import url_for
from flask_website import config
from flask import url_for, Markup
from flask_website import app, search

engine = create_engine(config.DATABASE_URI, convert_unicode=True,
**config.DATABASE_CONNECT_OPTIONS)
engine = create_engine(app.config['DATABASE_URI'],
convert_unicode=True,
**app.config['DATABASE_CONNECT_OPTIONS'])
db_session = scoped_session(sessionmaker(autocommit=False,
autoflush=False,
bind=engine))
Expand Down Expand Up @@ -38,7 +39,7 @@ def to_json(self):

@property
def is_admin(self):
return self.openid in config.ADMINS
return self.openid in app.config['ADMINS']

def __eq__(self, other):
return type(self) is type(other) and self.id == other.id
Expand Down Expand Up @@ -69,7 +70,7 @@ def url(self):
return url_for('snippets.category', slug=self.slug)


class Snippet(Model):
class Snippet(Model, search.Indexable):
__tablename__ = 'snippets'
id = Column('snippet_id', Integer, primary_key=True)
author_id = Column(Integer, ForeignKey('users.user_id'))
Expand All @@ -81,6 +82,8 @@ class Snippet(Model):
author = relation(User, backref=backref('snippets', lazy='dynamic'))
category = relation(Category, backref=backref('snippets', lazy='dynamic'))

search_document_kind = 'snippet'

def __init__(self, author, title, body, category):
self.author = author
self.title = title
Expand All @@ -96,6 +99,21 @@ def to_json(self):
author=self.author.to_json(),
category=self.category.slug)

def get_search_document(self):
    """Build the whoosh document for this snippet (search.Indexable hook).

    The category name doubles as the keyword field; the raw body is what
    gets full-text indexed.
    """
    document = {
        'id': unicode(self.id),
        'title': self.title,
        'keywords': [self.category.name],
        'content': self.body,
    }
    return document

@classmethod
def describe_search_result(cls, result):
    """Render a highlighted excerpt for a whoosh hit.

    Returns None when the row no longer exists in the database or when
    whoosh could not produce any highlight for the hit.
    """
    snippet = cls.query.get(int(result['id']))
    if snippet is None:
        return None
    plain_text = snippet.rendered_body.striptags()
    excerpt = result.highlights('content', text=plain_text)
    return Markup(excerpt) or None

@property
def url(self):
    """URL of this snippet's detail page (snippets.show endpoint)."""
    return url_for('snippets.show', id=self.id)
Expand Down Expand Up @@ -154,3 +172,6 @@ class OpenIDUserNonce(Model):
server_url = Column(String(1024))
timestamp = Column(Integer)
salt = Column(String(40))


event.listen(db_session, 'after_flush', search.update_model_based_indexes)
53 changes: 53 additions & 0 deletions flask_website/docs.py
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
import os
import re
from flask import url_for, Markup
from flask_website import app
from flask_website.search import Indexable


# Extracts the <title> text and the main body of a Sphinx-generated HTML
# page (everything between <div class="body"> and the sidebar div).
# Flags are inline: (?smx) = dotall, multiline, verbose.
_doc_body_re = re.compile(r'''(?smx)
<title>(.*?)</title>.*?
<div\s+class="body">(.*?)<div\s+class="sphinxsidebar">
''')


class DocumentationPage(Indexable):
    """One page of the Sphinx-built documentation, read from disk and
    made full-text searchable via the Indexable mixin."""

    search_document_kind = 'documentation'

    def __init__(self, slug):
        self.slug = slug
        path = os.path.join(app.config['DOCUMENTATION_PATH'],
                            slug, 'index.html')
        with open(path) as fp:
            raw = fp.read().decode('utf-8')
        page_title, body = _doc_body_re.search(raw).groups()
        # keep only the part of <title> before the em-dash suffix
        self.title = Markup(page_title).striptags().split(u'—')[0].strip()
        # drop the paragraph permalink markers sphinx inserts
        self.text = Markup(body).striptags().strip().replace(u'¶', u'')

    @property
    def url(self):
        return url_for('docs.show', page=self.slug)

    def get_search_document(self):
        return dict(
            id=unicode(self.slug),
            title=self.title,
            keywords=[],
            content=self.text
        )

    @classmethod
    def describe_search_result(cls, result):
        page = cls(result['id'])
        return Markup(result.highlights('content', text=page.text)) or None

    @classmethod
    def iter_pages(cls):
        """Yield a page for every folder containing an index.html,
        skipping the root index page."""
        base_folder = os.path.abspath(app.config['DOCUMENTATION_PATH'])
        for dirpath, dirnames, filenames in os.walk(base_folder):
            if 'index.html' not in filenames:
                continue
            slug = dirpath[len(base_folder) + 1:]
            # skip the index page. useless
            if slug:
                yield DocumentationPage(slug)
83 changes: 83 additions & 0 deletions flask_website/mailinglist.py
@@ -0,0 +1,83 @@
from hashlib import md5
from flask import Markup, url_for, json
from werkzeug import parse_date, http_date
from jinja2.utils import urlize
from flask_website import app
from flask_website.utils import split_lines_wrapping


class Mail(object):
    """A single message in a mailing list thread, built from the JSON
    structure produced by the offline archive importer."""

    def __init__(self, d):
        self.msgid = d['msgid']
        self.author_name, self.author_addr = d['author']
        self.date = parse_date(d['date'])
        self.subject = d['subject']
        self.children = [Mail(x) for x in d['children']]
        self.text = d['text']

    def rendered_text(self):
        """Return the mail body as markup: quoted lines and the signature
        are wrapped in spans, URLs are linkified."""
        result = []
        in_sig = False
        for line in split_lines_wrapping(self.text):
            if line == u'-- ':
                in_sig = True
            # the extra space at the end is a simple workaround for
            # urlize not to consume the </span> as part of the URL
            if in_sig:
                line = Markup(u'<span class=sig>%s </span>') % line
            elif line.startswith('>'):
                line = Markup(u'<span class=quote>%s </span>') % line
            result.append(urlize(line))
        return Markup(u'\n'.join(result))

    def to_json(self):
        """JSON-safe dict of this mail, recursively including children.

        The author's mail address is removed so it is not exposed through
        the public JSON output.
        """
        rv = vars(self).copy()
        # Bug fix: the attribute stored in __init__ is ``author_addr``;
        # popping the non-existent ``author_email`` key was a no-op, so
        # the address silently leaked into the JSON output.
        rv.pop('author_addr', None)
        rv.pop('author_email', None)  # defensive: cover the old key too
        rv['date'] = http_date(rv['date'])
        rv['children'] = [c.to_json() for c in rv['children']]
        return rv

    @property
    def id(self):
        """Stable identifier derived from the Message-ID header."""
        return md5(self.msgid.encode('utf-8')).hexdigest()


class Thread(object):
    """A mailing list thread as serialized by the offline archive importer."""

    def __init__(self, d):
        self.slug = d['slug'].rsplit('/', 1)[-1]
        self.title = d['title']
        self.reply_count = d['reply_count']
        # NOTE(review): unlike Mail.to_json, to_json below keeps
        # author_email in its output — confirm that is intended.
        self.author_name, self.author_email = d['author']
        self.date = parse_date(d['date'])
        if 'root' in d:
            self.root = Mail(d['root'])

    @staticmethod
    def get(year, month, day, slug):
        """Load one thread from disk; returns None when it does not exist."""
        path = '%s/threads/%s-%02d-%02d/%s' % (
            app.config['MAILINGLIST_PATH'], year, month, day, slug)
        try:
            with open(path) as f:
                return Thread(json.load(f))
        except IOError:
            pass

    @staticmethod
    def get_list():
        """Load the full list of threads from the threadlist file."""
        with open('%s/threads/threadlist' % app.config['MAILINGLIST_PATH']) as f:
            return [Thread(x) for x in json.load(f)]

    @property
    def url(self):
        return url_for('mailinglist.show_thread', year=self.date.year,
                       month=self.date.month, day=self.date.day,
                       slug=self.slug)

    def to_json(self):
        """JSON-safe dict of this thread (root mail serialized recursively)."""
        rv = vars(self).copy()
        rv['date'] = http_date(rv['date'])
        if 'root' in rv:
            rv['root'] = rv['root'].to_json()
        return rv
152 changes: 152 additions & 0 deletions flask_website/search.py
@@ -0,0 +1,152 @@
# -*- coding: utf-8 -*-
import os
from whoosh import highlight, analysis, qparser
from whoosh.support.charset import accent_map
from flask import Markup
from flask_website import app
from werkzeug import import_string


def open_index():
    """Open the whoosh index directory, creating it with the search
    schema on first use."""
    from whoosh import index, fields as f
    ix_dir = app.config['WHOOSH_INDEX']
    if os.path.isdir(ix_dir):
        return index.open_dir(ix_dir)
    # first run: create the directory and a fresh schema
    os.mkdir(ix_dir)
    analyzer = analysis.StemmingAnalyzer() | analysis.CharsetFilter(accent_map)
    schema = f.Schema(
        url=f.ID(stored=True, unique=True),
        id=f.ID(stored=True),
        title=f.TEXT(stored=True, field_boost=2.0, analyzer=analyzer),
        type=f.ID(stored=True),
        keywords=f.KEYWORD(commas=True),
        content=f.TEXT(analyzer=analyzer)
    )
    return index.create_in(ix_dir, schema)


index = open_index()


class Indexable(object):
    """Mixin for objects that can live in the whoosh search index.

    Subclasses set ``search_document_kind`` and implement
    ``get_search_document``; they may override ``describe_search_result``
    to render an excerpt for a hit.
    """

    search_document_kind = None

    @property
    def search_document_type(self):
        """Dotted import path of the concrete class, stored in the index
        so hits can be routed back to the right class."""
        cls = type(self)
        return u'.'.join((cls.__module__, cls.__name__))

    def get_search_document(self):
        """Return the field dict (id/title/keywords/content) to index."""
        raise NotImplementedError()

    @classmethod
    def describe_search_result(cls, result):
        """Hook: return markup describing *result*, or None."""
        return None

    def add_to_search_index(self, writer):
        writer.add_document(url=unicode(self.url),
                            type=self.search_document_type,
                            **self.get_search_document())

    def remove_from_search_index(self, writer):
        writer.delete_by_term('url', unicode(self.url))


def highlight_all(result, field):
    """Highlight the whole stored value of *field* for a hit, falling back
    to the raw text when nothing matched."""
    text = result[field]
    highlighter = highlight.Highlighter(
        fragmenter=highlight.WholeFragmenter(),
        formatter=result.results.highlighter.formatter)
    marked_up = Markup(highlighter.highlight_hit(result, field, text=text))
    return marked_up or text


class SearchResult(object):
    """View-friendly wrapper around a single whoosh hit."""

    def __init__(self, result):
        self.url = result['url']
        self.title_text = result['title']
        self.title = highlight_all(result, 'title')
        # the stored type field is the dotted path of the Indexable subclass
        result_cls = import_string(result['type'])
        self.kind = result_cls.search_document_kind
        self.description = result_cls.describe_search_result(result)


class SearchResultPage(object):
    """One page of search results; iterates over SearchResult objects.

    ``results`` may be None to represent an empty result set (page one
    of a query with no hits).
    """

    def __init__(self, results, page):
        self.page = page
        if results is None:
            self.results, self.pages, self.total = [], 1, 0
        else:
            self.results = [SearchResult(hit) for hit in results]
            self.pages = results.pagecount
            self.total = results.total

    def __iter__(self):
        return iter(self.results)


def search(query, page=1, per_page=20):
    """Run a full-text query over title and content.

    Returns a SearchResultPage; for an out-of-range page (> 1) returns
    None instead.
    """
    with index.searcher() as searcher:
        parser = qparser.MultifieldParser(['title', 'content'], index.schema)
        parsed = parser.parse(unicode(query))
        try:
            result_page = searcher.search_page(parsed, page, pagelen=per_page)
        except ValueError:
            # whoosh raises for pages past the end; page one of an empty
            # result set is still a valid (empty) page
            return SearchResultPage(None, page) if page == 1 else None
        hits = result_page.results
        hits.highlighter.fragmenter.maxchars = 512
        hits.highlighter.fragmenter.surround = 40
        hits.highlighter.formatter = highlight.HtmlFormatter('em',
            classname='search-match', termclass='search-term',
            between=u'<span class=ellipsis> … </span>')
        return SearchResultPage(result_page, page)


def update_model_based_indexes(session, flush_context):
    """Called by a session event, updates the model based documents.

    Runs after every ORM flush: newly created Indexable models are added
    to the search index, modified ones are re-indexed (delete + add) and
    deleted ones are removed from the index.
    """
    to_delete = []
    to_add = []

    for model in session.new:
        if isinstance(model, Indexable):
            to_add.append(model)

    for model in session.dirty:
        if isinstance(model, Indexable):
            # updated row: drop the stale document, then write the new one
            to_delete.append(model)
            to_add.append(model)

    # Bug fix: this loop previously iterated session.dirty a second time,
    # so deleted models were never removed from the search index (and
    # dirty models were queued for deletion twice).
    for model in session.deleted:
        if isinstance(model, Indexable):
            to_delete.append(model)

    if not (to_delete or to_add):
        return

    writer = index.writer()
    for model in to_delete:
        model.remove_from_search_index(writer)
    for model in to_add:
        model.add_to_search_index(writer)
    writer.commit()


def update_documentation_index():
    """Re-index every documentation page.

    Deleting before adding keeps the index free of stale duplicates.
    """
    from flask_website.docs import DocumentationPage
    ix_writer = index.writer()
    for doc_page in DocumentationPage.iter_pages():
        doc_page.remove_from_search_index(ix_writer)
        doc_page.add_to_search_index(ix_writer)
    ix_writer.commit()


def reindex_snippets():
    """Re-index every snippet stored in the database.

    Deleting before adding keeps the index free of stale duplicates.
    """
    from flask_website.database import Snippet
    ix_writer = index.writer()
    for snip in Snippet.query.all():
        snip.remove_from_search_index(ix_writer)
        snip.add_to_search_index(ix_writer)
    ix_writer.commit()
Binary file added flask_website/static/search.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 33c0a28

Please sign in to comment.