
initial prototype. no loader code yet, but can be used to reconcile
in google refine against a single type.

also implements multi-representational entity pages.
commit 55fbe227ce944c22b6fc2e2e5941e9122b054b2a (initial commit), committed by pudo on Jul 3, 2011
Showing with 477 additions and 0 deletions.
  1. +2 −0 .gitignore
  2. +1 −0 helmut/__init__.py
  3. +33 −0 helmut/core.py
  4. +14 −0 helmut/default_settings.py
  5. +64 −0 helmut/load.py
  6. +30 −0 helmut/query.py
  7. +121 −0 helmut/schema.xml
  8. +39 −0 helmut/text.py
  9. +144 −0 helmut/web.py
  10. +3 −0 setup.cfg
  11. +26 −0 setup.py
@@ -0,0 +1,2 @@
+*.pyc
+*.egg-info
@@ -0,0 +1 @@
+#
@@ -0,0 +1,33 @@
+from flask import Flask, request
+from pymongo import Connection
+from solr import SolrConnection
+
+from helmut import default_settings
+
+MIME_TYPES = {
+    'text/html': 'html',
+    'application/xhtml+xml': 'html',
+    'application/json': 'json',
+    'text/javascript': 'json'
+    }
+
+def request_format(fmt):
+    best = request.accept_mimetypes \
+        .best_match(MIME_TYPES.keys())
+    if fmt in MIME_TYPES.values():
+        return fmt
+    return MIME_TYPES.get(best)
+
+
+app = Flask(__name__)
+app.config.from_object(default_settings)
+app.config.from_envvar('RECON_SETTINGS', silent=True)
+
+conn = Connection(app.config['MONGO_HOST'])
+db = conn[app.config['MONGO_DB']]
+entities = db[app.config['MONGO_COLLECTION']]
+
+solr_host = app.config['SOLR_HOST']
+
+def solr():
+    return SolrConnection(solr_host)
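
The request_format() helper drives content negotiation for the multi-representational entity pages: an explicit format argument wins, otherwise the request's Accept header is matched against MIME_TYPES. A minimal sketch of a view using it (hypothetical; the real views live in helmut/web.py, whose diff is not shown in this excerpt):

    from flask import request, jsonify
    from helmut.core import app, request_format

    @app.route('/<path>')
    def entity(path):
        # an explicit ?format=json overrides Accept-header negotiation
        fmt = request_format(request.args.get('format'))
        if fmt == 'json':
            return jsonify(path=path)
        return '<h1>%s</h1>' % path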
@@ -0,0 +1,14 @@
+
+
+DEBUG = True
+SECRET_KEY = 'shibboleth'
+
+TITLE = 'Helmut the Reconciliation Server'
+ENTITY_NAME = 'entity'
+
+MONGO_HOST = 'localhost'
+MONGO_DB = 'recondb'
+MONGO_COLLECTION = ENTITY_NAME
+
+SOLR_HOST = 'http://localhost:8983/solr/helmut'
+
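
core.py loads these defaults via app.config.from_object() and then overlays whatever Python file the RECON_SETTINGS environment variable points at, so a deployment can override values without touching the package. A hypothetical override file:

    # /etc/helmut/settings.py -- loaded via RECON_SETTINGS
    DEBUG = False
    SECRET_KEY = 'replace-with-a-real-secret'
    MONGO_HOST = 'mongo.internal'
    SOLR_HOST = 'http://solr.internal:8983/solr/helmut'

Run with RECON_SETTINGS=/etc/helmut/settings.py in the environment, these values shadow the defaults above.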
@@ -0,0 +1,64 @@
+from datetime import datetime
+#from urllib import quote
+from dateutil import tz
+
+from pymongo import ASCENDING
+
+from helmut.core import entities, solr
+from helmut.text import normalize
+
+def datetime_add_tz(dt):
+    return datetime(dt.year, dt.month, dt.day, dt.hour,
+                    dt.minute, dt.second, tzinfo=tz.tzutc())
+
+def save_entity(path, title, alias=(), description=None, partition=None,
+                **kwargs):
+    """ Save an entity to the database and to solr.
+
+    Each entity is uniquely described by its path, which will be the last
+    aspect of its URL.
+    """
+    entity = kwargs.copy()
+
+    assert '.' not in path, "Full stop in path is invalid: %s" % path
+    #assert quote(path)==path, "Path changes when URL quoted: %s" % path
+    entity['path'] = path
+
+    assert len(title), "Title has no length: %s" % title
+    entity['title'] = title
+    entity['alias'] = list(alias)
+
+    if description is not None:
+        entity['description'] = description
+    if partition is not None:
+        entity['_partition'] = partition
+
+    entity['updated_at'] = datetime.utcnow()
+
+    existing = entities.find_one({'path': path})
+    if existing is not None:
+        existing.update(entity)
+        entity = existing
+    else:
+        entity['created_at'] = entity['updated_at']
+    entities.update({'path': path}, entity, upsert=True)
+
+    entity['_collection'] = entities.name
+    entity['title.n'] = normalize(title)
+    entity['alias.n'] = map(normalize, alias)
+    conn = solr()
+    _entity = {}
+    for k, v in entity.items():
+        if isinstance(v, datetime):
+            v = datetime_add_tz(v)
+        _entity[str(k)] = v
+    conn.add(**_entity)
+    conn.commit()
+
+def finalize():
+    """ After loading, run a few optimization operations. """
+    entities.ensure_index([('path', ASCENDING)])
+    entities.ensure_index([('_partition', ASCENDING)])
+    conn = solr()
+    conn.optimize()
+    conn.commit()
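
There is no bulk loader yet (per the commit message), but these two helpers are enough to index records by hand. A sketch with invented data, assuming MongoDB and Solr are running with the settings above:

    from helmut.load import save_entity, finalize

    save_entity(u'douglas-adams', u'Douglas Adams',
                alias=[u'Douglas Noel Adams', u'DNA'],
                description=u'English author',
                type=u'person')  # extra kwargs land in the schema's dynamicField
    finalize()  # ensure the mongo indexes, then optimize and commit solr

Calling save_entity again with the same path updates the stored record in place instead of creating a duplicate.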
@@ -0,0 +1,30 @@
+import json
+
+from helmut.core import entities
+from helmut.text import normalize
+
+def field(k, v, boost=None):
+    v = v.replace('"', '\\"')
+    fld = '%s:"%s"' % (k, v)
+    if boost is not None:
+        fld += '^%d' % boost
+    return fld
+
+def query(solr, q, kw=(), limit=20):
+    fq = ['+' + field(k, v) for k, v in kw]
+    fq.append('_collection:%s' % entities.name)
+    nq = normalize(q)
+    _q = [
+        field('title', q, boost=10),
+        field('title.n', nq, boost=7),
+        field('alias', q, boost=8),
+        field('alias.n', nq, boost=5),
+        field('text', q, boost=2),
+        field('text', nq)
+        ]
+    _q = ' OR '.join(_q)
+    result = solr.raw_query(q=_q, fq=fq, rows=limit, wt='json',
+                            sort='score desc, title desc', fl='*,score')
+    result = json.loads(result).get('response', {})
+    return result
+
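
A sketch of querying that index (the kw pairs become exact-match Solr filter queries; assumes the records loaded above):

    from helmut.core import solr
    from helmut.query import query

    res = query(solr(), u'Douglas Ádams', kw=[('type', u'person')], limit=5)
    print res.get('numFound')
    for doc in res.get('docs', []):
        print doc['score'], doc['title']

Exact title hits carry the highest boost; the normalized title.n and alias.n variants, built with helmut.text.normalize, let accented or differently-cased spellings still match at a lower boost.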
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<schema name="helmut" version="1.2">
+
+ <types>
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+ <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
+ <fieldtype name="binary" class="solr.BinaryField"/>
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+ <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <!-- in this example, we will only use synonyms at query time
+ <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+ -->
+ <!-- Case insensitive stop word removal.
+ add enablePositionIncrements=true in both the index and query
+ analyzers to leave a 'gap' for more accurate phrase queries.
+ -->
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+ </analyzer>
+ </fieldType>
+
+
+ <!-- A general unstemmed text field - good if one does not know the language of the field -->
+ <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory"
+ ignoreCase="true"
+ words="stopwords.txt"
+ enablePositionIncrements="true"
+ />
+ <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+ </types>
+
+
+ <fields>
+ <field name="path" type="string" indexed="true" stored="true" required="true" />
+ <field name="_collection" type="string" indexed="true" stored="true" />
+ <field name="_partition" type="string" indexed="true" stored="true" />
+ <field name="title" type="string" indexed="true" stored="true" required="true" />
+ <field name="title.n" type="string" indexed="true" stored="true" required="true" />
+ <field name="alias" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="alias.n" type="string" indexed="true" stored="true" multiValued="true"/>
+ <field name="description" type="text" indexed="true" stored="true" />
+
+ <field name="created_at" type="date" indexed="true" stored="true" />
+ <field name="updated_at" type="date" indexed="true" stored="true" />
+
+ <!-- catchall field, containing all other searchable text fields (implemented
+ via copyField further on in this schema) -->
+ <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+ <field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
+ <dynamicField name="*" type="text" indexed="true" stored="true"
+ multiValued="true" />
+ </fields>
+
+ <uniqueKey>path</uniqueKey>
+ <defaultSearchField>text</defaultSearchField>
+ <solrQueryParser defaultOperator="AND"/>
+
+ <copyField source="*" dest="text"/>
+</schema>
+
+
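
Most of this schema is the stock Solr example schema; the helmut-specific parts are the path uniqueKey, the exact-match string fields with their normalized .n twins, the catch-all dynamicField, and the copyField that funnels every field into text. A document as it might land in Solr (values invented):

    <add>
      <doc>
        <field name="path">douglas-adams</field>
        <field name="_collection">entity</field>
        <field name="title">Douglas Adams</field>
        <field name="title.n">douglas adams</field>
        <field name="alias">Douglas Noel Adams</field>
        <field name="alias.n">douglas noel adams</field>
        <field name="type">person</field>
      </doc>
    </add>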
@@ -0,0 +1,39 @@
+from unicodedata import normalize as ucnorm, category
+
+def normalize(text):
+    """ Simplify a piece of text to generate a more canonical
+    representation. This involves lowercasing, stripping trailing
+    spaces, removing symbols, diacritical marks (umlauts) and
+    converting all newlines etc. to single spaces.
+    """
+    if not isinstance(text, unicode):
+        text = unicode(text)
+    text = text.lower()
+    decomposed = ucnorm('NFKD', text)
+    filtered = []
+    for char in decomposed:
+        cat = category(char)
+        if cat.startswith('C'):
+            filtered.append(' ')
+        elif cat.startswith('M'):
+            # marks, such as umlauts
+            continue
+        elif cat.startswith('Z'):
+            # newlines, non-breaking etc.
+            filtered.append(' ')
+        elif cat.startswith('S'):
+            # symbols, such as currency
+            continue
+        else:
+            filtered.append(char)
+    text = u''.join(filtered)
+    while u'  ' in text:
+        text = text.replace(u'  ', u' ')
+    text = text.strip()
+    return ucnorm('NFKC', text)
+
+def url_slug(text):
+    text = normalize(text)
+    text = text.replace(' ', '-')
+    text = text.replace('.', '_')
+    return text
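
What normalize() does in practice (Python 2, as the unicode built-in implies); the expected values below are worked out by hand from the rules above, not captured doctest output:

    >>> from helmut.text import normalize, url_slug
    >>> normalize(u' F\xfcrst  Bl\xfccher\nvon Wahlstatt ')
    u'furst blucher von wahlstatt'
    >>> url_slug(u'F\xfcrst Bl\xfccher von Wahlstatt')
    u'furst-blucher-von-wahlstatt'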