Skip to content

Commit

Permalink
Merge branch 'master' of github.com:okfn/ckan into 3016-template-tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
Sean Hammond committed Dec 6, 2012
2 parents 9fd3f97 + 26456b1 commit afbaaf5
Show file tree
Hide file tree
Showing 11 changed files with 218 additions and 26 deletions.
7 changes: 7 additions & 0 deletions ckan/config/solr/CHANGELOG.txt
@@ -1,6 +1,13 @@
CKAN SOLR schemas changelog
===========================

v2.0 - (ckan>=2.0)
--------------------
* Add _version_ field to make it compatible with solr 4.0
* Remove stopwords
* Add dataset_type field.
* Add *_date autofield.

v1.4 - (ckan>=1.7)
--------------------
* Add Ascii folding filter to text fields.
Expand Down
164 changes: 164 additions & 0 deletions ckan/config/solr/schema-2.0.xml
@@ -0,0 +1,164 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<schema name="ckan" version="2.0">

<types>
<!-- Primitive, non-analyzed field types. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!-- NOTE(review): this element is spelled "fieldtype" (lowercase t), unlike its
     siblings. Confirm the targeted Solr version accepts both spellings. -->
<fieldtype name="binary" class="solr.BinaryField"/>
<!-- Trie numeric types declared with precisionStep="0". -->
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- "t"-prefixed variants: same classes as above, but with precisionStep="8". -->
<fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
<!-- Date types: "date" uses precisionStep="0", "tdate" uses precisionStep="6". -->
<fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>

<!-- "text": analyzed, English-stemmed text type (used below by title, notes and
     the catch-all "text" field, among others). Both analyzer chains tokenize on
     whitespace, split tokens with the word delimiter filter, lowercase, apply
     the English Snowball stemmer (protwords.txt is passed as the protected
     word list) and finally apply ASCII folding. No stop word filter is
     configured in either chain. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<!-- Index side: catenateWords and catenateNumbers are enabled here. -->
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
<!-- Query side: synonyms from synonyms.txt are applied first (case-insensitive,
     expand="true"); the word delimiter filter runs with catenation disabled. -->
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
</analyzer>
</fieldType>


<!-- "textgen": a general unstemmed text type, useful when the language of the
     field is not known. Compared with the "text" type it has no stemmer and no
     ASCII folding filter, and splitOnCaseChange is "0". -->
<fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
<!-- Index side: whitespace tokens are split by the word delimiter filter with
     catenateWords and catenateNumbers enabled, then lowercased. -->
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<!-- Query side: synonyms.txt is applied before the word delimiter filter, and
     catenation is disabled. -->
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
</types>


<fields>
<!-- Required identifying fields. index_id is the schema's uniqueKey. -->
<field name="index_id" type="string" indexed="true" stored="true" required="true" />
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="site_id" type="string" indexed="true" stored="true" required="true" />
<!-- Core metadata. title and notes use the stemmed "text" type; the
     author/maintainer fields use the unstemmed "textgen" type; the rest are
     exact-match strings. -->
<field name="title" type="text" indexed="true" stored="true" />
<field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="dataset_type" type="string" indexed="true" stored="true" />
<field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="version" type="string" indexed="true" stored="true" />
<field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
<field name="notes" type="text" indexed="true" stored="true"/>
<field name="author" type="textgen" indexed="true" stored="true" />
<field name="author_email" type="textgen" indexed="true" stored="true" />
<field name="maintainer" type="textgen" indexed="true" stored="true" />
<field name="maintainer_email" type="textgen" indexed="true" stored="true" />
<field name="license" type="string" indexed="true" stored="true" />
<field name="license_id" type="string" indexed="true" stored="true" />
<!-- Rating figures are indexed but not stored. -->
<field name="ratings_count" type="int" indexed="true" stored="false" />
<field name="ratings_average" type="float" indexed="true" stored="false" />
<field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>

<field name="capacity" type="string" indexed="true" stored="true" multiValued="false"/>

<!-- Per-resource values; multiValued. -->
<field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
<field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>

<!-- Catch-all field, containing all other searchable text fields (implemented
via copyField further on in this schema). "urls" is likewise a copyField
destination. Both are indexed but not stored. -->
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>

<!-- Relationship fields and view/access counters: indexed, not stored. -->
<field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="views_total" type="int" indexed="true" stored="false"/>
<field name="views_recent" type="int" indexed="true" stored="false"/>
<field name="resources_accessed_total" type="int" indexed="true" stored="false"/>
<field name="resources_accessed_recent" type="int" indexed="true" stored="false"/>

<field name="metadata_created" type="date" indexed="true" stored="true" multiValued="false"/>
<field name="metadata_modified" type="date" indexed="true" stored="true" multiValued="false"/>

<!-- Indexing timestamp; defaults to NOW when no value is supplied. -->
<field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>

<!-- title_string holds the title as a plain string (rather than the analyzed
text type), which allows sorting on it.
NOTE(review): no copyField in this schema copies title into title_string, so
the value is presumably supplied by the indexing code. Confirm. -->
<field name="title_string" type="string" indexed="true" stored="false" />

<!-- Stored only (indexed="false"), so it can be returned but not searched. -->
<field name="data_dict" type="string" indexed="false" stored="true" />

<!-- NOTE(review): Solr's own example schema declares _version_ with a long
     type, whereas it is a string here. Confirm this is accepted by the
     targeted Solr 4.0 setup. -->
<field name="_version_" type="string" indexed="true" stored="true"/>

<!-- Any field whose name ends in "_date" is handled as a single-valued date. -->
<dynamicField name="*_date" type="date" indexed="true" stored="true" multiValued="false"/>

<!-- extras_* fields are analyzed text; vocab_* fields are multi-valued strings.
     The final "*" pattern matches every remaining field name and indexes it as
     an unstored string. -->
<dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/>
<dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*" type="string" indexed="true" stored="false"/>
</fields>

<!-- Documents are keyed on index_id. Queries that do not name a field search
     the catch-all "text" field, and query terms are combined with AND by
     default. -->
<uniqueKey>index_id</uniqueKey>
<defaultSearchField>text</defaultSearchField>
<solrQueryParser defaultOperator="AND"/>

<!-- Gather every URL-like field into "urls". -->
<copyField source="url" dest="urls"/>
<copyField source="ckan_url" dest="urls"/>
<copyField source="download_url" dest="urls"/>
<copyField source="res_url" dest="urls"/>
<!-- Populate the catch-all "text" field.
     NOTE(review): Solr documents that copyField rules are not chained, so the
     urls rule below may only copy values sent directly as "urls", not the ones
     copied into it above. Confirm.
     NOTE(review): the rule with source="text" and dest="text" copies the field
     onto itself. Confirm whether it is intentional. -->
<copyField source="extras_*" dest="text"/>
<copyField source="vocab_*" dest="text"/>
<copyField source="urls" dest="text"/>
<copyField source="name" dest="text"/>
<copyField source="title" dest="text"/>
<copyField source="text" dest="text"/>
<copyField source="license" dest="text"/>
<copyField source="notes" dest="text"/>
<copyField source="tags" dest="text"/>
<copyField source="groups" dest="text"/>
<copyField source="res_description" dest="text"/>
<copyField source="maintainer" dest="text"/>
<copyField source="author" dest="text"/>

</schema>
2 changes: 1 addition & 1 deletion ckan/controllers/home.py
Expand Up @@ -68,7 +68,7 @@ def index(self):
'res_format': _('Formats'),
'license': _('Licence'), }

data_dict = {'order_by': 'packages', 'all_fields': 1}
data_dict = {'sort': 'packages', 'all_fields': 1}
# only give the terms to group dictize that are returned in the
# facets as full results take a lot longer
if 'groups' in c.search_facets:
Expand Down
2 changes: 1 addition & 1 deletion ckan/lib/search/__init__.py
Expand Up @@ -30,7 +30,7 @@ def text_traceback():

SIMPLE_SEARCH = asbool(config.get('ckan.simple_search', False))

SUPPORTED_SCHEMA_VERSIONS = ['1.4']
SUPPORTED_SCHEMA_VERSIONS = ['2.0']

DEFAULT_OPTIONS = {
'limit': 20,
Expand Down
8 changes: 6 additions & 2 deletions ckan/lib/search/index.py
Expand Up @@ -7,6 +7,7 @@
import re

from pylons import config
from paste.deploy.converters import asbool

from common import SearchIndexError, make_connection
from ckan.model import PackageRelationship
Expand Down Expand Up @@ -223,6 +224,8 @@ def index_package(self, pkg_dict, defer_commit=False):
try:
conn = make_connection()
commit = not defer_commit
if not asbool(config.get('ckan.search.solr_commit', 'true')):
commit = False
conn.add_many([pkg_dict], _commit=commit)
except Exception, e:
log.exception(e)
Expand All @@ -236,7 +239,7 @@ def index_package(self, pkg_dict, defer_commit=False):
def commit(self):
try:
conn = make_connection()
conn.commit(wait_flush=False, wait_searcher=False)
conn.commit(wait_searcher=False)
except Exception, e:
log.exception(e)
raise SearchIndexError(e)
Expand All @@ -251,7 +254,8 @@ def delete_package(self, pkg_dict):
config.get('ckan.site_id'))
try:
conn.delete_query(query)
conn.commit()
if asbool(config.get('ckan.search.solr_commit', 'true')):
conn.commit()
except Exception, e:
log.exception(e)
raise SearchIndexError(e)
Expand Down
4 changes: 3 additions & 1 deletion ckan/lib/search/query.py
Expand Up @@ -338,7 +338,9 @@ def run(self, query):
if ':' not in query['q']:
query['defType'] = 'dismax'
query['tie'] = '0.1'
query['mm'] = '1'
# this minimum match is explained
# http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
query['mm'] = '2<-1 5<80%'
query['qf'] = query.get('qf', QUERY_FIELDS)

conn = make_connection()
Expand Down
6 changes: 3 additions & 3 deletions ckan/tests/functional/test_search.py
Expand Up @@ -62,12 +62,12 @@ def test_1_name(self):

def test_2_title(self):
# exact title, one word
res = self.app.get('/dataset?q=Opengov.se')
res = self.app.get('/dataset?q=Opengov')
result = self._check_results(res, 1, 'se-opengov')

# multiple words
res = self.app.get('/dataset?q=Government%20Expenditure')
result = self._check_results(res, 5, 'uk-government-expenditure')
result = self._check_results(res, 1, 'uk-government-expenditure')

class TestSearch2(FunctionalTestCase, PylonsTestCase):#, TestPackageForm):

Expand Down Expand Up @@ -158,7 +158,7 @@ def test_search(self):
res = self.app.get(offset)
assert 'Search - ' in res
form = res.forms['dataset-search']
form['q'] = str(self.non_active_name)
form['q'] = 'name:' + str(self.non_active_name)
results_page = form.submit()
assert 'Search - ' in results_page, results_page
assert '<strong>0</strong> datasets found' in results_page, (self.non_active_name, results_page)
29 changes: 14 additions & 15 deletions ckan/tests/lib/test_solr_package_search.py
Expand Up @@ -88,21 +88,20 @@ def test_1_name_token(self):

def test_2_title(self):
# exact title, one word
result = search.query_for(model.Package).run({'q': u'Opengov.se'})
result = search.query_for(model.Package).run({'q': u'Opengov'})

assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
# multiple words
result = search.query_for(model.Package).run({'q': u'Government Expenditure'})
# uk-government-expenditure is the best match but all other results should be returned
assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result)
# se-opengov has only government in tags, all others have it in title.
assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result)
# multiple words wrong order
result = search.query_for(model.Package).run({'q': u'Expenditure Government'})
assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result)
assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result)
# multiple words all should match government

result = search.query_for(model.Package).run({'q': u'Expenditure Government China'})
assert len(result['results']) == 5, self._pkg_names(result)
assert len(result['results']) == 1, self._pkg_names(result)

def test_3_licence(self):
# this should result, but it is here to check that at least it does not error
Expand Down Expand Up @@ -136,7 +135,7 @@ def test_tags_field_with_capitals(self):
def dont_test_tags_field_with_basic_unicode(self):
result = search.query_for(model.Package).run({'q': u'greek omega \u03a9'})
assert self._check_entity_names(result, ['se-publications']), self._pkg_names(result)

def test_tags_token_simple(self):
result = search.query_for(model.Package).run({'q': u'tags:country-sweden'})
assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
Expand All @@ -146,7 +145,7 @@ def test_tags_token_simple(self):
def test_tags_token_with_multi_word_tag(self):
result = search.query_for(model.Package).run({'q': u'tags:"todo split"'})
assert self._check_entity_names(result, ['us-gov-images']), self._pkg_names(result)

def test_tags_token_simple_with_deleted_tag(self):
# registry has been deleted
result = search.query_for(model.Package).run({'q': u'tags:registry'})
Expand Down Expand Up @@ -287,7 +286,7 @@ def test_search_notes_on(self):
pkgs = result['results']
count = result['count']
assert len(pkgs) == 2, pkgs

def test_search_foreign_chars(self):
result = search.query_for(model.Package).run({'q': 'umlaut'})
assert result['results'] == ['gils'], result['results']
Expand Down Expand Up @@ -319,8 +318,8 @@ def test_overall(self):
check_search_results('annakarenina', 1, ['annakarenina'])
check_search_results('warandpeace', 1, ['warandpeace'])
check_search_results('', 2)
check_search_results('A Novel By Tolstoy', 1, ['annakarenina'])

check_search_results('Tolstoy', 1, ['annakarenina'])
check_search_results('title:Novel', 1, ['annakarenina'])
check_search_results('title:peace', 0)
check_search_results('name:warandpeace', 1)
Expand All @@ -332,7 +331,7 @@ def test_overall(self):
check_search_results(u'Flexible \u30a1', 2)
check_search_results(u'Flexible', 2)
check_search_results(u'flexible', 2)


class TestGeographicCoverage(TestController):
@classmethod
Expand All @@ -356,7 +355,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()

def _do_search(self, q, expected_pkgs, count=None):
query = {
'q': q,
Expand Down Expand Up @@ -390,7 +389,7 @@ def test_0_basic(self):
self._do_search(u'great britain', ['gb'], 1)

def test_1_filtered(self):
# TODO: solr is not currently set up to allow partial matches
# TODO: solr is not currently set up to allow partial matches
# and extras are not saved as multivalued so this
# test will fail. Make multivalued or remove?
from ckan.tests import SkipTest
Expand Down Expand Up @@ -420,7 +419,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()

def _do_search(self, department, expected_pkgs, count=None):
result = search.query_for(model.Package).run({'q': 'department: %s' % department})
pkgs = result['results']
Expand Down Expand Up @@ -465,7 +464,7 @@ def setup_class(cls):
def teardown_class(self):
model.repo.rebuild_db()
search.clear()

def _do_search(self, q, wanted_results):
query = {
'q': q,
Expand Down
4 changes: 2 additions & 2 deletions ckan/tests/lib/test_solr_package_search_synchronous_update.py
Expand Up @@ -71,7 +71,7 @@ def _remove_package(self, name=None):

def test_02_add_package_from_dict(self):
check_search_results('', 3)
check_search_results('test-spatial', 1, ['council-owned-litter-bins'])
check_search_results('spatial', 1, ['council-owned-litter-bins'])

def test_03_update_package_from_dict(self):
package = model.Package.by_name('council-owned-litter-bins')
Expand All @@ -93,7 +93,7 @@ def test_03_update_package_from_dict(self):
model.repo.commit_and_remove()

check_search_results('', 3)
check_search_results('test-spatial', 1, ['council-owned-litter-bins'])
check_search_results('spatial', 1, ['council-owned-litter-bins'])

def test_04_delete_package_from_dict(self):
package = model.Package.by_name('council-owned-litter-bins')
Expand Down

0 comments on commit afbaaf5

Please sign in to comment.