Merge branch 'master' of github.com:okfn/ckan into 3016-template-tweaks

ckan · Dec 6, 2012 · afbaaf5 · afbaaf5
2 parents 9fd3f97 + 26456b1
commit afbaaf5
Show file tree

Hide file tree

Showing 11 changed files with 218 additions and 26 deletions.
diff --git a/ckan/config/solr/CHANGELOG.txt b/ckan/config/solr/CHANGELOG.txt
@@ -1,6 +1,13 @@
 CKAN SOLR schemas changelog
 ===========================
 
+v2.0 - (ckan>=2.0)
+--------------------
+* Add _version_ field to make it compatible with solr 4.0
+* Remove stopwords
+* Add dataset_type field.
+* Add *_date autofield.
+
 v1.4 - (ckan>=1.7)
 --------------------
 * Add Ascii folding filter to text fields.

diff --git a/ckan/config/solr/schema-2.0.xml b/ckan/config/solr/schema-2.0.xml
@@ -0,0 +1,164 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<schema name="ckan" version="2.0">
+
+<types>
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
+    <fieldtype name="binary" class="solr.BinaryField"/>
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
+    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
+        <analyzer type="index">
+            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+            <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+            <filter class="solr.LowerCaseFilterFactory"/>
+            <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+            <filter class="solr.ASCIIFoldingFilterFactory"/>
+        </analyzer>
+        <analyzer type="query">
+            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+            <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+            <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+            <filter class="solr.LowerCaseFilterFactory"/>
+            <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+            <filter class="solr.ASCIIFoldingFilterFactory"/>
+        </analyzer>
+    </fieldType>
+
+
+    <!-- A general unstemmed text field - good if one does not know the language of the field -->
+    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
+        <analyzer type="index">
+            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+            <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
+            <filter class="solr.LowerCaseFilterFactory"/>
+        </analyzer>
+        <analyzer type="query">
+            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+            <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+            <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+            <filter class="solr.LowerCaseFilterFactory"/>
+        </analyzer>
+    </fieldType>
+</types>
+
+
+<fields>
+    <field name="index_id" type="string" indexed="true" stored="true" required="true" />
+    <field name="id" type="string" indexed="true" stored="true" required="true" />
+    <field name="site_id" type="string" indexed="true" stored="true" required="true" />
+    <field name="title" type="text" indexed="true" stored="true" />
+    <field name="entity_type" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="dataset_type" type="string" indexed="true" stored="true" />
+    <field name="state" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="name" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="revision_id" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="version" type="string" indexed="true" stored="true" />
+    <field name="url" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="ckan_url" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="download_url" type="string" indexed="true" stored="true" omitNorms="true" />
+    <field name="notes" type="text" indexed="true" stored="true"/>
+    <field name="author" type="textgen" indexed="true" stored="true" />
+    <field name="author_email" type="textgen" indexed="true" stored="true" />
+    <field name="maintainer" type="textgen" indexed="true" stored="true" />
+    <field name="maintainer_email" type="textgen" indexed="true" stored="true" />
+    <field name="license" type="string" indexed="true" stored="true" />
+    <field name="license_id" type="string" indexed="true" stored="true" />
+    <field name="ratings_count" type="int" indexed="true" stored="false" />
+    <field name="ratings_average" type="float" indexed="true" stored="false" />
+    <field name="tags" type="string" indexed="true" stored="true" multiValued="true"/>
+    <field name="groups" type="string" indexed="true" stored="true" multiValued="true"/>
+
+    <field name="capacity" type="string" indexed="true" stored="true" multiValued="false"/>
+
+    <field name="res_description" type="textgen" indexed="true" stored="true" multiValued="true"/>
+    <field name="res_format" type="string" indexed="true" stored="true" multiValued="true"/>
+    <field name="res_url" type="string" indexed="true" stored="true" multiValued="true"/>
+
+    <!-- catchall field, containing all other searchable text fields (implemented
+         via copyField further on in this schema) -->
+    <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="urls" type="text" indexed="true" stored="false" multiValued="true"/>
+
+    <field name="depends_on" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="dependency_of" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="derives_from" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="has_derivation" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="links_to" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="linked_from" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="child_of" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="parent_of" type="text" indexed="true" stored="false" multiValued="true"/>
+    <field name="views_total" type="int" indexed="true" stored="false"/>
+    <field name="views_recent" type="int" indexed="true" stored="false"/>
+    <field name="resources_accessed_total" type="int" indexed="true" stored="false"/>
+    <field name="resources_accessed_recent" type="int" indexed="true" stored="false"/>
+
+    <field name="metadata_created" type="date" indexed="true" stored="true" multiValued="false"/>
+    <field name="metadata_modified" type="date" indexed="true" stored="true" multiValued="false"/>
+
+    <field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
+
+    <!-- Copy the title field into title_string, and treat as a string
+         (rather than text type).  This allows us to sort on title_string -->
+    <field name="title_string" type="string" indexed="true" stored="false" />
+
+    <field name="data_dict" type="string" indexed="false" stored="true" />
+
+    <field name="_version_" type="string" indexed="true" stored="true"/>
+
+    <dynamicField name="*_date" type="date" indexed="true" stored="true" multiValued="false"/>
+
+    <dynamicField name="extras_*" type="text" indexed="true" stored="true" multiValued="false"/>
+    <dynamicField name="vocab_*" type="string" indexed="true" stored="true" multiValued="true"/>
+    <dynamicField name="*" type="string" indexed="true"  stored="false"/>
+</fields>
+
+<uniqueKey>index_id</uniqueKey>
+<defaultSearchField>text</defaultSearchField>
+<solrQueryParser defaultOperator="AND"/>
+
+<copyField source="url" dest="urls"/>
+<copyField source="ckan_url" dest="urls"/>
+<copyField source="download_url" dest="urls"/>
+<copyField source="res_url" dest="urls"/>
+<copyField source="extras_*" dest="text"/>
+<copyField source="vocab_*" dest="text"/>
+<copyField source="urls" dest="text"/>
+<copyField source="name" dest="text"/>
+<copyField source="title" dest="text"/>
+<copyField source="text" dest="text"/>
+<copyField source="license" dest="text"/>
+<copyField source="notes" dest="text"/>
+<copyField source="tags" dest="text"/>
+<copyField source="groups" dest="text"/>
+<copyField source="res_description" dest="text"/>
+<copyField source="maintainer" dest="text"/>
+<copyField source="author" dest="text"/>
+
+</schema>
diff --git a/ckan/controllers/home.py b/ckan/controllers/home.py
@@ -68,7 +68,7 @@ def index(self):
                           'res_format': _('Formats'),
                           'license': _('Licence'), }
 
-            data_dict = {'order_by': 'packages', 'all_fields': 1}
+            data_dict = {'sort': 'packages', 'all_fields': 1}
             # only give the terms to group dictize that are returned in the
             # facets as full results take a lot longer
             if 'groups' in c.search_facets:

diff --git a/ckan/lib/search/__init__.py b/ckan/lib/search/__init__.py
@@ -30,7 +30,7 @@ def text_traceback():
 
 SIMPLE_SEARCH = asbool(config.get('ckan.simple_search', False))
 
-SUPPORTED_SCHEMA_VERSIONS = ['1.4']
+SUPPORTED_SCHEMA_VERSIONS = ['2.0']
 
 DEFAULT_OPTIONS = {
     'limit': 20,

diff --git a/ckan/lib/search/index.py b/ckan/lib/search/index.py
@@ -7,6 +7,7 @@
 import re
 
 from pylons import config
+from paste.deploy.converters import asbool
 
 from common import SearchIndexError, make_connection
 from ckan.model import PackageRelationship
@@ -223,6 +224,8 @@ def index_package(self, pkg_dict, defer_commit=False):
         try:
             conn = make_connection()
             commit = not defer_commit
+            if not asbool(config.get('ckan.search.solr_commit', 'true')):
+                commit = False
             conn.add_many([pkg_dict], _commit=commit)
         except Exception, e:
             log.exception(e)
@@ -236,7 +239,7 @@ def index_package(self, pkg_dict, defer_commit=False):
     def commit(self):
         try:
             conn = make_connection()
-            conn.commit(wait_flush=False, wait_searcher=False)
+            conn.commit(wait_searcher=False)
         except Exception, e:
             log.exception(e)
             raise SearchIndexError(e)
@@ -251,7 +254,8 @@ def delete_package(self, pkg_dict):
                                                        config.get('ckan.site_id'))
         try:
             conn.delete_query(query)
-            conn.commit()
+            if asbool(config.get('ckan.search.solr_commit', 'true')):
+                conn.commit()
         except Exception, e:
             log.exception(e)
             raise SearchIndexError(e)

diff --git a/ckan/lib/search/query.py b/ckan/lib/search/query.py
@@ -338,7 +338,9 @@ def run(self, query):
         if ':' not in query['q']:
             query['defType'] = 'dismax'
             query['tie'] = '0.1'
-            query['mm'] = '1'
+            # the minimum-should-match ('mm') expression below is explained at:
+            # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29
+            query['mm'] = '2<-1 5<80%'
             query['qf'] = query.get('qf', QUERY_FIELDS)
 
         conn = make_connection()

diff --git a/ckan/tests/functional/test_search.py b/ckan/tests/functional/test_search.py
@@ -62,12 +62,12 @@ def test_1_name(self):
 
     def test_2_title(self):
         # exact title, one word
-        res = self.app.get('/dataset?q=Opengov.se')
+        res = self.app.get('/dataset?q=Opengov')
         result = self._check_results(res, 1, 'se-opengov')
 
         # multiple words
         res = self.app.get('/dataset?q=Government%20Expenditure')
-        result = self._check_results(res, 5, 'uk-government-expenditure')
+        result = self._check_results(res, 1, 'uk-government-expenditure')
 
 class TestSearch2(FunctionalTestCase, PylonsTestCase):#, TestPackageForm):
 
@@ -158,7 +158,7 @@ def test_search(self):
         res = self.app.get(offset)
         assert 'Search - ' in res
         form = res.forms['dataset-search']
-        form['q'] =  str(self.non_active_name)
+        form['q'] =  'name:' + str(self.non_active_name)
         results_page = form.submit()
         assert 'Search - ' in results_page, results_page
         assert '<strong>0</strong> datasets found' in results_page, (self.non_active_name, results_page)
diff --git a/ckan/tests/lib/test_solr_package_search.py b/ckan/tests/lib/test_solr_package_search.py
@@ -88,21 +88,20 @@ def test_1_name_token(self):
 
     def test_2_title(self):
         # exact title, one word
-        result = search.query_for(model.Package).run({'q': u'Opengov.se'})
+        result = search.query_for(model.Package).run({'q': u'Opengov'})
+
         assert self._pkg_names(result) == 'se-opengov', self._pkg_names(result)
         # multiple words
         result = search.query_for(model.Package).run({'q': u'Government Expenditure'})
         # uk-government-expenditure is the best match but all other results should be returned
         assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result)
-        # se-opengov has only government in tags, all others hav it in title.
-        assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result)
         # multiple words wrong order
         result = search.query_for(model.Package).run({'q': u'Expenditure Government'})
         assert self._pkg_names(result).startswith('uk-government-expenditure'), self._pkg_names(result)
-        assert self._pkg_names(result).endswith('se-opengov'), self._pkg_names(result)
         # multiple words all should match government
+
         result = search.query_for(model.Package).run({'q': u'Expenditure Government China'})
-        assert len(result['results']) == 5, self._pkg_names(result)
+        assert len(result['results']) == 1, self._pkg_names(result)
 
     def test_3_licence(self):
         # this should result, but it is here to check that at least it does not error
@@ -136,7 +135,7 @@ def test_tags_field_with_capitals(self):
     def dont_test_tags_field_with_basic_unicode(self):
         result = search.query_for(model.Package).run({'q': u'greek omega \u03a9'})
         assert self._check_entity_names(result, ['se-publications']), self._pkg_names(result)
-        
+
     def test_tags_token_simple(self):
         result = search.query_for(model.Package).run({'q': u'tags:country-sweden'})
         assert self._check_entity_names(result, ['se-publications', 'se-opengov']), self._pkg_names(result)
@@ -146,7 +145,7 @@ def test_tags_token_simple(self):
     def test_tags_token_with_multi_word_tag(self):
         result = search.query_for(model.Package).run({'q': u'tags:"todo split"'})
         assert self._check_entity_names(result, ['us-gov-images']), self._pkg_names(result)
-    
+
     def test_tags_token_simple_with_deleted_tag(self):
         # registry has been deleted
         result = search.query_for(model.Package).run({'q': u'tags:registry'})
@@ -287,7 +286,7 @@ def test_search_notes_on(self):
         pkgs = result['results']
         count = result['count']
         assert len(pkgs) == 2, pkgs
-        
+
     def test_search_foreign_chars(self):
         result = search.query_for(model.Package).run({'q': 'umlaut'})
         assert result['results'] == ['gils'], result['results']
@@ -319,8 +318,8 @@ def test_overall(self):
         check_search_results('annakarenina', 1, ['annakarenina'])
         check_search_results('warandpeace', 1, ['warandpeace'])
         check_search_results('', 2)
-        
-        check_search_results('A Novel By Tolstoy', 1, ['annakarenina'])
+
+        check_search_results('Tolstoy', 1, ['annakarenina'])
         check_search_results('title:Novel', 1, ['annakarenina'])
         check_search_results('title:peace', 0)
         check_search_results('name:warandpeace', 1)
@@ -332,7 +331,7 @@ def test_overall(self):
         check_search_results(u'Flexible \u30a1', 2)
         check_search_results(u'Flexible', 2)
         check_search_results(u'flexible', 2)
-        
+
 
 class TestGeographicCoverage(TestController):
     @classmethod
@@ -356,7 +355,7 @@ def setup_class(cls):
     def teardown_class(self):
         model.repo.rebuild_db()
         search.clear()
-    
+
     def _do_search(self, q, expected_pkgs, count=None):
         query = {
             'q': q,
@@ -390,7 +389,7 @@ def test_0_basic(self):
         self._do_search(u'great britain', ['gb'], 1)
 
     def test_1_filtered(self):
-        # TODO: solr is not currently set up to allow partial matches 
+        # TODO: solr is not currently set up to allow partial matches
         #       and extras are not saved as multivalued so this
         #       test will fail. Make multivalued or remove?
         from ckan.tests import SkipTest
@@ -420,7 +419,7 @@ def setup_class(cls):
     def teardown_class(self):
         model.repo.rebuild_db()
         search.clear()
-    
+
     def _do_search(self, department, expected_pkgs, count=None):
         result = search.query_for(model.Package).run({'q': 'department: %s' % department})
         pkgs = result['results']
@@ -465,7 +464,7 @@ def setup_class(cls):
     def teardown_class(self):
         model.repo.rebuild_db()
         search.clear()
-    
+
     def _do_search(self, q, wanted_results):
         query = {
             'q': q,

diff --git a/ckan/tests/lib/test_solr_package_search_synchronous_update.py b/ckan/tests/lib/test_solr_package_search_synchronous_update.py
@@ -71,7 +71,7 @@ def _remove_package(self, name=None):
 
     def test_02_add_package_from_dict(self):
         check_search_results('', 3)
-        check_search_results('test-spatial', 1, ['council-owned-litter-bins'])
+        check_search_results('spatial', 1, ['council-owned-litter-bins'])
 
     def test_03_update_package_from_dict(self):
         package = model.Package.by_name('council-owned-litter-bins')
@@ -93,7 +93,7 @@ def test_03_update_package_from_dict(self):
         model.repo.commit_and_remove()
 
         check_search_results('', 3)
-        check_search_results('test-spatial', 1, ['council-owned-litter-bins'])
+        check_search_results('spatial', 1, ['council-owned-litter-bins'])
 
     def test_04_delete_package_from_dict(self):
         package = model.Package.by_name('council-owned-litter-bins')