diff --git a/learningresources/tests/base.py b/learningresources/tests/base.py
index e2347235..aaecd5a8 100644
--- a/learningresources/tests/base.py
+++ b/learningresources/tests/base.py
@@ -15,11 +15,9 @@
 from django.conf import settings
 from django.contrib.auth.models import User, Permission
 from django.core.files.storage import default_storage
-from django.core.management import call_command
 from django.test import Client
 from django.test.testcases import TestCase
 from django.test.utils import override_settings
-import haystack

 from learningresources.api import (
     create_repo,
@@ -28,7 +26,7 @@
     update_description_path
 )
 from learningresources.models import Repository, StaticAsset
-from search.utils import recreate_index, refresh_index
+from search.utils import recreate_index, refresh_index, remove_index

 log = logging.getLogger(__name__)
 # Using the md5 hasher speeds up tests.
@@ -131,11 +129,7 @@ def tearDown(self):
         """Clean up Elasticsearch and static assets between tests."""
         for static_asset in StaticAsset.objects.all():
             default_storage.delete(static_asset.asset)
-        for key, _ in haystack.connections.connections_info.items():
-            haystack.connections.reload(key)
-        call_command('clear_index', interactive=False, verbosity=0)
-        recreate_index()
-        refresh_index()
+        remove_index()

     def _make_archive(self, path, make_zip=False, ext=None):
         """
diff --git a/lore/settings.py b/lore/settings.py
index 44766f30..ee445164 100644
--- a/lore/settings.py
+++ b/lore/settings.py
@@ -95,7 +95,6 @@ def get_var(name, default):
     'taxonomy',
     'rest',
     'rest_framework',
-    'haystack',
     'search',
     'roles',
     'xanalytics',
@@ -385,15 +384,10 @@ def get_var(name, default):
 # Haystack
 HAYSTACK_CONNECTIONS = {
     'default': {
-        'ENGINE': (
-            'haystack.backends.elasticsearch_backend'
-            '.ElasticsearchSearchEngine'
-        ),
         'URL': get_var('HAYSTACK_URL', 'http://127.0.0.1:9200'),
         'INDEX_NAME': get_var('HAYSTACK_INDEX', 'haystack'),
     }
 }
-HAYSTACK_SIGNAL_PROCESSOR = 'search.signals.LoreRealTimeSignalProcessor'

 XANALYTICS_URL = get_var('XANALYTICS_URL', "")

@@ -414,3 +408,7 @@ def get_var(name, default):

 # Google analytics code
 GOOGLE_ANALYTICS_ID = get_var('LORE_GOOGLE_ANALYTICS_ID', None)
+
+# This is needed to connect the signals properly.
+# pylint: disable=unused-import
+import search.signals  # noqa
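The import at the bottom of lore/settings.py replaces HAYSTACK_SIGNAL_PROCESSOR as the mechanism that wires up indexing: Django's @receiver decorator calls Signal.connect() as a side effect of module import, so merely importing search.signals registers every handler it defines. A minimal sketch of the idiom (the app and model names are hypothetical, not from this patch):

    from django.db.models.signals import post_save
    from django.dispatch import receiver

    from myapp.models import Widget  # hypothetical app and model


    @receiver(post_save, sender=Widget)
    def on_widget_save(sender, instance, **kwargs):
        """Runs after every Widget save once this module is imported."""
        print("saved:", instance.pk)

In later Django versions the conventional home for such an import is AppConfig.ready() rather than the bottom of settings.py.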
diff --git a/requirements.txt b/requirements.txt
index 51a615ee..81c59677 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,6 @@ django-bootstrap3==6.2.2
 django-compressor==1.5
 django-debug-toolbar==1.3.2
 django-elasticsearch-debug-toolbar==0.1.15
-django-haystack==2.4.0
 djangorestframework==3.2.2
 dj-database-url==0.3.0
 dj-static==0.0.6
diff --git a/search/api.py b/search/api.py
index 4830c42b..5bf6fa91 100644
--- a/search/api.py
+++ b/search/api.py
@@ -4,8 +4,6 @@

 from __future__ import unicode_literals

-from haystack.query import SearchQuerySet
-
 from search.sorting import LoreSortingFields
 from search.utils import search_index
 from taxonomy.models import Vocabulary, Term
@@ -29,28 +27,16 @@ def construct_queryset(repo_slug, query='', selected_facets=None, sortby=''):
     if selected_facets is None:
         selected_facets = []

-    queryset = SearchQuerySet()
-
     kwargs = {}
-    for facet in selected_facets:
-        queryset = queryset.narrow(facet)
-
     if query != "":
         kwargs["content"] = query

-    queryset = queryset.filter(**kwargs)
-
-    queryset = queryset.narrow("repository_exact:{slug}".format(
-        slug=repo_slug))
-
     if sortby == "":
         sortby = LoreSortingFields.DEFAULT_SORTING_FIELD
     # default values in case of weird sorting options
     sortby, _, order_direction = LoreSortingFields.get_sorting_option(
         sortby)
     sortby = "{0}{1}".format(order_direction, sortby)
-    queryset = queryset.order_by(
-        sortby, LoreSortingFields.BASE_SORTING_FIELD)

     # Do a parallel query using elasticsearch-dsl.
     if query not in ("", None):
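With the Haystack SearchQuerySet gone, construct_queryset() leaves facet and repository narrowing to search.utils.search_index(). The facet strings arrive as "field_exact:value" pairs; the parsing happens below this hunk and is not shown, but the transformation is roughly this sketch (the helper name is made up):

    def facets_to_terms(selected_facets):
        """Map facet strings like "course_exact:8.01x" to a terms dict."""
        terms = {}
        for facet in selected_facets:
            key, _, value = facet.partition(":")
            terms[key.replace("_exact", "")] = value
        return terms

search_index() then applies each key/value pair as an Elasticsearch match query (see the search/utils.py hunks below).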
diff --git a/search/search_indexes.py b/search/search_indexes.py
index a6031aaa..f0521bc5 100644
--- a/search/search_indexes.py
+++ b/search/search_indexes.py
@@ -18,10 +18,8 @@
 import logging

 from django.core.cache import caches
-from haystack import indexes
-from lxml import etree

-from learningresources.models import Course, LearningResource, get_preview_url
+from learningresources.models import Course, LearningResource

 log = logging.getLogger(__name__)

@@ -64,120 +62,3 @@ def get_vocabs(resource_id):
     for rel in rels.iterator():
         data[rel.term.vocabulary_id].append(rel.term_id)
     return dict(data)
-
-
-class LearningResourceIndex(indexes.SearchIndex, indexes.Indexable):
-    """
-    Index configuration for the LearningResource model.
-    """
-    text = indexes.CharField(document=True)
-    resource_type = indexes.CharField(faceted=True)
-    course = indexes.CharField(faceted=True)
-    run = indexes.CharField(faceted=True)
-
-    # repository is here for filtering the repository listing page by the
-    # repo_slug in the URL. It is not used or needed in the repository listing
-    # page because that page is always for a single repository.
-    repository = indexes.CharField(faceted=True)
-
-    nr_views = indexes.IntegerField(model_attr="xa_nr_views")
-    nr_attempts = indexes.IntegerField(model_attr="xa_nr_attempts")
-    avg_grade = indexes.FloatField(model_attr="xa_avg_grade")
-
-    lid = indexes.IntegerField(model_attr="id", indexed=False)
-    title = indexes.CharField(model_attr="title", indexed=False)
-    titlesort = indexes.CharField(indexed=False)
-    description = indexes.CharField(model_attr="description", indexed=False)
-    description_path = indexes.CharField(
-        model_attr="description_path",
-        indexed=False,
-    )
-    preview_url = indexes.CharField(indexed=False)
-
-    def get_model(self):
-        """Return the model for which this configures indexing."""
-        return LearningResource
-
-    def index_queryset(self, using=None):
-        """Records to check when updating entire index."""
-        return self.get_model().objects.all()
-
-    def prepare_text(self, obj):  # pylint: disable=no-self-use
-        """Indexing of the primary content of a LearningResource."""
-        try:
-            # Strip XML tags from content before indexing.
-            tree = etree.fromstring(obj.content_xml)
-            content = etree.tostring(tree, encoding="utf-8", method="text")
-        except etree.XMLSyntaxError:
-            # For blank/invalid XML.
-            content = obj.content_xml
-        try:
-            content = content.decode('utf-8')
-        except AttributeError:
-            # For Python 3.
-            pass
-
-        return "{0} {1} {2}".format(
-            obj.title, obj.description, content
-        )
-
-    def prepare_run(self, obj):  # pylint: disable=no-self-use
-        """Define what goes into the "run" index."""
-        data = get_course_metadata(obj.course_id)
-        return data["run"]
-
-    def prepare_preview_url(self, obj):  # pylint: disable=no-self-use
-        """Define what goes into the "run" index."""
-        data = get_course_metadata(obj.course_id)
-        return get_preview_url(
-            obj,
-            org=data["org"],
-            course_number=data["course_number"],
-            run=data["run"],
-        )
-
-    def prepare_titlesort(self, obj):  # pylint: disable=no-self-use
-        """Define what goes into the "titlesort" index."""
-        title = obj.title.strip()
-        # hack to handle empty titles
-        # to show up at the bottom of the sorted list
-        if title:
-            title = '0{0}'.format(title)
-            return title.lower()
-        return '1'
-
-    @staticmethod
-    def prepare_course(obj):
-        """Define what goes into the "course" index."""
-        data = get_course_metadata(obj.course_id)
-        return data["course_number"]
-
-    @staticmethod
-    def prepare_resource_type(obj):
-        """The name of the LearningResourceType."""
-        return obj.learning_resource_type.name
-
-    def prepare(self, obj):
-        """
-        Get dynamic vocabularies.
-
-        The prepare() method runs last, similar to Django's form.clean().
-        This allows us to override anything we want. Here we add vocabularies
-        to the index because they must be dynamic.
-
-        Technically, we could handle the other stuff (run, course, etc.) here
-        as well, but don't because explicit is better than implicit.
-        """
-        prepared = super(LearningResourceIndex, self).prepare(obj)
-        for vocab_id, term_ids in get_vocabs(obj.id).items():
-            # Use the integer primary keys as index values. This saves space,
-            # and also avoids all issues dealing with "special" characters.
-            prepared[vocab_id] = term_ids
-            # For faceted "_exact" in URL.
-            prepared["{0}_exact".format(vocab_id)] = term_ids
-        return prepared
-
-    def prepare_repository(self, obj):  # pylint: disable=no-self-use
-        """Use the slug for the repo, since it's unique."""
-        data = get_course_metadata(obj.course_id)
-        return data["repo_slug"]
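The only nontrivial logic in the deleted class is prepare_text(), which flattened content_xml to plain text so markup never reached the index. Its replacement lives in the elasticsearch-dsl indexing path, which this diff does not show; it feeds the content_stripped field queried by search_index(). Reconstructed from the deleted method as a standalone helper:

    from lxml import etree


    def strip_xml(content_xml):
        """Return the text of an XML blob with the tags stripped."""
        try:
            tree = etree.fromstring(content_xml)
            content = etree.tostring(tree, encoding="utf-8", method="text")
        except etree.XMLSyntaxError:
            # Blank or invalid XML: index the raw value instead.
            content = content_xml
        if isinstance(content, bytes):
            # etree.tostring() returns bytes; the index wants text.
            content = content.decode("utf-8")
        return content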
- prepared["{0}_exact".format(vocab_id)] = term_ids - return prepared - - def prepare_repository(self, obj): # pylint: disable=no-self-use - """Use the slug for the repo, since it's unique.""" - data = get_course_metadata(obj.course_id) - return data["repo_slug"] diff --git a/search/signals.py b/search/signals.py index cacc4f09..f214e2d8 100644 --- a/search/signals.py +++ b/search/signals.py @@ -10,41 +10,22 @@ from django.db.models.signals import m2m_changed, post_save, post_delete from django.dispatch import receiver -from haystack.signals import RealtimeSignalProcessor from statsd.defaults.django import statsd log = logging.getLogger(__name__) -class LoreRealTimeSignalProcessor(RealtimeSignalProcessor): - """ - Add timers for Haystack signal processing. - """ - @statsd.timer('lore.haystack.save_signal') - def handle_save(self, sender, instance, **kwargs): - super(LoreRealTimeSignalProcessor, self).handle_save( - sender, instance, **kwargs - ) - - @statsd.timer('lore.haystack.delete_signal') - def handle_delete(self, sender, instance, **kwargs): - super(LoreRealTimeSignalProcessor, self).handle_delete( - sender, instance, **kwargs - ) - - # pylint: disable=unused-argument @statsd.timer('lore.haystack.taxonomy_update') @receiver(m2m_changed) def handle_m2m_save(sender, **kwargs): """Update index when taxonomies are updated.""" - from search.search_indexes import LearningResourceIndex, get_vocabs + from search.search_indexes import get_vocabs instance = kwargs.pop("instance") if instance.__class__.__name__ != "LearningResource": return # Update cache for the LearningResource if it's already set. get_vocabs(instance.id) - LearningResourceIndex().update_object(instance) # Update Elasticsearch index: from search.utils import index_resources index_resources([instance]) diff --git a/search/tests/test_indexing.py b/search/tests/test_indexing.py index a2aaa0cd..36964c96 100644 --- a/search/tests/test_indexing.py +++ b/search/tests/test_indexing.py @@ -205,11 +205,11 @@ def get_count(): return count set_cache_timeout(0) - with self.assertNumQueries(31): + with self.assertNumQueries(16): self.assertEqual(get_count(), 0) set_cache_timeout(60) - with self.assertNumQueries(17): + with self.assertNumQueries(14): self.assertEqual(get_count(), 1) def test_course_cache(self): diff --git a/search/utils.py b/search/utils.py index 981030a3..05ca7655 100644 --- a/search/utils.py +++ b/search/utils.py @@ -18,6 +18,7 @@ from statsd.defaults.django import statsd from learningresources.models import get_preview_url, LearningResource +from rest.serializers import RepositorySearchSerializer from search.exceptions import ReindexException from search.search_indexes import get_course_metadata from search.tasks import refresh_index as _refresh_index @@ -29,6 +30,7 @@ INDEX_NAME = settings.HAYSTACK_CONNECTIONS["default"]["INDEX_NAME"] URL = settings.HAYSTACK_CONNECTIONS["default"]["URL"] _CONN = None +_CONN_VERIFY = False PAGE_LENGTH = 10 @@ -44,10 +46,22 @@ def get_conn(verify=True): # pylint: disable=global-statement # This is ugly. Any suggestions on a way that doesn't require "global"? global _CONN + global _CONN_VERIFY + + do_verify = False if _CONN is None: _CONN = connections.create_connection(hosts=[URL]) + # Verify connection on first connect if verify=True. + do_verify = verify + + if verify and not _CONN_VERIFY: + # If we have a connection but haven't verified before, do it now. + do_verify = True - if not verify: + if not do_verify: + # We only skip verification if we're reindexing or deleting the index. 
diff --git a/search/utils.py b/search/utils.py
index 981030a3..05ca7655 100644
--- a/search/utils.py
+++ b/search/utils.py
@@ -18,6 +18,7 @@
 from statsd.defaults.django import statsd

 from learningresources.models import get_preview_url, LearningResource
+from rest.serializers import RepositorySearchSerializer
 from search.exceptions import ReindexException
 from search.search_indexes import get_course_metadata
 from search.tasks import refresh_index as _refresh_index
@@ -29,6 +30,7 @@
 INDEX_NAME = settings.HAYSTACK_CONNECTIONS["default"]["INDEX_NAME"]
 URL = settings.HAYSTACK_CONNECTIONS["default"]["URL"]
 _CONN = None
+_CONN_VERIFY = False

 PAGE_LENGTH = 10
@@ -44,10 +46,22 @@ def get_conn(verify=True):
     # pylint: disable=global-statement
     # This is ugly. Any suggestions on a way that doesn't require "global"?
     global _CONN
+    global _CONN_VERIFY
+
+    do_verify = False
     if _CONN is None:
         _CONN = connections.create_connection(hosts=[URL])
+        # Verify connection on first connect if verify=True.
+        do_verify = verify
+
+    if verify and not _CONN_VERIFY:
+        # If we have a connection but haven't verified before, do it now.
+        do_verify = True

-    if not verify:
+    if not do_verify:
+        # We only skip verification if we're reindexing or deleting the index.
+        # Make sure we verify next time we connect.
+        _CONN_VERIFY = False
         return _CONN

     # Make sure everything exists.
@@ -69,6 +83,8 @@ def get_conn(verify=True):
         raise ReindexException("Mapping {doc_type} not found".format(
             doc_type=DOC_TYPE
         ))
+
+    _CONN_VERIFY = True
     return _CONN
@@ -98,6 +114,12 @@
     return info


+def _get_field_names():
+    """Return list of search field names."""
+    return list(
+        RepositorySearchSerializer.get_fields(RepositorySearchSerializer()))
+
+
 def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
     """
     Perform a search in Elasticsearch.
@@ -112,12 +134,14 @@
     """
     if terms is None:
         terms = {}
-    if tokens is None:
-        # Retrieve everything.
-        search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)
-    else:
+
+    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)
+
+    # Limit returned fields since content_xml can be huge and is unnecessary.
+    search = search.fields(_get_field_names())
+
+    if tokens is not None:
         # Search on title, description, and content_xml (minus markup).
-        search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)
         multi = query.MultiMatch(
             query=tokens, fields=["title", "description", "content_stripped"])
         search = search.query(multi)
@@ -131,7 +155,7 @@
         search = search.query("match", repository=repo_slug)
     if sort_by is None:
         # Always sort by ID to preserve ordering.
-        search = search.sort({"id": {"ignore_unmapped": True}})
+        search = search.sort("id")
     else:
         # Temporary workaround; the values in sorting.py should be updated,
         # but for now Haystack is still using them. Also, the hyphen is
@@ -145,7 +169,7 @@
         if reverse:
             sort_by = "-{0}".format(sort_by)
         # Always sort by ID to preserve ordering.
-        search = search.sort(sort_by, {"id": {"ignore_unmapped": True}})
+        search = search.sort(sort_by, "id")

     terms = set(get_vocab_names())
     terms.update(set(('run', 'course', 'resource_type')))
@@ -202,6 +226,8 @@
         _index_resource_chunk(chunk)
         chunk = list(islice(resources, chunk_size))

+    refresh_index()
+

 @statsd.timer('lore.elasticsearch.delete_index')
 def delete_resource_from_index(resource):
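The refresh_index() call added to index_resources() above makes freshly indexed documents searchable immediately instead of after Elasticsearch's periodic refresh (one second by default), which is what lets callers query right after indexing. With elasticsearch-dsl's connection registry, such a helper boils down to roughly this (a sketch; the real helper's internals are outside this diff):

    from elasticsearch_dsl.connections import connections


    def refresh(index_name):
        """Force a refresh so new documents show up in searches."""
        connections.get_connection().indices.refresh(index=index_name)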
@@ -321,15 +347,19 @@
     def __init__(self, search):
         """Get raw search result from Elasticsearch."""
         self._search = search
+        self._cached_agg = None
+        self._cached_count = None
         get_conn()  # We don't need the return value; just for it to exist.

     def count(self):
         """Total records matching the query."""
-        return self._search.count()
+        if self._cached_count is None:
+            self._cached_count = self._search.count()
+        return self._cached_count

     def page_count(self):
         """Total number of result pages."""
-        total = self._search.count()
+        total = self.count()
         count = total / PAGE_LENGTH
         if total % PAGE_LENGTH > 0:
             count += 1
@@ -349,14 +379,26 @@ def all(self):

     def aggregations(self):
         """Return aggregations."""
-        return convert_aggregate(vars(self._search.execute().aggregations))
+        if self._cached_agg is None:
+            self._cached_agg = self._search.params(
+                search_type="count").execute()
+        return convert_aggregate(vars(self._cached_agg.aggregations))

     def __getitem__(self, i):
         """Return result by index."""
         if isinstance(i, slice):
-            return self._search[i].execute().hits
+            hits = self._search[i].execute().hits
+        else:
+            hits = self._search[slice(i, i+1)].execute().hits
+
+        for hit in hits:
+            for field_name in _get_field_names():
+                setattr(hit, field_name, getattr(hit, field_name)[0])
+
+        if isinstance(i, slice):
+            return hits
         else:
-            return self._search[i].execute().hits[0]
+            return hits[0]


 def create_mapping():
@@ -385,7 +427,7 @@ def _create_mapping(conn):
         conn.indices.delete_mapping(index=INDEX_NAME, doc_type=DOC_TYPE)

     mapping = Mapping(DOC_TYPE)
-
+    mapping.field("id", "integer", index="not_analyzed")
     mapping.field("course", "string", index="not_analyzed")
     mapping.field("description_path", "string", index="no")
     mapping.field("description", "string", index="analyzed")
@@ -509,8 +551,9 @@ def format_key(key):
     for key in updated.keys():
         updated[key]["facet"] = updated[key]["buckets"]
         del updated[key]["buckets"]
-        del updated[key]["doc_count_error_upper_bound"]
-        del updated[key]["sum_other_doc_count"]
+        for name in ("doc_count_error_upper_bound", "sum_other_doc_count"):
+            if name in updated[key]:
+                del updated[key][name]
         temp_values = updated[key]["facet"]
         updated[key]["values"] = []
         for rec in temp_values:
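Taken together, the SearchResults changes make one instance safe to consult several times per request: count() and aggregations() each hit Elasticsearch at most once, and hits arrive without the potentially huge content_xml field. A hypothetical usage sketch (the query and slug are made up):

    from search.utils import search_index

    results = search_index(tokens="thermodynamics", repo_slug="physics-1")
    total = results.count()          # runs the count once, then caches it
    page = results[0:10]             # one slice query, limited fields
    facets = results.aggregations()  # count-only query, also cached

The [0] unwrapping in __getitem__ is needed because limiting a query with .fields() makes Elasticsearch return every field as a list, even single-valued ones.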