Skip to content
This repository has been archived by the owner on Jan 28, 2020. It is now read-only.

Commit

Permalink
Sped up indexing using caching.
Browse files Browse the repository at this point in the history
Disabled by default; an environment variable should be
added to production to enable this.
  • Loading branch information
ShawnMilo committed Aug 4, 2015
1 parent e1dd6a1 commit 710d6bc
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 18 deletions.
23 changes: 18 additions & 5 deletions learningresources/models.py
Expand Up @@ -136,12 +136,25 @@ class LearningResource(BaseModel):
xa_histogram_grade = models.FloatField(default=0)
url_name = models.TextField(null=True)

def get_preview_url(self):
"""Create a preview URL."""
def get_preview_url(self, org=None, course_number=None, run=None):
"""
Create a preview URL. Accepts optional kwargs to prevent
database lookups, especially during search engine indexing.
Args:
org (unicode): self.course.org
run (unicode): self.course.run
course_number (unicode): self.course.course_number
"""
if org is None:
org = self.course.org
if course_number is None:
course_number = self.course.course_number
if run is None:
run = self.course.run
key = "{org}/{course}/{run}".format(
org=self.course.org,
course=self.course.course_number,
run=self.course.run,
org=org,
course=course_number,
run=run,
)

if self.url_name is not None:
Expand Down
1 change: 1 addition & 0 deletions lore/settings.py
Expand Up @@ -352,3 +352,4 @@ def get_var(name, default):
}
}
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
# The default passed to get_var is False, so search-index caching is opt-in.
# A single lookup is enough; nesting a second get_var call with the same
# name only repeats the same query.
ALLOW_CACHING = get_var("ALLOW_CACHING", False)
104 changes: 91 additions & 13 deletions search/search_indexes.py
Expand Up @@ -14,11 +14,82 @@

from __future__ import unicode_literals

from lxml import etree
from collections import defaultdict
from datetime import datetime, timedelta
import logging

from django.conf import settings
from haystack import indexes
from lxml import etree

from learningresources.models import Course, LearningResource

# Module-level cache used while preparing search index documents.
#   "course": course primary key -> dict of course metadata
#             (filled in by get_course_metadata).
#   "term":   course primary key -> learning resource id -> vocabulary id
#             -> list of term ids (filled in by get_vocabs).
#   "born":   when this cache dict was created; check_cache_age compares it
#             against MAX_INDEX_AGE. Evaluated once, at import time, here.
INDEX_CACHE = {
"course": {},
"term": {},
"born": datetime.now(),
}
# check_cache_age replaces INDEX_CACHE once it is older than this.
MAX_INDEX_AGE = timedelta(minutes=1)

log = logging.getLogger(__name__)


def check_cache_age():
    """
    Keep the cache from getting stale.

    Must not cache during tests or development, or it will
    cause confusion. Must manually enable caching in production.

    Replaces INDEX_CACHE with a fresh, empty cache when the current one
    is older than MAX_INDEX_AGE, or when settings.ALLOW_CACHING is not
    enabled (in which case every call starts from an empty cache).
    """
    # pylint: disable=global-statement
    global INDEX_CACHE
    expired = datetime.now() - INDEX_CACHE["born"] > MAX_INDEX_AGE
    # Caching is opt-in: any falsy or missing setting counts as "off",
    # not only the literal False.
    if expired or not getattr(settings, "ALLOW_CACHING", False):
        INDEX_CACHE = {
            "course": {},
            "term": {},
            "born": datetime.now(),
        }


def get_course_metadata(course_id):
    """
    Look up (and memoize) the metadata of a course.

    The cache is age-checked first; on a miss the course is loaded
    from the database and its metadata stored in INDEX_CACHE["course"].

    Args:
        course_id (int): Primary key of learningresources.models.Course
    Returns:
        data (dict): Metadata about course, with the keys "run",
            "course_number", "org" and "repo_slug".
    """
    check_cache_age()
    course_cache = INDEX_CACHE["course"]
    entry = course_cache.get(course_id, {})
    if entry:
        # Cache hit: nothing to load.
        return entry

    course = Course.objects.get(id=course_id)
    entry.update({
        "run": course.run,
        "course_number": course.course_number,
        "org": course.org,
        "repo_slug": course.repository.slug,
    })
    course_cache[course_id] = entry
    return entry


def get_vocabs(course_id, lid):
    """
    Look up (and memoize) the taxonomy terms of one learning resource.

    On the first request for a course, every resource/term relation of
    that course is read in one pass and grouped per learning resource
    and per vocabulary; later requests are answered from the cache.

    Args:
        course_id (int): Primary key of learningresources.models.Course
        lid (int): Primary key of learningresources.models.LearningResource
    Returns:
        data (dict): Maps vocabulary id to the list of term ids
            attached to the learning resource.
    """
    check_cache_age()
    term_cache = INDEX_CACHE["term"]
    if term_cache.get(course_id) is None:
        # learning resource id -> vocabulary id -> [term ids]
        by_resource = defaultdict(lambda: defaultdict(list))
        term_cache[course_id] = by_resource
        through_model = LearningResource.terms.related.through
        relations = through_model.objects.select_related("term").filter(
            learningresource__course__id=course_id)
        for relation in relations.iterator():
            per_vocab = by_resource[relation.learningresource_id]
            per_vocab[relation.term.vocabulary_id].append(relation.term_id)
    return term_cache[course_id][lid]


class LearningResourceIndex(indexes.SearchIndex, indexes.Indexable):
Expand Down Expand Up @@ -80,16 +151,23 @@ def prepare_text(self, obj): # pylint: disable=no-self-use

def prepare_run(self, obj):  # pylint: disable=no-self-use
    """
    Define what goes into the "run" index.

    Args:
        obj (LearningResource): Resource being indexed.
    Returns:
        The "run" value of the resource's course.
    """
    # Use the cached course metadata instead of following obj.course,
    # so the cached lookup is actually reached.
    data = get_course_metadata(obj.course_id)
    return data["run"]

def prepare_preview_url(self, obj):  # pylint: disable=no-self-use
    """
    Define what goes into the "preview_url" index.

    Args:
        obj (LearningResource): Resource being indexed.
    Returns:
        The value returned by obj.get_preview_url().
    """
    # Pass the cached course fields in explicitly so get_preview_url
    # does not need to read them from obj.course.
    data = get_course_metadata(obj.course_id)
    return obj.get_preview_url(
        org=data["org"],
        course_number=data["course_number"],
        run=data["run"],
    )

@staticmethod
def prepare_course(obj):
    """
    Define what goes into the "course" index.

    Args:
        obj (LearningResource): Resource being indexed.
    Returns:
        The "course_number" value of the resource's course.
    """
    # Use the cached course metadata instead of following obj.course,
    # so the cached lookup is actually reached.
    data = get_course_metadata(obj.course_id)
    return data["course_number"]

def prepare(self, obj):
"""
Expand All @@ -103,16 +181,16 @@ def prepare(self, obj):
as well, but don't because explicit is better than implicit.
"""
prepared = super(LearningResourceIndex, self).prepare(obj)
for vocab in Vocabulary.objects.all():

for vocab_id, term_ids in get_vocabs(obj.course_id, obj.id).items():
# Use the integer primary keys as index values. This saves space,
# and also avoids all issues dealing with "special" characters.
terms = set(obj.terms.filter(vocabulary_id=vocab.id).values_list(
'id', flat=True))
prepared[vocab.id] = terms
# for faceted "_exact" in URL
prepared["{0}_exact".format(vocab.id)] = terms
prepared[vocab_id] = term_ids
# For faceted "_exact" in URL.
prepared["{0}_exact".format(vocab_id)] = term_ids
return prepared

def prepare_repository(self, obj):  # pylint: disable=no-self-use
    """
    Use the slug for the repo, since it's unique.

    Args:
        obj (LearningResource): Resource being indexed.
    Returns:
        The "repo_slug" value from the cached course metadata.
    """
    # Use the cached course metadata instead of following
    # obj.course.repository, so the cached lookup is actually reached.
    data = get_course_metadata(obj.course_id)
    return data["repo_slug"]
47 changes: 47 additions & 0 deletions search/tests/test_indexing.py
@@ -1,6 +1,8 @@
"""Tests for search engine indexing."""
from __future__ import unicode_literals

from django.conf import settings

from search.sorting import LoreSortingFields
from search.tests.base import SearchTestCase

Expand Down Expand Up @@ -105,3 +107,48 @@ def test_sorting(self):
top_res.avg_grade,
res1.xa_avg_grade
)

def test_indexing_cache(self):
    """
    Test caching -- enabled and disabled.
    This tests both the course and taxonomy caches
    from within search/search_indexes.py, because
    both are "faceted" searches.
    """
    def get_count():
        """
        Get the count of a search after changing
        the course_number. This will return different
        results depending on whether caching is enabled.
        """
        # This save() is required to make the caching
        # happen if it's enabled, or delete it if it's disabled.
        # Either way, a clean slate for this test.
        self.resource.save()

        # Remember original value, and prove a resource is found.
        orig = self.course.course_number
        self.assertEqual(self.count_faceted_results("course", orig), 1)

        # Change the course number and make sure indexing
        # is called by saving the resource again.
        self.course.course_number = orig + "blah blah blah"
        self.course.save()
        self.resource.save()

        # Get the result and reset everything.
        count = self.count_faceted_results("course", orig)
        self.course.course_number = orig
        self.course.save()
        self.resource.save()
        return count

    orig_cache_setting = settings.ALLOW_CACHING
    try:
        # Without caching the renamed course is re-read on indexing,
        # so the original course number no longer matches anything.
        settings.ALLOW_CACHING = False
        self.assertEqual(get_count(), 0)

        # With caching the stale course number is still indexed.
        settings.ALLOW_CACHING = True
        self.assertEqual(get_count(), 1)
    finally:
        # Restore the setting even when an assertion fails, so a
        # failure here cannot leak into other tests.
        settings.ALLOW_CACHING = orig_cache_setting

0 comments on commit 710d6bc

Please sign in to comment.