Merge pull request #528 from kleintom/file_operator

File operator
mozilla · Apr 23, 2016 · 066907f · 066907f
2 parents dc4d454 + fbba444
commit 066907f
Show file tree

Hide file tree

Showing 12 changed files with 125 additions and 49 deletions.
diff --git a/dxr/app.py b/dxr/app.py
@@ -309,7 +309,9 @@ def _browse_folder(tree, path, config):
         filter={'folder': path},
         sort=[{'is_folder': 'desc'}, 'name'],
         size=10000,
-        exclude=['raw_data'])
+        include=['name', 'modified', 'size', 'link', 'path', 'is_binary',
+                 'is_folder'])
+
     if not files_and_folders:
         raise NotFound
 

diff --git a/dxr/build.py b/dxr/build.py
@@ -27,7 +27,7 @@
 import dxr
 from dxr.app import make_app
 from dxr.config import FORMAT
-from dxr.es import UNINDEXED_STRING, TREE, create_index_and_wait
+from dxr.es import UNINDEXED_STRING, UNANALYZED_STRING, TREE, create_index_and_wait
 from dxr.exceptions import BuildError
 from dxr.filters import LINE, FILE
 from dxr.lines import es_lines, finished_tags
@@ -98,14 +98,8 @@ def deploy_tree(tree, es, index_name):
                             'enabled': False
                         },
                         'properties': {
-                            'name': {
-                                'type': 'string',
-                                'index': 'not_analyzed'
-                            },
-                            'format': {
-                                'type': 'string',
-                                'index': 'not_analyzed'
-                            },
+                            'name': UNANALYZED_STRING,
+                            'format': UNANALYZED_STRING,
                             # In case es_alias changes in the conf file:
                             'es_alias': UNINDEXED_STRING,
                             # Needed so new trees or edited descriptions can show

diff --git a/dxr/es.py b/dxr/es.py
@@ -13,6 +13,12 @@
 }
 
 
+UNANALYZED_STRING = {
+    'type': 'string',
+    'index': 'not_analyzed',
+}
+
+
 UNINDEXED_INT = {
     'type': 'integer',
     'index': 'no',

diff --git a/dxr/format b/dxr/format
@@ -1 +1 @@
-17
+18
diff --git a/dxr/plugins/core.py b/dxr/plugins/core.py
@@ -2,15 +2,16 @@
 
 from base64 import b64encode
 from itertools import chain
-from os.path import relpath, splitext, islink, realpath
+from os.path import relpath, splitext, realpath, basename
 import re
 
 from flask import url_for
 from funcy import identity
 from jinja2 import Markup
 from parsimonious import ParseError
 
-from dxr.es import UNINDEXED_STRING, UNINDEXED_INT, UNINDEXED_LONG
+from dxr.es import (UNINDEXED_STRING, UNANALYZED_STRING, UNINDEXED_INT,
+                    UNINDEXED_LONG)
 from dxr.exceptions import BadTerm
 from dxr.filters import Filter, negatable, FILE, LINE
 import dxr.indexers
@@ -21,11 +22,11 @@
                             NoTrigrams, PythonRegexVisitor)
 from dxr.utils import glob_to_regex
 
-__all__ = ['mappings', 'analyzers', 'TextFilter', 'PathFilter', 'ExtFilter',
-           'RegexpFilter', 'IdFilter', 'RefFilter']
+__all__ = ['mappings', 'analyzers', 'TextFilter', 'PathFilter', 'FilenameFilter',
+           'ExtFilter', 'RegexpFilter', 'IdFilter', 'RefFilter']
 
 
-PATH_MAPPING = {  # path/to/a/folder/filename.cpp
+PATH_SEGMENT_MAPPING = {  # some portion of a path/to/a/folder/filename.cpp string
     'type': 'string',
     'index': 'not_analyzed',  # support JS source fetching & sorting & browse() lookups
     'fields': {
@@ -41,12 +42,6 @@
 }
 
 
-EXT_MAPPING = {
-    'type': 'string',
-    'index': 'not_analyzed'
-}
-
-
 mappings = {
     # We also insert entries here for folders. This gives us folders in dir
     # listings and the ability to find matches in folder pathnames.
@@ -56,26 +51,23 @@
         },
         'properties': {
             # FILE filters query this. It supports globbing via JS regex script.
-            'path': PATH_MAPPING,
+            'path': PATH_SEGMENT_MAPPING,  # path/to/a/folder/filename.cpp
 
-            'ext': EXT_MAPPING,
+            # Basename of path for fast lookup.
+            # FILE filters query this. It supports globbing via JS regex script.
+            'file_name': PATH_SEGMENT_MAPPING,  # filename.cpp
 
-            'link': {  # the target path if this FILE is a symlink
-                'type': 'string',
-                'index': 'not_analyzed'
-            },
+            'ext': UNANALYZED_STRING,
+
+            # the target path if this FILE is a symlink
+            'link': UNANALYZED_STRING,
 
             # Folder listings query by folder and then display filename, size,
             # and mod date.
-            'folder': {  # path/to/a/folder
-                'type': 'string',
-                'index': 'not_analyzed'
-            },
+            'folder': UNANALYZED_STRING,  # path/to/a/folder
 
-            'name': {  # filename.cpp or leaf_folder (for sorting and display)
-                'type': 'string',
-                'index': 'not_analyzed'
-            },
+            # filename.cpp or leaf_folder (for sorting and display)
+            'name': UNANALYZED_STRING,
             'size': UNINDEXED_INT,  # bytes. not present for folders.
             'modified': {  # not present for folders
                 'type': 'date',
@@ -119,8 +111,9 @@
             'enabled': False
         },
         'properties': {
-            'path': PATH_MAPPING,
-            'ext': EXT_MAPPING,
+            'path': PATH_SEGMENT_MAPPING,
+            'file_name': PATH_SEGMENT_MAPPING,
+            'ext': UNANALYZED_STRING,
             # TODO: After the query language refresh, use match_phrase_prefix
             # queries on non-globbed paths, analyzing them with the path
             # analyzer, for max perf. Perfect! Otherwise, fall back to trigram-
@@ -283,30 +276,56 @@ def highlight_content(self, result):
                            maybe_lower(self._term['arg'])))
 
 
-class PathFilter(Filter):
+class _PathSegmentFilterBase(Filter):
+    """A base class for a filter that matches a glob against a path segment."""
+    domain = FILE
+
+    def _regex_filter(self, path_seg_property_name, no_trigrams_error_text):
+        """Return an ES regex filter that matches this filter's glob against the
+        path segment at path_seg_property_name.
+
+        """
+        glob = self._term['arg']
+        try:
+            return es_regex_filter(
+                regex_grammar.parse(glob_to_regex(glob)),
+                path_seg_property_name,
+                is_case_sensitive=self._term['case_sensitive'])
+        except NoTrigrams:
+            raise BadTerm(no_trigrams_error_text)
+
+
+class PathFilter(_PathSegmentFilterBase):
     """Substring filter for paths
 
     Pre-ES parity dictates that this simply searches for paths that have the
     argument as a substring. We may allow anchoring and such later.
 
     """
     name = 'path'
-    domain = FILE
     description = Markup('File or directory sub-path to search within. <code>*'
                          '</code>, <code>?</code>, and <code>[...]</code> act '
                          'as shell wildcards.')
 
     @negatable
     def filter(self):
-        glob = self._term['arg']
-        try:
-            return es_regex_filter(
-                regex_grammar.parse(glob_to_regex(glob)),
-                'path',
-                is_case_sensitive=self._term['case_sensitive'])
-        except NoTrigrams:
-            raise BadTerm('Path globs need at least 3 literal characters in a row '
-                          'for speed.')
+        return self._regex_filter('path',
+                                  'Path globs need at least 3 literal '
+                                  'characters in a row for speed.')
+
+
+class FilenameFilter(_PathSegmentFilterBase):
+    """Substring filter for file names"""
+    name = 'file'
+    description = Markup('File to search within. <code>*</code>, '
+                         '<code>?</code>, and <code>[...]</code> act as shell '
+                         'wildcards.')
+
+    @negatable
+    def filter(self):
+        return self._regex_filter('file_name',
+                                  'File globs need at least 3 literal '
+                                  'characters in a row for speed.')
 
 
 class ExtFilter(Filter):
@@ -436,6 +455,7 @@ def needles(self):
             # realpath will keep following symlinks until it gets to the 'real' thing.
             yield 'link', relpath(realpath(self.absolute_path()), self.tree.source_folder)
         yield 'path', self.path
+        yield 'file_name', basename(self.path)
         extension = splitext(self.path)[1]
         if extension:
             yield 'ext', extension[1:]  # skip the period

diff --git a/tests/test_path_file_filters/code/fish1 b/tests/test_path_file_filters/code/fish1
diff --git a/tests/test_path_file_filters/code/fishy_folder/fish2 b/tests/test_path_file_filters/code/fishy_folder/fish2
diff --git a/tests/test_path_file_filters/code/fishy_folder/gill b/tests/test_path_file_filters/code/fishy_folder/gill
diff --git a/tests/test_path_file_filters/code/folder/fish3 b/tests/test_path_file_filters/code/folder/fish3
@@ -0,0 +1 @@
+fins
diff --git a/tests/test_path_file_filters/code/folder/fish4 b/tests/test_path_file_filters/code/folder/fish4
diff --git a/tests/test_path_file_filters/dxr.config b/tests/test_path_file_filters/dxr.config
@@ -0,0 +1,10 @@
+[DXR]
+enabled_plugins       = pygmentize
+es_index              = dxr_test_{format}_{tree}_{unique}
+es_alias              = dxr_test_{format}_{tree}
+es_catalog_index      = dxr_test_catalog
+
+[code]
+source_folder         = code
+build_command         =
+clean_command         =
diff --git a/tests/test_path_file_filters/test_path_file_filters.py b/tests/test_path_file_filters/test_path_file_filters.py
@@ -0,0 +1,43 @@
+from nose.tools import raises
+
+from dxr.testing import DxrInstanceTestCase
+
+
+class PathAndFileFilterTests(DxrInstanceTestCase):
+    """Basic tests for functionality of the 'path:' and 'file:' filters"""
+
+    def test_basic_path_results(self):
+        """Check that a 'path:' result includes both file and folder matches."""
+        self.found_files_eq('path:fish', ['fish1', 'fishy_folder/fish2',
+                                          'fishy_folder/gill', 'folder/fish3',
+                                          'folder/fish4'])
+
+    def test_basic_file_results(self):
+        """Check that 'file:' matches file basenames only, not folder names."""
+        self.found_files_eq('file:fish', ['fish1', 'fishy_folder/fish2',
+                                          'folder/fish3', 'folder/fish4'])
+
+    def test_path_and_file_line_promotion(self):
+        """Make sure promotion of a 'path:' or 'file:' filter to a LINE query
+        works.
+
+        """
+        self.found_files_eq('path:fish fins', ['folder/fish3'])
+        self.found_files_eq('file:fish fins', ['folder/fish3'])
+
+    # Expected failure: folder paths are intentionally excluded from FILE
+    # query results. Drop @raises once that changes (other tests here will
+    # need updating, and the fixture tree will need an 'empty_folder' dir).
+    @raises(AssertionError)
+    def test_empty_folder_path_results(self):
+        """Check that 'path:' results include empty folders."""
+        self.found_files_eq('path:empty_folder', ['empty_folder'])
+
+    def test_basic_wildcard(self):
+        """Test basic wildcard functionality."""
+        # 'path:' and 'file:' currently have the same underlying wildcard
+        # support, so we're spreading out the basic wildcard testing over both.
+        self.found_files_eq('path:fish?_fo*er',
+                            ['fishy_folder/fish2', 'fishy_folder/gill'])
+
+        self.found_files_eq('file:fish[14]', ['fish1', 'folder/fish4'])