
Commit 80dd1c9
Fix bug 850453, the repeated-results thing.
It was faster (and yielded more readable code) to rewrite the highlighting stuff than to fix what was there.
erikrose committed Mar 14, 2013
1 parent 9135bd9 commit 80dd1c9
Showing 2 changed files with 89 additions and 66 deletions.
133 changes: 71 additions & 62 deletions dxr/query.py
@@ -1,4 +1,4 @@
-import itertools
+from itertools import groupby
import utils, cgi, codecs, struct
import time

@@ -37,7 +37,7 @@
# Pattern for recognizing if a word will be tokenized as a single term.
# Ideally we should reuse our custom sqlite tokenizer, but that would just
# complicate things. Anyway, if it's not an identifier, it must be a single
# token, in which case we'll wrap it anyway :)
_single_term = re.compile("^[a-zA-Z]+[a-zA-Z0-9]*$")
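
A few quick probes of what this pattern accepts, sketched against the regex above (hypothetical checks, not part of the commit):

    import re
    _single_term = re.compile("^[a-zA-Z]+[a-zA-Z0-9]*$")
    assert _single_term.match("main")        # a plain word is one term
    assert _single_term.match("argv2")       # digits are fine after the first letter
    assert not _single_term.match("my_var")  # underscores fall outside the pattern
    assert not _single_term.match("2fast")   # must start with a letter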

class Query:
@@ -155,13 +155,6 @@ def results(self,
        #utils.log(sql)
        #utils.log(arguments)

-        # Make a simple decoder for decoding unicode
-        # Note that we need to operate in ascii in order to handle
-        # compiler offsets
-        decoder = codecs.getdecoder("utf-8")
-        def d(string):
-            return decoder(string, errors="replace")[0]

        cursor = self.execute_sql(sql, arguments)

        for path, icon, content, fileid, extents in cursor:
@@ -182,59 +175,8 @@ def d(string):
            if self._should_explain:
                continue

-            lines = []
-            line_number = 1
-            last_pos = 0
-
-            for i in xrange(0, len(offsets)):
-                # TODO keylist should in fact have information about which
-                # extent of the search query caused this hit; we should
-                # highlight this extent. (Note: the Query object still doesn't
-                # provide support for offering this extent, and this needs to
-                # be supported and used in filters.)
-                estart, eend, keylist = offsets[i]
-
-                # Count the newlines from the top of the file to get the line
-                # number. Maybe we could optimize this by storing the line
-                # number in the index with the extent.
-                line_diff = content.count("\n", last_pos, estart)
-                # Skip if we didn't get a new line
-                if line_diff == 0 and last_pos > 0:
-                    continue
-                line_number += line_diff
-                last_pos = estart
-
-                # Find the newlines before and after the offset
-                end = content.find("\n", estart)
-                if end == -1:
-                    end = len(content)
-                start = content.rfind("\n", 0, end) + 1
-                src_line = content[start:end]
-
-                # Build the line
-                out_line = ""
-                mend = 0  # Invariant: offset where the last write ended
-
-                # Add some markup to highlight hits
-                while content.count("\n", last_pos, estart) == 0:
-                    last_end = mend
-                    mstart = estart - start
-                    mend = eend - start
-                    # Output the line segment from last_end to the markup start
-                    out_line += cgi.escape(d(src_line[last_end:mstart]))
-                    # Output the markup and the highlighted segment
-                    out_line += markup + cgi.escape(d(src_line[mstart:mend])) + markdown
-                    i += 1
-                    if i >= len(offsets):
-                        break
-                    estart, eend, keylist = offsets[i]
-
-                # Output the rest of the line once there are no more offsets.
-                # Note that the while loop always runs at least once.
-                out_line += cgi.escape(d(src_line[mend:]))
-
-                lines.append((line_number, out_line))
-            # Return the result
-            yield icon, path, lines
+            yield icon, path, _highlit_lines(content, offsets, markup, markdown)

def number_lines(arr):
    ret = []
@@ -349,6 +291,73 @@ def direct_result(self):
        return None


+def _highlit_line(content, offsets, markup, markdown):
+    """Return a line of string ``content`` with the given ``offsets`` prefixed
+    by ``markup`` and suffixed by ``markdown``.
+
+    We assume that none of the offsets split a Unicode code point. This
+    assumption lets us run one big ``decode`` at the end.
+
+    """
+    def chunks():
+        try:
+            # Start on the line the highlights are on:
+            chars_before = content.rindex('\n', 0, offsets[0][0]) + 1
+        except ValueError:
+            chars_before = None
+        for start, end in offsets:
+            # We can do the escapes before decoding, because all escaped chars
+            # are the same in ASCII and utf-8:
+            yield cgi.escape(content[chars_before:start])
+            yield markup
+            yield cgi.escape(content[start:end])
+            yield markdown
+            chars_before = end
+        # Make sure to get the rest of the line after the last highlight:
+        try:
+            next_newline = content.index('\n', chars_before)
+        except ValueError:  # eof
+            next_newline = None
+        yield cgi.escape(content[chars_before:next_newline])
+    ret = ''.join(chunks())
+    return ret.decode('utf-8', errors='replace')
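
A minimal sketch of calling this helper, assuming it is imported from dxr.query; the expected output follows from the code above (Python 2, like the rest of the file):

    from dxr.query import _highlit_line

    line = 'int main(int argc, char* argv[]) {\n'
    # Byte offsets of the two "int" tokens on the line:
    print _highlit_line(line, [(0, 3), (9, 12)], '<b>', '</b>')
    # -> <b>int</b> main(<b>int</b> argc, char* argv[]) {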


+def _highlit_lines(content, offsets, markup, markdown):
+    """Return a list of (line number, highlit line) tuples.
+
+    :arg content: The contents of the file against which the offsets are
+        reported, as a bytestring. (We need to operate in terms of
+        bytestrings, because those are the terms in which the C compiler
+        gives us offsets.)
+    :arg offsets: A sequence of non-overlapping (start offset, end offset,
+        [keylist (presently unused)]) tuples describing each extent to
+        highlight. The sequence must be in order by start offset.
+
+    Assumes no newlines are highlit.
+
+    """
+    line_extents = []  # [(line_number, (start, end)), ...]
+    lines_before = 1
+    chars_before = 0
+    for start, end, _ in offsets:
+        # How many lines we've skipped since we last knew what line we were on:
+        lines_since = content.count('\n', chars_before, start)
+
+        # Figure out what line we're on, and throw this extent into its bucket:
+        line = lines_before + lines_since
+        line_extents.append((line, (start, end)))
+
+        lines_before = line
+        chars_before = end
+
+    # Bucket highlit ranges by line, and build up the marked-up strings:
+    return [(line, _highlit_line(content,
+                                 [extent for line, extent in lines_and_extents],
+                                 markup,
+                                 markdown)) for
+            line, lines_and_extents in groupby(line_extents, lambda (l, e): l)]
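
The bucketing relies on itertools.groupby seeing equal line numbers consecutively, which holds because the offsets arrive sorted by start offset. A small illustration with made-up extents (Python 2, matching the tuple-unpacking lambda above):

    from itertools import groupby

    line_extents = [(1, (0, 3)), (1, (9, 12)), (3, (40, 44))]
    buckets = [(line, [extent for _, extent in group])
               for line, group in groupby(line_extents, lambda (l, e): l)]
    # -> [(1, [(0, 3), (9, 12)]), (3, [(40, 44)])]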


def like_escape(val):
    """Escape a value for use as an argument to the LIKE operator."""
    return (val.replace("\\", "\\\\")
@@ -590,7 +599,7 @@ def builder():
        for hit in hits:
            yield hit
    def sorter():
-        for hits in itertools.groupby(sorted(builder())):
+        for hits in groupby(sorted(builder())):
            yield hits[0]
    yield sorter()
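
Since groupby yields (key, group) pairs, taking hits[0] from each pair returns every distinct hit exactly once; the sort is what makes duplicates adjacent. A tiny demonstration of the idiom, with made-up hits:

    from itertools import groupby

    hits = ['a.c', 'b.c', 'a.c']
    deduped = [key for key, _ in groupby(sorted(hits))]
    # -> ['a.c', 'b.c']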

22 changes: 18 additions & 4 deletions tests/test_strings.py
@@ -1,8 +1,8 @@
"""Tests for string searches"""

-from nose import SkipTest
from nose.tools import eq_

+from dxr.query import _highlit_lines
from dxr.testing import SingleFileTestCase, MINIMAL_MAIN


@@ -26,6 +26,20 @@ class RepeatedResultTests(SingleFileTestCase):

    def test_repeated_results(self):
        """Make sure we don't get the same line back twice."""
-        raise SkipTest
-        self.found_lines_eq('int',
-            '<b>int</b> main(<b>int</b> argc, char* argv[]) {')
+        self.found_line_eq('int',
+            '<b>int</b> main(<b>int</b> argc, char* argv[]) {')


+def test_highlit_lines():
+    """A unit test for _highlit_lines() that I found handy while rewriting
+    it. Redundant with most of the rest of these tests but runs fast, so
+    let's keep it.
+
+    """
+    source = """int main(int argc, char* argv[]) {
+  return 0;
+}
+"""
+    eq_(_highlit_lines(source, [(0, 3, []), (9, 12, [])], '<b>', '</b>'),
+        [(1, '<b>int</b> main(<b>int</b> argc, char* argv[]) {')])
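
A further check of the same shape (hypothetical offsets, not part of the commit) would confirm that two extents on one line fold into a single highlit string: "main" occupies bytes 4-8 of the first line of source above and "argc" bytes 13-17:

    eq_(_highlit_lines(source, [(4, 8, []), (13, 17, [])], '<b>', '</b>'),
        [(1, 'int <b>main</b>(int <b>argc</b>, char* argv[]) {')])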
