Commit 919cb5f: Identifier searching

bill-mccloskey committed Jul 4, 2016
1 parent b849540 commit 919cb5f
Showing 5 changed files with 253 additions and 108 deletions.
19 changes: 17 additions & 2 deletions TODO
@@ -33,7 +33,8 @@ Other indexing:
IDL:
* Need to index the name of the IDL class as a type.

== Searching for declarations ==

Perhaps I'll make an index file that can be used to augment the full text search.
This would be similar to augmenting it with filenames.
When doing the analysis, a given decl would generate a fully-qualified name and a line/column number.
@@ -44,7 +44,21 @@ Not sure if that would be fast enough, but it might not be too bad. It would jus
Or I could put in each level of qualification as a separate entry, and the user
would have to type a prefix. That way I could do a binary search, which would be
much faster.

There are three ways to make this fast:
1. Use codesearch to search through the file. It would probably go in a separate repo.
2. Use binary search. The file would be sorted. For this, though, I would need to include
all levels of qualification. I guess the JS analysis already does this.
3. Just grep through the file. It's hard to see how this would be fast enough though.

Actually, though, I don't want just defs; I also want uses, so this will be a
very big file. I'll start out using binary search. It makes sense, I think, to
have the crossref code generate this file.

ID table: This will be a map from identifiers (with all levels of
qualification) to symbols. Once we have a symbol, we can look up the
symbol in the crossref file.
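
A rough sketch of the lookup, for illustration (identifiers and symbol names
here are invented; identifiers.py below does the same thing over an mmap'd
file):

import bisect

# Hypothetical identifiers file: sorted "qualified-name symbol" lines, with
# every level of qualification listed as its own entry.
lines = [
    "Foo _ZN3FooE",
    "Foo::Bar _ZN3Foo3BarE",
    "Foo::Bar::baz _ZN3Foo3Bar3bazEv",
    "Quux::quux _ZN4Quux4quuxEv",
]

def prefix_lookup(needle):
    # '~' sorts after every identifier character, so needle + '~' is an
    # upper bound for all lines starting with needle.
    lo = bisect.bisect_left(lines, needle)
    hi = bisect.bisect_left(lines, needle + '~')
    return [l.split(' ') for l in lines[lo:hi]]

print prefix_lookup('Foo::Bar')
# [['Foo::Bar', '_ZN3Foo3BarE'], ['Foo::Bar::baz', '_ZN3Foo3Bar3bazEv']]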

== C++ ==

For a given (static type, method) combo, I think I should have one
canonical set of results regardless of whether you're searching from a
73 changes: 73 additions & 0 deletions router/identifiers.py
@@ -0,0 +1,73 @@
import json
import sys
import mmap
import os.path

repo_data = {}

def load(config):
    global repo_data

    for repo_name in config['repos']:
        print 'Loading identifiers for', repo_name
        index_path = config['repos'][repo_name]['index_path']

        f = open(os.path.join(index_path, 'identifiers'))
        mm = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
        f.close()

        repo_data[repo_name] = mm

def get_line(mm, pos):
    # A position that lands on a newline counts as part of the line that
    # the newline terminates.
    if mm[pos] == '\n':
        pos -= 1

    start = end = pos

    # Scan backwards and forwards to the enclosing line boundaries.
    while start >= 0 and mm[start] != '\n':
        start -= 1
    start += 1

    size = mm.size()
    while end < size and mm[end] != '\n':
        end += 1

    return mm[start:end]

def bisect(mm, needle, upper_bound):
    # Lower-bound binary search over byte offsets. Every offset maps to
    # the line containing it, so the search converges on the byte offset
    # of the first line >= needle (or > needle if upper_bound is set).
    first = 0
    count = mm.size()
    while count > 0:
        step = count / 2
        pos = first + step

        line = get_line(mm, pos)
        if line < needle or (upper_bound and line == needle):
            first = pos + 1
            count -= step + 1
        else:
            count = step

    return first

def lookup(tree_name, needle, complete):
    mm = repo_data[tree_name]

    # '~' sorts after every identifier character, so needle + '~' bounds
    # the range of lines whose identifier starts with needle.
    first = bisect(mm, needle, False)
    last = bisect(mm, needle + '~', False)

    result = []
    mm.seek(first)
    while mm.tell() < last:
        line = mm.readline().strip()
        pieces = line.split(' ')
        suffix = pieces[0][len(needle):]
        # Skip matches that descend into a deeper qualification level, and
        # inexact matches when a complete identifier was requested.
        if ':' in suffix or '.' in suffix or (complete and suffix):
            continue
        result.append(pieces[0:2])

    return result

if __name__ == '__main__':
    load(json.load(open(sys.argv[1])))
    # lookup() takes a `complete` flag; do a prefix search by default.
    print lookup(sys.argv[2], sys.argv[3], False)
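
For context, a minimal sketch of how the crossref step might emit the
'identifiers' file this module expects (the function name and input shape are
assumptions, not the actual crossref code):

import os.path

# Hypothetical writer: dump sorted "qualified-name symbol" lines so that
# bisect() above can binary-search the mmap'd file.
def write_identifiers_file(pairs, index_path):
    f = open(os.path.join(index_path, 'identifiers'), 'w')
    for (qualified, sym) in sorted(pairs):
        f.write('%s %s\n' % (qualified, sym))
    f.close()
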
164 changes: 135 additions & 29 deletions router/router.py
@@ -13,14 +13,17 @@
import time
import errno
import traceback
import collections

import crossrefs
import identifiers
import codesearch
from logger import log

# FIXME:
# If you have an identifier search that includes lots of symbols, it will be
# very slow. Need to limit the result count, but we need to return 1000
# results even if there are dupes.
# Need case insensitivity.
# Path restriction?

def index_path(tree_name):
    return config['repos'][tree_name]['index_path']
@@ -63,6 +66,8 @@ def parse_search(searchString):
        elif pieces[i].startswith('re:'):
            result['re'] = (' '.join(pieces[i:]))[len('re:'):]
            break
        elif pieces[i].startswith('id:'):
            result['id'] = pieces[i][len('id:'):]
        else:
            result['default'] = re.escape(' '.join(pieces[i:]))
            break
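
For illustration, assuming the rest of parse_search matches this fragment,
example queries would parse roughly as (hypothetical; other keys omitted):

parse_search('id:nsIDocument::GetWindow')
# => {'id': 'nsIDocument::GetWindow'}
parse_search('re:foo.*bar')
# => {'re': 'foo.*bar'}
parse_search('GetWindow')
# => {'default': 'GetWindow'}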
@@ -80,6 +85,19 @@ def is_trivial_search(parsed):
    return True

def sort_results(results):
    # Semantic results are everything except "Textual Occurrences".
    # We track them so they can be removed from "Textual Occurrences".
    semantic_results = {}
    for (kind, rs) in results.items():
        if kind == 'Textual Occurrences':
            continue

        for result in rs:
            path = result['path']
            for line_info in result['lines']:
                lno = line_info['lno']
                semantic_results[(path, lno)] = True

    def is_test(p):
        return '/test/' in p or '/tests/' in p or '/mochitest/' in p or '/unit/' in p or 'testing/' in p

@@ -99,30 +117,75 @@ def sortfunc(p1, p2):
            r -= 10000
        return r

    result_count = [0]
    max_result_count = 1000

    def combine_lines(kind, path, lines1, lines2):
        # Eliminate duplicates and sort by line number.
        dict1 = { l['lno']: l for l in lines1 }
        dict2 = { l['lno']: l for l in lines2 }
        dict1.update(dict2)
        lines = dict1.values()

        # If this is a "Textual Occurrences" result, remove semantic matches.
        if kind == 'Textual Occurrences':
            def keep(l):
                return (path, l['lno']) not in semantic_results
            lines = [ l for l in lines if keep(l) ]

        lines.sort(lambda l1, l2: cmp(l1['lno'], l2['lno']))

        # Cap the total number of lines across all results at
        # max_result_count; trim any excess from the end.
        result_count[0] += len(lines)
        if result_count[0] > max_result_count:
            n = result_count[0] - max_result_count
            lines = lines[:-n]
            result_count[0] -= n

        return lines

    def combine(kind, path1r, path2r):
        return {'path': path1r['path'],
                'lines': combine_lines(kind, path1r['path'], path1r['lines'], path2r['lines'])}

    def sort_inner(kind, results):
        m = {}
        for result in results:
            r = combine(kind, m.get(result['path'], result), result)

            # We may have removed everything (due to them being
            # semantic matches). Don't record the path in this case.
            if len(r['lines']):
                m[result['path']] = r

        paths = m.keys()
        paths.sort(sortfunc)

        return [ m[path] for path in paths ]

    # Return results in this order.
    key_precedences = ["IDL", "Definitions", "Assignments", "Uses",
                       "Textual Occurrences", "Declarations"]

    def key_precedence(k):
        for (prec, kind) in enumerate(key_precedences):
            if k.startswith(kind):
                return prec
        return len(key_precedences)

    def key_sort(k1, k2):
        prec1 = key_precedence(k1)
        prec2 = key_precedence(k2)
        if prec1 == prec2:
            return cmp(k1, k2)
        else:
            return cmp(prec1, prec2)

    keys = list(results.keys())
    keys.sort(key_sort)

    r = collections.OrderedDict()
    for k in keys:
        r[k] = sort_inner(k, results[k])
    return r
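
Standalone illustration of the intended heading order (headings invented;
key_precedences copied from the code above):

key_precedences = ["IDL", "Definitions", "Assignments", "Uses",
                   "Textual Occurrences", "Declarations"]

def key_precedence(k):
    for (prec, kind) in enumerate(key_precedences):
        if k.startswith(kind):
            return prec
    return len(key_precedences)

keys = ['Uses (Foo::bar)', 'Textual Occurrences', 'Definitions (Foo::bar)']
keys.sort(lambda k1, k2: cmp(key_precedence(k1), key_precedence(k2)) or cmp(k1, k2))
print keys
# ['Definitions (Foo::bar)', 'Uses (Foo::bar)', 'Textual Occurrences']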

def search_files(tree_name, path):
    pathFile = os.path.join(index_path(tree_name), 'repo-files')
@@ -135,38 +198,72 @@ def search_files(tree_name, path):
    results = [ {'path': f, 'lines': []} for f in results ]
    return results[:1000]

def num_lines(results):
    count = 0
    for k in results:
        for pathspec in results[k]:
            count += len(pathspec['lines'])
    return count

def identifier_search(tree_name, needle, complete, path):
    needle = re.sub(r'\\(.)', r'\1', needle)

    # Don't prefix-search on a very short final component.
    pieces = re.split(r'\.|:', needle)
    if not complete and len(pieces[-1]) < 3:
        return {}

    ids = identifiers.lookup(tree_name, needle, complete)
    result = {}
    count = 0
    for (qualified, sym) in ids:
        results = crossrefs.lookup(tree_name, sym)
        for kind in results:
            if path:
                # TODO: restrict identifier results to the path filter.
                pass
            else:
                k = '%s (%s)' % (kind, qualified)
                result[k] = result.get(k, []) + results[kind]

        count += num_lines(results)
        if count > 1000:
            break

    return result
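
For reference, identifier_search builds a dict keyed by 'kind (qualified-name)'
headings; hypothetically (paths, line numbers, and the 'line' field invented):

{
    'Definitions (Foo::bar)': [
        {'path': 'foo/Foo.cpp', 'lines': [{'lno': 12, 'line': 'int Foo::bar() {'}]},
    ],
    'Uses (Foo::bar)': [
        {'path': 'main.cpp', 'lines': [{'lno': 40, 'line': 'f.bar();'}]},
    ],
}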

def get_json_search_results(tree_name, query):
    try:
        search_string = query['q'][0]
    except:
        search_string = ''

    try:
        fold_case = query['case'][0] != 'true'
    except:
        fold_case = True

    try:
        regexp = query['regexp'][0] == 'true'
    except:
        regexp = False

    try:
        path_filter = query['path'][0]
    except:
        path_filter = ''

    parsed = parse_search(search_string)

    if path_filter:
        parsed['pathre'] = parse_path_filter(path_filter)

    if regexp:
        if 'default' in parsed:
            del parsed['default']
        if 're' in parsed:
            del parsed['re']
        parsed['re'] = search_string

    if 'default' in parsed and len(parsed['default']) == 0:
        del parsed['default']
@@ -175,30 +272,38 @@ def get_json_search_results(tree_name, query):
        results = {}
        return json.dumps(results)

    title = search_string
    if not title:
        title = 'Files ' + path_filter

    if 'symbol' in parsed:
        # FIXME: Need to deal with path here
        symbols = parsed['symbol']
        title = 'Symbol ' + symbols
        results = crossrefs.lookup(tree_name, symbols)
    elif 're' in parsed:
        path = parsed.get('pathre', '.*')
        substr_results = codesearch.search(parsed['re'], fold_case, path, tree_name)
        results = {'Textual Occurrences': substr_results}
    elif 'id' in parsed:
        results = identifier_search(tree_name, parsed['id'], complete=True, path=parsed.get('pathre'))
    elif 'default' in parsed:
        path = parsed.get('pathre', '.*')
        substr_results = codesearch.search(parsed['default'], fold_case, path, tree_name)
        if 'pathre' in parsed:
            file_results = []
            id_results = []
        else:
            file_results = search_files(tree_name, parsed['default'])
            id_results = identifier_search(tree_name, parsed['default'], complete=False, path=parsed.get('pathre'))

        results = {'Textual Occurrences': file_results + substr_results}
        results.update(id_results)
    elif 'pathre' in parsed:
        path = parsed['pathre']
        results = {'Textual Occurrences': search_files(tree_name, path)}
    else:
        assert False
        results = {}
Expand Down Expand Up @@ -332,6 +437,7 @@ def generateWithTemplate(self, replacements, templateFile):

crossrefs.load(config)
codesearch.load(config)
identifiers.load(config)

class ForkingServer(ForkingMixIn, HTTPServer):
    pass