Skip to content
This repository has been archived by the owner on Oct 13, 2021. It is now read-only.

Commit

Permalink
Index binary files and show image previews (Bug #1052217)
Browse files Browse the repository at this point in the history
Binary files are stored in the Elasticsearch index under the 'IMAGE'
doctype and retrieved on demand. A bit of JavaScript on browse pages
adds the image preview.
  • Loading branch information
pelmers committed Jan 22, 2015
1 parent b1eea26 commit 936ce19
Show file tree
Hide file tree
Showing 12 changed files with 128 additions and 30 deletions.
33 changes: 29 additions & 4 deletions dxr/app.py
Expand Up @@ -3,6 +3,8 @@
from os.path import isdir, isfile, join, basename
from sys import stderr
from time import time
from cStringIO import StringIO
from mimetypes import guess_type
from urllib import quote_plus

from flask import (Blueprint, Flask, send_from_directory, current_app,
Expand All @@ -13,7 +15,7 @@

from dxr.build import linked_pathname
from dxr.exceptions import BadTerm
from dxr.filters import FILE
from dxr.filters import FILE, IMAGE
from dxr.mime import icon
from dxr.plugins import plugins_named
from dxr.query import Query, filter_menu_items
Expand Down Expand Up @@ -165,6 +167,30 @@ def _tree_tuples(trees, tree, query_text, is_case_sensitive):
description)
for t, description in trees.iteritems()]

@dxr_blueprint.route('/<tree>/images/<path:path>')
def send_image(tree, path):
    """Send the image stored under ``path`` in ``tree``.

    Looks like a file to the client, but it comes straight from the ES index:
    the IMAGE document whose ``path`` matches is fetched, its base64 ``data``
    field is decoded, and the bytes are returned with a MIME type guessed
    from the path's extension.

    Responds with HTTP 404 when no image is indexed under ``path``.
    """
    # Imported here so this handler does not depend on which names the
    # module-level ``from flask import (...)`` list happens to include.
    from flask import abort

    query = {
        'filter': {
            'term': {'path': path}
        }
    }
    index = current_app.config['ES_ALIASES'][tree]
    results = current_app.es.search(
        query,
        index=index,
        doc_type=IMAGE,
        size=1)

    hits = results['hits']['hits']
    if not hits:
        # An unknown image is a client error (404), not a server crash (500).
        abort(404)

    # size=1 above: there should be exactly one document per path.
    data = hits[0]['_source']['data']
    data_file = StringIO(data.decode("base64"))
    return send_file(data_file, mimetype=guess_type(path)[0])

@dxr_blueprint.route('/<tree>/source/')
@dxr_blueprint.route('/<tree>/source/<path:path>')
Expand Down Expand Up @@ -267,11 +293,10 @@ def parallel(tree, path=''):


def _tree_folder(tree):
"""Return the on-disk path to the root of the given tree's folder in the
instance."""
"""Return the on-disk path to the root of the given tree's folder in
the instance."""
return join(current_app.instance_path, 'trees', tree)


def _html_file_path(tree_folder, url_path):
"""Return the on-disk path, relative to the tree folder, of the HTML file
that should be served when a certain path is browsed to. If a path to a
Expand Down
50 changes: 33 additions & 17 deletions dxr/build.py
Expand Up @@ -23,9 +23,9 @@
import dxr
from dxr.config import Config, FORMAT
from dxr.exceptions import BuildError
from dxr.filters import LINE, FILE
from dxr.filters import LINE, FILE, IMAGE
from dxr.lines import build_lines
from dxr.mime import is_text, icon
from dxr.mime import is_text, icon, is_image
from dxr.query import filter_menu_items
from dxr.utils import (open_log, parallel_url, deep_update, append_update,
append_update_by_line, append_by_line, TEMPLATE_DIR)
Expand Down Expand Up @@ -342,7 +342,7 @@ def ensure_folder(folder, clean=False):
if clean and os.path.isdir(folder):
shutil.rmtree(folder, False)
if not os.path.isdir(folder):
mkdir(folder)
os.makedirs(folder)


def _unignored_folders(folders, source_path, ignore_patterns, ignore_paths):
Expand Down Expand Up @@ -487,17 +487,16 @@ def index_file(tree, tree_indexers, path, es, index, jinja_env):
file_info = stat(path)
folder_name, file_name = split(rel_path)

if is_text: # conditional until we figure out how to display binary files
es.index(index,
FILE,
# Hard-code the keys that are hard-coded in the browse()
# controller. Merge with the pluggable ones from needles:
dict(folder=folder_name,
name=file_name,
size=file_info.st_size,
modified=datetime.fromtimestamp(file_info.st_mtime),
is_folder=False,
**needles))
es.index(index,
FILE,
# Hard-code the keys that are hard-coded in the browse()
# controller. Merge with the pluggable ones from needles:
dict(folder=folder_name,
name=file_name,
size=file_info.st_size,
modified=datetime.fromtimestamp(file_info.st_mtime),
is_folder=False,
**needles))

# Index all the lines, attaching the file-wide needles to each line as well:
if is_text and needles_by_line: # If it's an empty file (no lines), don't
Expand All @@ -508,10 +507,10 @@ def index_file(tree, tree_indexers, path, es, index, jinja_env):
(merge(n, needles) for n in needles_by_line), 300):
es.bulk_index(index, LINE, chunk_of_needles, id_field=None)

# Index image contents as binary in ES
image_src = index_image(tree, rel_path, index, es) if is_image(rel_path) else None
# Render some HTML:
# TODO: Make this no longer conditional on is_text, and come up with a nice
# way to show binary files, especially images.
if is_text and 'html' not in tree.config.skip_stages:
if 'html' not in tree.config.skip_stages:
_fill_and_write_template(
jinja_env,
'file.html',
Expand Down Expand Up @@ -543,8 +542,25 @@ def index_file(tree, tree_indexers, path, es, index, jinja_env):

'is_text': is_text,

# if the file's an image, use <img src=image_src>
'image_src': image_src,
'sections': build_sections(chain.from_iterable(linkses))})

def index_image(tree, rel_path, index, es):
    """Store one image file in ES and return the URL path that serves it.

    The file at ``rel_path`` (relative to ``tree.source_folder``) is read as
    bytes, base64-encoded, and indexed into ``index`` under the IMAGE doctype
    together with its path. The returned string has the form
    ``<tree name>/images/<rel_path>``.
    """
    url = "%s/images/%s" % (tree.name, rel_path)

    source_path = join(tree.source_folder, rel_path)
    with open(source_path, "rb") as image_file:
        raw_bytes = image_file.read()

    # The document carries the path (for lookups) and the encoded contents.
    document = {
        'path': [rel_path],
        'data': raw_bytes.encode("base64"),
    }
    es.index(index, IMAGE, document)
    return url


def index_chunk(tree, tree_indexers, paths, index, swallow_exc=False):
"""Index a pile of files.
Expand Down
2 changes: 1 addition & 1 deletion dxr/config.py
Expand Up @@ -149,7 +149,7 @@ def __init__(self, config, config_string, name):
self.source_encoding = parser.get(name, 'source_encoding')
self.description = parser.get(name, 'description')

# You cannot redefine the target folder!
# You cannot redefine the target folders!
self.target_folder = os.path.join(config.target_folder, 'trees', name)
# Set config file and DXR config object reference
self.config = config
Expand Down
1 change: 1 addition & 0 deletions dxr/filters.py
Expand Up @@ -7,6 +7,7 @@
# Domain constants:
FILE = 'file'
LINE = 'line'
IMAGE = 'image'


class Filter(object):
Expand Down
12 changes: 11 additions & 1 deletion dxr/mime.py
Expand Up @@ -11,6 +11,11 @@ def is_text(data):
# Simple stupid test that apparently works rather well :)
return '\0' not in data

def is_image(path):
    """Return True if the extension of ``path`` is mapped to 'image'.

    The extension (without its leading dot) is looked up in ``ext_map``
    exactly as written first; if that spelling is unknown, the lowercased
    form is tried, so ``photo.JPG`` is recognized just like ``photo.jpg``.
    Paths without an extension are never images.
    """
    _, ext = splitext(path)
    suffix = ext[1:]
    kind = ext_map.get(suffix)
    if kind is None:
        # Fall back to a case-insensitive match for names like "LOGO.PNG".
        kind = ext_map.get(suffix.lower())
    return kind == 'image'


# File extensions known at this point
ext_map = {
Expand Down Expand Up @@ -57,5 +62,10 @@ def is_text(data):
"ipdl": 'conf',
"mm": 'mm',
"tex": 'tex',
"vsprops": 'vs'
"vsprops": 'vs',
"jpg": 'image',
"jpeg": 'image',
"png": 'image',
"gif": 'image',
"svg": 'image'
}
25 changes: 24 additions & 1 deletion dxr/plugins/core.py
Expand Up @@ -8,7 +8,7 @@
from parsimonious import ParseError

from dxr.exceptions import BadTerm
from dxr.filters import Filter, negatable, FILE, LINE
from dxr.filters import Filter, negatable, FILE, IMAGE, LINE
import dxr.indexers
from dxr.plugins import direct_search
from dxr.trigrammer import (regex_grammar, SubstringTreeVisitor, NGRAM_LENGTH,
Expand Down Expand Up @@ -81,6 +81,22 @@
}
},

# The image doc stores the base64-encoded binary blobs of image files.
IMAGE: {
'_all': {
'enabled': False
},
'properties': {
# IMAGE filters query this (like FILE)
'path': PATH_MAPPING,
# actual image data here
'data': {
'type': 'binary',
'index': 'no'
}
}
},

# The line doctype is the main workhorse of DXR searches. The search
# results present lines, so that's what we index.
LINE: {
Expand Down Expand Up @@ -313,6 +329,13 @@ def needles_by_line(self):
yield [('number', number),
('content', text)]

def is_interesting(self):
    """Report every file as interesting.

    Core is responsible for putting files in the index, so it never opts
    out of a file.
    """
    return True


# Match file name and line number: filename:n. Strip leading slashes because
# we don't have any in the index.
Expand Down
1 change: 1 addition & 0 deletions dxr/static/css/icons.css
Expand Up @@ -10,6 +10,7 @@
background-color: transparent;
background-position: .5em center;
background-repeat: no-repeat;
background-size: 16px 16px;
padding: .5em .5em .5em 30px;
}
.icon-container {
Expand Down
2 changes: 1 addition & 1 deletion dxr/static/js/dxr.js
Expand Up @@ -461,7 +461,7 @@ $(function() {
// Thanks to bug 63040 in Chrome, onpopstate is fired when the page reloads.
// That means that if we naively set onpopstate, we would get into an
// infinite loop of reloading whenever onpopstate is triggered. Therefore,
// we have to only add out onpopstate handler once the page has loaded.
// we have to only add our onpopstate handler once the page has loaded.
window.onload = function() {
setTimeout(function() {
window.onpopstate = popStateHandler;
Expand Down
19 changes: 19 additions & 0 deletions dxr/static/js/image_preview.js
@@ -0,0 +1,19 @@
$(function() {
    'use strict';

    /**
     * Use the image behind a source link as the element's background-image.
     *
     * The image URL is derived from the element's href by swapping the first
     * path segment that is exactly "source" for "images". Matching a whole
     * segment leaves names that merely contain the substring (for example
     * "resources") untouched. It still picks the wrong segment if the tree
     * itself is named "source" or the URL prefix has a "source" segment.
     */
    function setBackgroundImageFromLink(anchorElement) {
        var href = anchorElement.getAttribute('href');
        if (!href) {
            // No link target, so there is nothing to preview.
            return;
        }
        var bgSrc = href.replace(/(^|\/)source(\/|$)/, '$1images$2');
        // Quote the URL so spaces or parentheses in a path cannot end the
        // CSS url() token early; escape any embedded double quote.
        anchorElement.style.backgroundImage =
            'url("' + bgSrc.replace(/"/g, '%22') + '")';
    }

    window.addEventListener('load', function() {
        $('.image').each(function() {
            setBackgroundImageFromLink(this);
        });
    });
});
10 changes: 6 additions & 4 deletions dxr/static/templates/file.html
Expand Up @@ -50,10 +50,6 @@ <h4>{{ section }}</h4>
</div>
{% endif %}

{% if not is_text %}
(binary file)
{% endif %}

<div id="annotations">
{% for line, annotations in lines %}
<div class="annotation-set" id="aset-{{ loop.index }}">
Expand All @@ -66,6 +62,9 @@ <h4>{{ section }}</h4>
{%- endfor -%}
</div>

{% if image_src %}
<img src="{{wwwroot}}/{{ image_src }}">
{% endif %}
<table id="file" class="file">
<thead class="visually-hidden">
<th scope="col">Line</th>
Expand All @@ -79,6 +78,9 @@ <h4>{{ section }}</h4>
{% endfor %}
</td>
<td class="code">
{% if not is_text %}
(binary file)
{% endif %}
<pre>
{% for line, annotations in lines -%}
<code id="line-{{ loop.index }}" aria-labelledby="{{ loop.index }}">{{ line }}</code>
Expand Down
1 change: 1 addition & 0 deletions dxr/static/templates/layout.html
Expand Up @@ -94,6 +94,7 @@
<script src="{{ wwwroot }}/static/js/dxr.js"></script>
<script src="{{ wwwroot }}/static/js/context_menu.js"></script>
<script src="{{ wwwroot }}/static/js/filter.js"></script>
<script src="{{ wwwroot }}/static/js/image_preview.js"></script>
{% endblock %}

{% if google_analytics_key %}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_binary_files/dxr.config.in
@@ -1,5 +1,5 @@
[DXR]
enabled_plugins = pygmentize clang
enabled_plugins = pygmentize
temp_folder = PWD/temp
target_folder = PWD/target
nb_jobs = 4
Expand Down

0 comments on commit 936ce19

Please sign in to comment.