Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #10 from Shura1oplot/improvements2
Bugfixes
  • Loading branch information
pcbje committed Jan 7, 2017
2 parents d2d9aaf + 7d93047 commit 63be498
Show file tree
Hide file tree
Showing 30 changed files with 290 additions and 128 deletions.
6 changes: 5 additions & 1 deletion .travis.yml
Expand Up @@ -7,7 +7,7 @@ python:
- "2.7"

install:
- "sudo apt-get install --force-yes -y python-dev zlib1g-dev p7zip-full libicu-dev fontconfig rubygems-integration poppler-utils ghostscript"
- "sudo apt-get install --force-yes -y python-dev zlib1g-dev unzip p7zip-full p7zip-rar libicu-dev fontconfig rubygems-integration poppler-utils ghostscript"
- "sudo gem install coveralls-lcov"
- "sudo apt-get remove --purge -y bzip2 libbz2-dev"
- "pip install -r utils/dfvfs-requirements.txt"
Expand All @@ -16,6 +16,10 @@ install:
- "python setup.py install"
- "cp gransk/web/tests/package.json /opt && npm install --prefix=/opt /opt"

before_script:
- "chmod +x utils/travis/*"
- "export PATH=utils/travis:$PATH"

script:
- "python run_tests.py"
- "py.test --cov gransk gransk"
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Expand Up @@ -12,7 +12,7 @@ WORKDIR /app

RUN apt-get update
RUN apt-get install --force-yes -y \
python-dev python-setuptools zlib1g-dev p7zip-full \
python-dev python-setuptools zlib1g-dev unzip p7zip-full p7zip-rar \
python-pip libicu-dev poppler-utils ghostscript && \
pip install -r utils/dfvfs-requirements.txt && \
pip install -r requirements.txt && \
Expand Down
20 changes: 12 additions & 8 deletions gransk/api.py
Expand Up @@ -5,13 +5,10 @@

import requests
import time
import json
import hashlib
import logging
import traceback
import threading
import yaml
import os
import sys
import shutil

import six.moves.http_client
Expand Down Expand Up @@ -70,16 +67,17 @@ def consume(self, doc, file_object=None):
return

if not file_object:
file_object = open(doc.path)
file_object = open(doc.path, "rb")

self.produce(helper.EXTRACT_META, doc, file_object)
self.produce(helper.PROCESS_FILE, doc, file_object)

file_object.close()

except Exception as err:
LOGGER.exception('could not process %s: %s', doc.path, err)
doc.status = 'error'
doc.meta['gransk_error'] = six.text_type(err)
traceback.print_exc(file=sys.stdout)


class API(object):
Expand Down Expand Up @@ -113,12 +111,18 @@ def __init__(self, injector=None, config_path=None):
self.pipeline = pipeline.build_pipeline(self.config)
self.entrypoint = Subscriber(self.pipeline)
self.entrypoint.setup(self.config)
self.write_lock = threading.Lock()

def add_file(self, doc, file_object):
return self.entrypoint.consume(doc, file_object=file_object)
with self.write_lock:
return self.entrypoint.consume(doc, file_object=file_object)

def clear_all(self):
"""Clear all processed data."""
with self.write_lock:
self._clear_all()

def _clear_all(self):
try:
if os.path.exists(self.config[helper.DATA_ROOT]):
shutil.rmtree(self.config[helper.DATA_ROOT])
Expand All @@ -128,7 +132,7 @@ def clear_all(self):
os.makedirs(os.path.join(self.config[helper.DATA_ROOT], 'archives'))
os.makedirs(os.path.join(self.config[helper.DATA_ROOT], 'archives', '.tmp'))
except Exception as err:
print (">>", err)
LOGGER.error("could not clear data: %s", err)

connection = self.config['injector'].get_http_connection('%s:%s' % (self.config['es_host'][0], 9200))
connection.request('DELETE', '/gransk', '', {})
Expand Down
13 changes: 1 addition & 12 deletions gransk/boot/run.py
Expand Up @@ -8,28 +8,17 @@
import os
import argparse
import logging
import six.moves.http_client
import shutil
import sys

import yaml

import gransk.core.helper as helper
import gransk.api

from multiprocessing import Queue, Process
from six.moves.queue import Empty
import time
import traceback
import logging
import os
import sys
import glob
import six
from six.moves import range

from tqdm import tqdm

import gransk.api
import gransk.core.compat as _
import gransk.core.file_collector as collector
import gransk.core.pipeline as pipeline
Expand Down
24 changes: 10 additions & 14 deletions gransk/boot/ui.py
Expand Up @@ -9,13 +9,10 @@
import os
import logging
import json
import shutil
import time
from functools import wraps
import requests

from flask import Flask, Response, render_template, request, abort
from werkzeug import secure_filename
import yaml
from flask import Flask, Response, request, abort

import gransk.api
import gransk.core.helper as helper
Expand All @@ -32,9 +29,6 @@
static_folder=os.path.join(_root, 'gransk', 'web', 'app'))


from functools import wraps
from flask import request, Response

_globals = {}

def check_auth(username, password):
Expand Down Expand Up @@ -83,8 +77,9 @@ def upload():
_file = request.files.get('file')

doc = document.get_document(
secure_filename(_file.filename),
parent=document.get_document('root'))
_file.filename,
parent=document.get_document('root'),
need_secure_path=True)

doc.tag = 'upload'

Expand All @@ -104,8 +99,8 @@ def delete_data():
@app.route('/file', methods=['GET'])
def get_file():
"""Get original file."""
filename = secure_filename(request.args['filename'])
ext = secure_filename(request.args['ext'])
filename = document.secure_path(request.args['filename'])
ext = document.secure_path(request.args['ext'])
mediatype = request.args['mediatype']

root = os.path.join(_globals['gransk'].config[helper.DATA_ROOT], 'files')
Expand All @@ -117,6 +112,7 @@ def get_file():
with open(file_path, 'rb') as inp:
return Response(inp.read(), mimetype=mediatype, status=200)


@app.route('/search')
def search():
query = json.loads(request.args['q'])
Expand All @@ -133,7 +129,7 @@ def search():
@app.route('/picture', methods=['GET'])
def picture():
"""Get document content as picture."""
name = secure_filename(request.args['name'])
name = document.secure_path(request.args['name'])
mediatype = request.args['mediatype']

root = os.path.join(_globals['gransk'].config[helper.DATA_ROOT], 'pictures')
Expand Down Expand Up @@ -231,5 +227,5 @@ def setup(args, pipeline, runmod, injector):
#context = ('/etc/letsencrypt/live/gransk.com/cert.pem', '/etc/letsencrypt/live/gransk.com/privkey.pem')
context = None

app.run(host=args.host, port=args.port, debug=args.debug, threaded=True , ssl_context=context)
app.run(host=args.host, port=args.port, debug=args.debug, threaded=True, ssl_context=context)
_globals['gransk'].pipeline.stop()
4 changes: 1 addition & 3 deletions gransk/core/detect_type.py
Expand Up @@ -4,7 +4,6 @@
from __future__ import absolute_import, unicode_literals

import logging
import traceback
import sys
import os
import six
Expand Down Expand Up @@ -92,8 +91,7 @@ def consume(self, doc, payload):
self.produce(helper.RUN_PIPELINE, doc, None)

except Exception as err:
traceback.print_exc(file=sys.stdout)
LOGGER.warning('could not process %s: %s', doc.path, err)
LOGGER.exception('could not process %s: %s', doc.path, err)
doc.status = 'error'
doc.meta['gransk_error'] = six.text_type(err)
self.produce(helper.ERRORED_FILE, doc, payload)
54 changes: 44 additions & 10 deletions gransk/core/document.py
Expand Up @@ -10,6 +10,9 @@

from six import text_type as unicode

from werkzeug import secure_filename
from slugify import slugify


class Entities(object):
"""Class for a set of entities found within documents."""
Expand Down Expand Up @@ -142,32 +145,40 @@ def as_obj(self):
}


def get_document(path, parent=None):
def get_document(path, parent=None, need_secure_path=None):
"""
Create a new document object from the given path.
:param path: Path to document (does not have to exist on file system).
:param parent: Parent document (e.g. diskimage or archive).
:returns: ``gransk.core.Document``
"""
if isinstance(path, unicode):
bpath, upath = path.encode('utf-8'), path
else:
bpath, upath = path, path.decode('utf-8')
if isinstance(path, bytes):
path = path.decode('utf-8')

original_path = path

if need_secure_path:
path = secure_path(path)

if not path:
path = 'unnamed'

doc = Document()
doc.path = upath
doc.path = path

doc.meta['original_path'] = original_path

if os.path.dirname(doc.path):
doc.meta['directory'] = os.path.dirname(doc.path)
if os.path.dirname(path):
doc.meta['directory'] = os.path.dirname(path)

digest = hashlib.md5()

digest.update(bpath)
digest.update(path.encode('utf-8'))

doc.docid = digest.hexdigest()

_, ext = os.path.splitext(doc.path)
_, ext = os.path.splitext(path)
doc.ext = ext.lstrip('.').lower() or 'none'

doc.parent = parent
Expand All @@ -179,3 +190,26 @@ def get_document(path, parent=None):

doc.added = int(time.time())
return doc


def secure_path(path):
    """
    Sanitize a path so that it is safe to use as a local file name.

    The directory part, the file base name and the file extension are each
    passed through ``slugify(..., only_ascii=True)`` and then
    ``secure_filename`` before being put back together.

    :param path: Path to sanitize (text).
    :returns: Sanitized path. The base name falls back to ``unnamed`` when
      nothing is left after sanitizing, and file names longer than 200
      characters are shortened to their first and last 99 characters
      joined by ``__``.
    """
    dirname = os.path.dirname(path)
    filename = os.path.basename(path)

    # Split the base name only. Splitting the full path would leave the
    # directory part inside file_base, so it would be repeated in the
    # resulting file name.
    file_base, file_ext = os.path.splitext(filename)

    dirname = secure_filename(slugify(dirname, only_ascii=True))
    file_base = secure_filename(slugify(file_base, only_ascii=True)) or 'unnamed'
    file_ext = secure_filename(slugify(file_ext, only_ascii=True))

    if file_ext:
        filename = '.'.join([file_base, file_ext])
    else:
        filename = file_base

    # Keep both ends of an over-long name: the start is usually the most
    # descriptive part and the end carries the extension.
    if len(filename) > 200:
        filename = '%s__%s' % (filename[:99], filename[-99:])

    if dirname:
        return os.path.join(dirname, filename)

    return filename
1 change: 1 addition & 0 deletions gransk/core/helper.py
Expand Up @@ -55,3 +55,4 @@
MIN_SCORE = 'min_score'
MIN_SHARED = 'min_shared'
MAX_RESULTS = 'max_results'
ES_INDEX = 'es_index'
5 changes: 1 addition & 4 deletions gransk/core/pipeline.py
Expand Up @@ -4,8 +4,6 @@
from __future__ import absolute_import, unicode_literals

import logging
import traceback
import sys
import inspect
import os
from collections import defaultdict
Expand Down Expand Up @@ -133,8 +131,7 @@ def init_subscriber(config, subscriber_mod, pipeline):

pipeline.subscribers.append(subscriber)
except Exception as err:
traceback.print_exc(file=sys.stdout)
LOGGER.warning('! %s could not be loaded: %s', subscriber_mod, err)
LOGGER.exception('! %s could not be loaded: %s', subscriber_mod, err)


def build_pipeline(config):
Expand Down
28 changes: 22 additions & 6 deletions gransk/core/tests/test_helper.py
Expand Up @@ -54,12 +54,28 @@ class MockHttpConnection(object):

def __init__(self, response_text):
self.response_text = response_text
self.request_headers = None

def request(self, method, uri, payload, headers):
pass
self.request_headers = headers

def getresponse(self):
return BytesIO(self.response_text)
return MockHttpResponse(self.response_text)


class MockHttpResponse(object):
    """In-memory stand-in for an HTTP response, always reporting 200 OK."""

    def __init__(self, response_data):
        """Wrap ``response_data`` (bytes) in a readable in-memory stream."""
        super(MockHttpResponse, self).__init__()
        self.status, self.reason = 200, 'OK'
        self.stream = BytesIO(response_data)

    def read(self):
        """Return the part of the payload that has not been read yet."""
        payload = self.stream.read()
        return payload

    def close(self):
        """Close the underlying stream."""
        self.stream.close()


class MockElasticsearchIndex(object):
Expand Down Expand Up @@ -129,7 +145,6 @@ def __init__(self, text):
self.entities.append(MockEntity('I-PER', parts))



class MockWorker(object):

def __init__(self):
Expand All @@ -141,10 +156,11 @@ def boot(self, inject, config, path):
self.called = True
return []


class MockInjector(object):

def __init__(self, response_text=None, ner_entities=[]):
self.response_text = response_text
def __init__(self, response_text=b"", ner_entities=[]):
self.http_connection = MockHttpConnection(response_text)
self.elastic = MockElasticsearch()
self.elastic_helper = MockElasticsearchHelper()
self.polyglot = MockPolyglot
Expand All @@ -158,7 +174,7 @@ def get_worker(self):
return self.worker

def get_http_connection(self, url=None):
return MockHttpConnection(self.response_text)
return self.http_connection

def get_elasticsearch(self):
return self.elastic
Expand Down
2 changes: 1 addition & 1 deletion gransk/plugins/analysis/abstract_related.py
Expand Up @@ -77,7 +77,7 @@ def load_all(self, config):
else:
self.buckets[key] = value
except:
pass
logging.warning('could not load related_%s data', self.NAME)

def stop(self):
"""Write data to file."""
Expand Down

0 comments on commit 63be498

Please sign in to comment.