Permalink
Browse files

Merge pull request #10 from Shura1oplot/improvements2

Bugfixes
  • Loading branch information...
pcbje committed Jan 7, 2017
2 parents d2d9aaf + 7d93047 commit 63be498ecc7115c55271e51f4ef27fe170d4f3f2
@@ -7,7 +7,7 @@ python:
- "2.7"
install:
- "sudo apt-get install --force-yes -y python-dev zlib1g-dev p7zip-full libicu-dev fontconfig rubygems-integration poppler-utils ghostscript"
- "sudo apt-get install --force-yes -y python-dev zlib1g-dev unzip p7zip-full p7zip-rar libicu-dev fontconfig rubygems-integration poppler-utils ghostscript"
- "sudo gem install coveralls-lcov"
- "sudo apt-get remove --purge -y bzip2 libbz2-dev"
- "pip install -r utils/dfvfs-requirements.txt"
@@ -16,6 +16,10 @@ install:
- "python setup.py install"
- "cp gransk/web/tests/package.json /opt && npm install --prefix=/opt /opt"
before_script:
- "chmod +x utils/travis/*"
- "export PATH=utils/travis:$PATH"
script:
- "python run_tests.py"
- "py.test --cov gransk gransk"
@@ -12,7 +12,7 @@ WORKDIR /app
RUN apt-get update
RUN apt-get install --force-yes -y \
python-dev python-setuptools zlib1g-dev p7zip-full \
python-dev python-setuptools zlib1g-dev unzip p7zip-full p7zip-rar \
python-pip libicu-dev poppler-utils ghostscript && \
pip install -r utils/dfvfs-requirements.txt && \
pip install -r requirements.txt && \
@@ -5,13 +5,10 @@
import requests
import time
import json
import hashlib
import logging
import traceback
import threading
import yaml
import os
import sys
import shutil
import six.moves.http_client
@@ -70,16 +67,17 @@ def consume(self, doc, file_object=None):
return
if not file_object:
file_object = open(doc.path)
file_object = open(doc.path, "rb")
self.produce(helper.EXTRACT_META, doc, file_object)
self.produce(helper.PROCESS_FILE, doc, file_object)
file_object.close()
except Exception as err:
LOGGER.exception('could not process %s: %s', doc.path, err)
doc.status = 'error'
doc.meta['gransk_error'] = six.text_type(err)
traceback.print_exc(file=sys.stdout)
class API(object):
@@ -113,12 +111,18 @@ def __init__(self, injector=None, config_path=None):
self.pipeline = pipeline.build_pipeline(self.config)
self.entrypoint = Subscriber(self.pipeline)
self.entrypoint.setup(self.config)
self.write_lock = threading.Lock()
def add_file(self, doc, file_object):
return self.entrypoint.consume(doc, file_object=file_object)
with self.write_lock:
return self.entrypoint.consume(doc, file_object=file_object)
def clear_all(self):
"""Clear all processed data."""
with self.write_lock:
self._clear_all()
def _clear_all(self):
try:
if os.path.exists(self.config[helper.DATA_ROOT]):
shutil.rmtree(self.config[helper.DATA_ROOT])
@@ -128,7 +132,7 @@ def clear_all(self):
os.makedirs(os.path.join(self.config[helper.DATA_ROOT], 'archives'))
os.makedirs(os.path.join(self.config[helper.DATA_ROOT], 'archives', '.tmp'))
except Exception as err:
print (">>", err)
LOGGER.error("could not clear data: %s", err)
connection = self.config['injector'].get_http_connection('%s:%s' % (self.config['es_host'][0], 9200))
connection.request('DELETE', '/gransk', '', {})
@@ -8,28 +8,17 @@
import os
import argparse
import logging
import six.moves.http_client
import shutil
import sys
import yaml
import gransk.core.helper as helper
import gransk.api
from multiprocessing import Queue, Process
from six.moves.queue import Empty
import time
import traceback
import logging
import os
import sys
import glob
import six
from six.moves import range
from tqdm import tqdm
import gransk.api
import gransk.core.compat as _
import gransk.core.file_collector as collector
import gransk.core.pipeline as pipeline
@@ -9,13 +9,10 @@
import os
import logging
import json
import shutil
import time
from functools import wraps
import requests
from flask import Flask, Response, render_template, request, abort
from werkzeug import secure_filename
import yaml
from flask import Flask, Response, request, abort
import gransk.api
import gransk.core.helper as helper
@@ -32,9 +29,6 @@
static_folder=os.path.join(_root, 'gransk', 'web', 'app'))
from functools import wraps
from flask import request, Response
_globals = {}
def check_auth(username, password):
@@ -83,8 +77,9 @@ def upload():
_file = request.files.get('file')
doc = document.get_document(
secure_filename(_file.filename),
parent=document.get_document('root'))
_file.filename,
parent=document.get_document('root'),
need_secure_path=True)
doc.tag = 'upload'
@@ -104,8 +99,8 @@ def delete_data():
@app.route('/file', methods=['GET'])
def get_file():
"""Get original file."""
filename = secure_filename(request.args['filename'])
ext = secure_filename(request.args['ext'])
filename = document.secure_path(request.args['filename'])
ext = document.secure_path(request.args['ext'])
mediatype = request.args['mediatype']
root = os.path.join(_globals['gransk'].config[helper.DATA_ROOT], 'files')
@@ -117,6 +112,7 @@ def get_file():
with open(file_path, 'rb') as inp:
return Response(inp.read(), mimetype=mediatype, status=200)
@app.route('/search')
def search():
query = json.loads(request.args['q'])
@@ -133,7 +129,7 @@ def search():
@app.route('/picture', methods=['GET'])
def picture():
"""Get document content as picture."""
name = secure_filename(request.args['name'])
name = document.secure_path(request.args['name'])
mediatype = request.args['mediatype']
root = os.path.join(_globals['gransk'].config[helper.DATA_ROOT], 'pictures')
@@ -231,5 +227,5 @@ def setup(args, pipeline, runmod, injector):
#context = ('/etc/letsencrypt/live/gransk.com/cert.pem', '/etc/letsencrypt/live/gransk.com/privkey.pem')
context = None
app.run(host=args.host, port=args.port, debug=args.debug, threaded=True , ssl_context=context)
app.run(host=args.host, port=args.port, debug=args.debug, threaded=True, ssl_context=context)
_globals['gransk'].pipeline.stop()
@@ -4,7 +4,6 @@
from __future__ import absolute_import, unicode_literals
import logging
import traceback
import sys
import os
import six
@@ -92,8 +91,7 @@ def consume(self, doc, payload):
self.produce(helper.RUN_PIPELINE, doc, None)
except Exception as err:
traceback.print_exc(file=sys.stdout)
LOGGER.warning('could not process %s: %s', doc.path, err)
LOGGER.exception('could not process %s: %s', doc.path, err)
doc.status = 'error'
doc.meta['gransk_error'] = six.text_type(err)
self.produce(helper.ERRORED_FILE, doc, payload)
@@ -10,6 +10,9 @@
from six import text_type as unicode
from werkzeug import secure_filename
from slugify import slugify
class Entities(object):
"""Class for a set of entities found within documents."""
@@ -142,32 +145,40 @@ def as_obj(self):
}
def get_document(path, parent=None):
def get_document(path, parent=None, need_secure_path=None):
"""
Create a new document object from the given path.
:param path: Path to document (does not have to exist on file system).
:param parent: Parent document (e.g. diskimage or archive).
:returns: ``gransk.core.Document``
"""
if isinstance(path, unicode):
bpath, upath = path.encode('utf-8'), path
else:
bpath, upath = path, path.decode('utf-8')
if isinstance(path, bytes):
path = path.decode('utf-8')
original_path = path
if need_secure_path:
path = secure_path(path)
if not path:
path = 'unnamed'
doc = Document()
doc.path = upath
doc.path = path
doc.meta['original_path'] = original_path
if os.path.dirname(doc.path):
doc.meta['directory'] = os.path.dirname(doc.path)
if os.path.dirname(path):
doc.meta['directory'] = os.path.dirname(path)
digest = hashlib.md5()
digest.update(bpath)
digest.update(path.encode('utf-8'))
doc.docid = digest.hexdigest()
_, ext = os.path.splitext(doc.path)
_, ext = os.path.splitext(path)
doc.ext = ext.lstrip('.').lower() or 'none'
doc.parent = parent
@@ -179,3 +190,26 @@ def get_document(path, parent=None):
doc.added = int(time.time())
return doc
def secure_path(path):
  """Sanitize a raw path into a filesystem-safe relative path.

  The directory part and the file name are each slugified to ASCII and run
  through werkzeug's ``secure_filename``. Over-long file names are truncated
  to a ``head__tail`` form so both ends of the name survive.

  :param path: Raw, possibly unsafe path string.
  :returns: Sanitized path; the base name falls back to ``'unnamed'`` so the
            file-name component is never empty.
  """
  dirname = os.path.dirname(path)
  # Split the extension off the *basename*, not the full path. Splitting the
  # full path would fold the directory into file_base and duplicate it when
  # dirname is joined back on below (e.g. 'my-dir/my-dir-my-file.txt').
  file_base, file_ext = os.path.splitext(os.path.basename(path))

  dirname = secure_filename(slugify(dirname, only_ascii=True))
  file_base = secure_filename(slugify(file_base, only_ascii=True)) or 'unnamed'
  file_ext = secure_filename(slugify(file_ext, only_ascii=True))

  if file_ext:
    filename = '.'.join([file_base, file_ext])
  else:
    filename = file_base

  # Keep names within common filesystem length limits while preserving both
  # the start and the end of the original name.
  if len(filename) > 200:
    filename = '%s__%s' % (filename[:99], filename[-99:])

  if dirname:
    return os.path.join(dirname, filename)

  return filename
@@ -55,3 +55,4 @@
MIN_SCORE = 'min_score'
MIN_SHARED = 'min_shared'
MAX_RESULTS = 'max_results'
ES_INDEX = 'es_index'
@@ -4,8 +4,6 @@
from __future__ import absolute_import, unicode_literals
import logging
import traceback
import sys
import inspect
import os
from collections import defaultdict
@@ -133,8 +131,7 @@ def init_subscriber(config, subscriber_mod, pipeline):
pipeline.subscribers.append(subscriber)
except Exception as err:
traceback.print_exc(file=sys.stdout)
LOGGER.warning('! %s could not be loaded: %s', subscriber_mod, err)
LOGGER.exception('! %s could not be loaded: %s', subscriber_mod, err)
def build_pipeline(config):
@@ -54,12 +54,28 @@ class MockHttpConnection(object):
def __init__(self, response_text):
self.response_text = response_text
self.request_headers = None
def request(self, method, uri, payload, headers):
pass
self.request_headers = headers
def getresponse(self):
return BytesIO(self.response_text)
return MockHttpResponse(self.response_text)
class MockHttpResponse(object):
  """Minimal stand-in for an ``http.client`` response object.

  Wraps the given bytes in a ``BytesIO`` and always reports a 200/OK
  status, exposing just the ``read``/``close`` surface tests rely on.
  """

  def __init__(self, response_data):
    super(MockHttpResponse, self).__init__()
    # Fixed success status/reason; tests only need a readable body.
    self.status = 200
    self.reason = 'OK'
    self.stream = BytesIO(response_data)

  def read(self):
    """Return the remaining response body bytes."""
    return self.stream.read()

  def close(self):
    """Release the underlying in-memory buffer."""
    self.stream.close()
class MockElasticsearchIndex(object):
@@ -129,7 +145,6 @@ def __init__(self, text):
self.entities.append(MockEntity('I-PER', parts))
class MockWorker(object):
def __init__(self):
@@ -141,10 +156,11 @@ def boot(self, inject, config, path):
self.called = True
return []
class MockInjector(object):
def __init__(self, response_text=None, ner_entities=[]):
self.response_text = response_text
def __init__(self, response_text=b"", ner_entities=[]):
self.http_connection = MockHttpConnection(response_text)
self.elastic = MockElasticsearch()
self.elastic_helper = MockElasticsearchHelper()
self.polyglot = MockPolyglot
@@ -158,7 +174,7 @@ def get_worker(self):
return self.worker
def get_http_connection(self, url=None):
return MockHttpConnection(self.response_text)
return self.http_connection
def get_elasticsearch(self):
return self.elastic
@@ -77,7 +77,7 @@ def load_all(self, config):
else:
self.buckets[key] = value
except:
pass
logging.warning('could not load related_%s data', self.NAME)
def stop(self):
"""Write data to file."""
Oops, something went wrong.

0 comments on commit 63be498

Please sign in to comment.