/
document.py
181 lines (139 loc) · 4.05 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import os
import time
import hashlib
import re
from six import text_type as unicode
class Entities(object):
    """Collection of entities found within a document, keyed by offset."""

    def __init__(self):
        # Maps start offset -> entity record (see ``add``).
        self.obj = {}
        self.keys = []

    def add(self, offset, entity_type, entity_value):
        """
        Register a found entity. Entries are keyed on ``offset``, so adding a
        second entity at an offset already in use replaces the earlier one.

        :param offset: Offset in extracted text where the found entity starts.
        :param entity_type: The type of entity that is found.
        :param entity_value: The found entity.
        :type offset: ``int``
        :type entity_type: ``unicode``
        :type entity_value: ``unicode``
        """
        # Each whitespace character becomes one underscore (runs are not
        # collapsed), then the whole identifier is lower-cased.
        identifier = re.sub(r'\s', '_', entity_value).lower()
        self.obj[offset] = dict(
            type=entity_type,
            value=entity_value,
            entity_id=identifier,
        )

    def get_all(self):
        """
        Get every registered entity.

        :returns: ``list`` of ``tuple``: (``int`` offset, ``dict`` entity)
        """
        return [(offset, record) for offset, record in self.obj.items()]
class Document(object):
    """A document that is being processed."""

    def __init__(self):
        self.docid = None         # hex digest; assigned by ``set_id``
        self.path = None          # text path of the document
        self.ext = None
        self.added = -1           # copied into ``meta['added']`` by ``as_obj``
        self.doctype = 'unknown'  # kept in sync with ``meta['type']`` by ``set_type``
        self.parent = None        # parent ``Document`` (or ``None``)
        self.entities = Entities()
        self.tag = None
        self.meta = {}            # free-form metadata (``type``, ``size``, ...)
        self.status = 'unknown'
        self.text = None          # ``as_obj`` reports ``''`` while this is unset
        self.children = 0
        self.magic_hit = False

    def set_type(self, doc_type):
        """
        Set the type of document this is, e.g., diskimage, archive, document.
        The value is stored both on the instance and under ``meta['type']``.

        :param doc_type: Type of document.
        :type doc_type: ``str``
        """
        self.doctype = doc_type
        self.meta['type'] = doc_type

    def set_size(self, size):
        """
        Set size of document/file (stored under ``meta['size']``).

        :param size: Size of file.
        :type size: ``int``
        """
        self.meta['size'] = size

    def set_id(self, data):
        """
        Set ID of document. To allow multiple files with the same path, the
        supplied data (e.g. first 4KB) is hashed together with the path; the
        resulting MD5 hex digest becomes the doc ID.

        :param data: Data to append to document path.
        :type data: Anything digestable by hashlib.
        """
        digest = hashlib.md5()
        digest.update(self.path.encode('utf-8'))

        # hashlib only accepts bytes-like input, so text is encoded first.
        if isinstance(data, unicode):
            data = data.encode('utf-8')

        digest.update(data)
        self.docid = digest.hexdigest()

    @staticmethod
    def _basename(path):
        """Return the last component of ``path``, or ``None`` if no path is set."""
        if path is None:
            return None
        return os.path.basename(path)

    def as_obj(self):
        """
        Return a dict representation of the document.

        Note that ``self.meta`` is updated in place with the current ``added``
        value and is returned by reference under the ``meta`` key (it is not
        copied). ``filename`` is ``None`` when the document has no path.

        :returns: ``dict``
        """
        parent_obj = None

        if self.parent:
            parent_obj = {
                'path': self.parent.path,
                'id': self.parent.docid,
                'filename': self._basename(self.parent.path)
            }

        meta = self.meta
        meta['added'] = self.added

        return {
            'ext': self.ext,
            'id': self.docid,
            'path': self.path,
            'doctype': self.doctype,
            'filename': self._basename(self.path),
            'parent': parent_obj,
            'tag': self.tag,
            'status': self.status,
            'text': self.text if self.text else '',
            'meta': meta
        }
def get_document(path, parent=None):
    """
    Create a new document object from the given path.

    The initial doc ID is the MD5 hex digest of the UTF-8 encoded path, the
    extension is lower-cased without its leading dot (``'none'`` when the path
    has no extension), and the size is ``-1`` when it cannot be determined.

    :param path: Path to document (does not have to exist on file system).
        Either text or UTF-8 encoded bytes.
    :param parent: Parent document (e.g. diskimage or archive).
    :returns: ``gransk.core.Document``
    """
    # Keep both forms: bytes feed the digest, text is stored on the document.
    if isinstance(path, unicode):
        bpath, upath = path.encode('utf-8'), path
    else:
        bpath, upath = path, path.decode('utf-8')

    doc = Document()
    doc.path = upath
    doc.parent = parent

    directory = os.path.dirname(upath)
    if directory:
        doc.meta['directory'] = directory

    doc.docid = hashlib.md5(bpath).hexdigest()

    _, ext = os.path.splitext(upath)
    doc.ext = ext.lstrip('.').lower() or 'none'

    try:
        size = os.path.getsize(path)
    except (OSError, ValueError):
        # OSError: missing or inaccessible file. ValueError: the path cannot
        # be represented at the OS level (e.g. an embedded NUL character).
        # Either way the path is allowed not to exist, so record "unknown".
        size = -1
    doc.set_size(size)

    doc.added = int(time.time())

    return doc