Skip to content

Commit

Permalink
Browser formatter for text documents
Browse files Browse the repository at this point in the history
  • Loading branch information
markgw committed Oct 20, 2020
1 parent 6c0cbc2 commit 9af6844
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 13 deletions.
4 changes: 2 additions & 2 deletions src/python/pimlico/datatypes/corpora/data_points.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@
Document types used to represent datatypes of individual documents in an IterableCorpus or subtype.
"""
import traceback
from builtins import object
from collections import OrderedDict
from traceback import format_exc

from builtins import object
from future.utils import with_metaclass, PY3

__all__ = ["DataPointType", "RawDocumentType", "TextDocumentType", "RawTextDocumentType", "DataPointError",
Expand Down Expand Up @@ -591,6 +590,7 @@ class TextDocumentType(RawDocumentType):
"""
data_point_type_supports_python2 = True
formatters = [("text", "pimlico.datatypes.corpora.formatters.text.TextDocumentFormatter")]

class Document(object):
keys = ["text"]
Expand Down
12 changes: 12 additions & 0 deletions src/python/pimlico/datatypes/corpora/formatters/json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import json

from pimlico.datatypes.corpora.json import JsonDocumentType

from pimlico.cli.browser.tools.formatter import DocumentBrowserFormatter


class JsonFormatter(DocumentBrowserFormatter):
    """
    Formatter for JSON document types for use in the corpus browser.
    Displays the document's data serialized as JSON text with a 4-space indent.
    """
    DATATYPE = JsonDocumentType()

    def format_document(self, doc):
        # Pretty-print the document's ``data`` attribute so nested structures are readable
        formatted = json.dumps(doc.data, indent=4)
        return formatted
16 changes: 16 additions & 0 deletions src/python/pimlico/datatypes/corpora/formatters/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pimlico.datatypes.corpora.data_points import TextDocumentType

from pimlico.cli.browser.tools.formatter import DocumentBrowserFormatter


class TextDocumentFormatter(DocumentBrowserFormatter):
    """
    Corpus browser formatter for text document types.

    The unicode text stored for the document is shown exactly as it is,
    without any further processing.
    """
    DATATYPE = TextDocumentType()

    def format_document(self, doc):
        # Nothing to transform: the document's text is what gets displayed
        text = doc.text
        return text
13 changes: 2 additions & 11 deletions src/python/pimlico/datatypes/corpora/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

from __future__ import absolute_import
from builtins import object

import json
from builtins import object

from pimlico.cli.browser.tools.formatter import DocumentBrowserFormatter
from pimlico.datatypes.corpora.data_points import RawDocumentType


__all__ = ["JsonDocumentType"]


Expand All @@ -19,7 +17,7 @@ class JsonDocumentType(RawDocumentType):
Very simple document corpus in which each document is a JSON object.
"""
formatters = [("json", "pimlico.datatypes.corpora.json.JsonFormatter")]
formatters = [("json", "pimlico.datatypes.corpora.formatters.json.JsonFormatter")]
data_point_type_supports_python2 = True

class Document(object):
Expand All @@ -30,10 +28,3 @@ def raw_to_internal(self, raw_data):

def internal_to_raw(self, internal_data):
return json.dumps(internal_data["data"]).encode("utf-8")


class JsonFormatter(DocumentBrowserFormatter):
    """
    Formatter for JSON document types.
    Renders the document's data as JSON text with a 4-space indent.
    """
    DATATYPE = JsonDocumentType()

    def format_document(self, doc):
        # Serialize the document's ``data`` attribute in indented, human-readable form
        formatted = json.dumps(doc.data, indent=4)
        return formatted

0 comments on commit 9af6844

Please sign in to comment.