Skip to content

Commit

Permalink
Updated bar chart plotter from old datatypes
Browse files Browse the repository at this point in the history
Updated datatypes and module.

Added some new test data and a test pipeline that uses it.
  • Loading branch information
markgw committed Nov 12, 2020
1 parent 8674dda commit f2487bf
Show file tree
Hide file tree
Showing 18 changed files with 152 additions and 178 deletions.
2 changes: 2 additions & 0 deletions src/python/pimlico/datatypes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
from .corpora.base import IterableCorpus
from .corpora.grouped import GroupedCorpus
from .arrays import NumpyArray, ScipySparseMatrix
from .results import NumericResult

# All builtin datatypes that may be easily loaded using their
# class names or datatype names from config files
BUILTIN_DATATYPES = [
PimlicoDatatype, IterableCorpus, GroupedCorpus,
StringList, Dict, NamedFileCollection, NamedFile, TextFile, Dictionary,
Embeddings, TSVVecFiles, GensimLdaModel, NumpyArray, ScipySparseMatrix,
NumericResult,
]
BUILTIN_DATATYPES_BY_DATATYPE_NAME = dict(
# Go through them in reverse, so that, if we make a mistake and have a duplicate name, the
Expand Down
15 changes: 15 additions & 0 deletions src/python/pimlico/datatypes/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@ def __init__(self, *args, **kwargs):
super(PlotOutput, self).__init__(["plot.py", "data.csv", "plot.pdf"], *args, **kwargs)

class Writer(object):
"""
Writes out source data, a Python script for the plotting using Matplotlib and
a PDF of the resulting plot, if the script completes successfully.
This approach means that a plot is produced immediately, but can easily be tweaked
and customized for later use elsewhere by copying and editing the Python
plotting script.
Use ``writer.write_file("data.csv", text=True)`` to write the source data and
``writer.write_file("plot.py", text=True)`` to write the plotting script, which should
output a file ``plot.pdf``. Then call ``writer.plot()`` to execute the
script. If this fails, at least the other files are there so the user can
correct the errors and use them if they want.
"""
def plot(self):
"""
Runs the plotting script. Errors are not caught, so if there's a problem in the script they'll be raised.
Expand Down
52 changes: 52 additions & 0 deletions src/python/pimlico/datatypes/results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

import json
import os

from pimlico.datatypes.base import PimlicoDatatype
from pimlico.utils.core import cached_property


class NumericResult(PimlicoDatatype):
    """
    Minimal datatype holding one labelled value, e.g. the score produced by
    evaluating a model on a task.

    The result is stored on disk as a single JSON object with the keys
    ``label`` and ``value`` in the file ``data.json`` in the data directory.

    Write using ``writer.write(label, value)``. The label must be a string
    identifying what the result is, e.g. "f-score". The value can be any
    JSON-serializable type, e.g. int or float.

    Among other things, this allows results to be plotted by passing them
    into a graph plotting module.

    """
    datatype_name = "numeric_result"
    datatype_supports_python2 = True

    class Reader(object):
        class Setup(object):
            def get_required_paths(self):
                """The only file needed for the data to be readable is the JSON file."""
                return ["data.json"]

        @cached_property
        def data(self):
            """The full decoded content of ``data.json``, loaded on first access."""
            json_path = os.path.join(self.data_dir, "data.json")
            with open(json_path, "r") as json_file:
                loaded = json.load(json_file)
            return loaded

        @cached_property
        def label(self):
            """The string stored under the ``label`` key."""
            return self.data["label"]

        @cached_property
        def value(self):
            """The result stored under the ``value`` key."""
            return self.data["value"]

    class Writer(object):
        # Single task, marked as done by write()
        required_tasks = ["data"]

        def write(self, label, value):
            """
            Store the label and value in ``data.json`` in the data directory
            and mark the ``data`` task as complete.

            :param label: string identifying what the result is
            :param value: the result itself; must be JSON-serializable
            """
            payload = {"label": label, "value": value}
            json_path = os.path.join(self.data_dir, "data.json")
            with open(json_path, "w") as json_file:
                json.dump(payload, json_file)
            self.task_complete("data")
Original file line number Diff line number Diff line change
Expand Up @@ -4,48 +4,44 @@

from future import standard_library
standard_library.install_aliases()
from builtins import zip
from builtins import str
from builtins import zip, str

import os
import csv
from io import StringIO

from pimlico.core.modules.base import BaseModuleExecutor
import csv


from pimlico.old_datatypes.plotting import PlotOutputWriter


class ModuleExecutor(BaseModuleExecutor):
    """
    Executor for the bar chart module: dumps the input results to a CSV file,
    copies the template plotting script next to it and runs the script.

    """
    def execute(self):
        """
        Collect the label and value of every input result, write them to the
        "plot" output together with the plotting script, then run the plotter.

        Any error raised while writing files or by ``writer.plot()`` is not
        caught here.
        """
        # Get values and labels from the inputs
        self.log.info("Collecting data")
        # always_list=True is passed so that a single input is handled the same way as several
        inputs = self.info.get_input("results", always_list=True)
        labels = [result.label for result in inputs]
        values = [result.value for result in inputs]

        self.log.info("Outputting data and plotting code")
        with self.info.get_output_writer("plot") as writer:
            # Prepare data to go to CSV file: one "label,value" row per input
            sio = StringIO()
            csv_writer = csv.writer(sio)
            for label, value in zip(labels, values):
                # Values may be any JSON-serializable type, so use str() rather than a float format
                csv_writer.writerow([str(label), str(value)])
            writer.write_file("data.csv", sio.getvalue(), text=True)

            # Use a standard template plot python file, stored alongside this module
            template_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "plot_template.py")
            with open(template_path, "r") as f:
                plotting_code = f.read()
            # Remove the first line, which is a comment to explain what the file is
            plotting_code = "\n".join(plotting_code.splitlines()[1:])
            # Otherwise, the script stays exactly as it is
            writer.write_file("plot.py", plotting_code, text=True)

            # Written the plot code and data
            # Now do the plotting
            self.log.info("Running plotter")
            writer.plot()

            self.log.info("Plot output to {}".format(writer.plot_path))
            self.log.info("Customize plot by editing {} and recompiling: python plot.py".format(writer.code_path))
40 changes: 40 additions & 0 deletions src/python/pimlico/modules/visualization/bar_chart/info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

"""
Simple plotting of a bar chart from numeric results data using Matplotlib.
"""
from pimlico.core.modules.options import comma_separated_strings

from pimlico.core.modules.base import BaseModuleInfo
from pimlico.datatypes.base import MultipleInputs
from pimlico.datatypes.plotting import PlotOutput
from pimlico.datatypes.results import NumericResult
from pimlico.modules.visualization import matplotlib_dependency


class ModuleInfo(BaseModuleInfo):
    """
    Module definition for the bar chart plotter: takes one or more numeric
    results as input and produces a single plot output.

    """
    module_type_name = "bar_chart"
    module_readable_name = "Bar chart plotter"
    module_inputs = [
        ("results", MultipleInputs(NumericResult())),
    ]
    module_outputs = [
        ("plot", PlotOutput()),
    ]
    module_supports_python2 = True
    module_options = {
        "labels": {
            "help": (
                "If given, a list of labels corresponding to the inputs to use in plots. Otherwise, "
                "inputs are numbered and the labels provided in their label fields are used"
            ),
            "type": comma_separated_strings,
        },
        "colors": {
            "help": (
                "Pyplot colors to use for each series. "
                "If shorter than the number of inputs, cycles round. "
                "Specify according to pyplot docs: https://matplotlib.org/2.0.2/api/colors_api.html. "
                "E.g. use single-letter color names, HTML color codes or HTML color names"
            ),
            "type": comma_separated_strings,
            # Single-letter pyplot color names: r, g, b, y, c, m, k
            "default": list("rgbycmk"),
        },
    }

    def get_software_dependencies(self):
        """Matplotlib is the only software dependency declared here."""
        dependencies = [matplotlib_dependency]
        return dependencies
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
# This file is part of Pimlico
# Copyright (C) 2020 Mark Granroth-Wilding
# Licensed under the GNU LGPL v3.0 - https://www.gnu.org/licenses/lgpl-3.0.en.html

# Template plotting script that will be copied into the output dir so that it's easy to customize the plot afterwards
"""
This is a basic template for plotting a bar chart of the data in data.csv. It's been output by Pimlico's
Expand Down

This file was deleted.

This file was deleted.

63 changes: 0 additions & 63 deletions src/python/pimlico/old_datatypes/plotting.py

This file was deleted.

61 changes: 0 additions & 61 deletions src/python/pimlico/old_datatypes/results.py

This file was deleted.

1 change: 1 addition & 0 deletions test/data/datasets/results/A/corpus_metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
1 change: 1 addition & 0 deletions test/data/datasets/results/A/data/data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"label": "A", "value": 6.5}
1 change: 1 addition & 0 deletions test/data/datasets/results/B/corpus_metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
1 change: 1 addition & 0 deletions test/data/datasets/results/B/data/data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"label": "B", "value": 13.2}
1 change: 1 addition & 0 deletions test/data/datasets/results/C/corpus_metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
1 change: 1 addition & 0 deletions test/data/datasets/results/C/data/data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"label": "C", "value": 10.76}

0 comments on commit f2487bf

Please sign in to comment.