Merge pull request #25 from nlesc-nano/standalone
improve report of results without metadata
felipeZ committed Feb 12, 2021
2 parents 3e6686e + f8b43bd commit 0572936
Showing 10 changed files with 150 additions and 89 deletions.
36 changes: 18 additions & 18 deletions ceibacli/actions/add.py
@@ -19,7 +19,8 @@
from ..client import query_server
from ..client.mutations import create_job_mutation
from ..client.queries import create_properties_query
from ..utils import Options, json_properties_to_dataframe
from ..utils import (Options, generate_smile_identifier,
json_properties_to_dataframe)

logger = logging.getLogger(__name__)

@@ -31,7 +32,20 @@ def fetch_candidates(opts: Options) -> pd.DataFrame:
return json_properties_to_dataframe(reply["properties"])


def create_mutations(row: pd.Series, opts: Options) -> str:
def add_jobs(opts: Options) -> None:
"""Add new jobs to the server."""
opts.cookie = fetch_cookie()
# Get the data to create the jobs
df_candidates = fetch_candidates(opts)
# Create the mutation to add the jobs in the server
mutations = (create_mutations(opts, smile) for smile in df_candidates["smile"])
logger.info("New Jobs:")
for query in mutations:
reply = query_server(opts.web, query)
logger.info(reply['createJob']['text'])


def create_mutations(opts: Options, smile: str) -> str:
"""Create a list of mutations with the new jobs."""
job_info = defaultdict(lambda: "null") # type: DefaultDict[str, Any]
prop_info = defaultdict(lambda: "null") # type: DefaultDict[str, Any]
@@ -41,28 +55,14 @@ def create_mutations(row: pd.Series, opts: Options) -> str:
"settings": format_settings(opts.settings)})

prop_info.update({
"smile_id": row._id,
"smile": row.smile,
"smile_id": generate_smile_identifier(smile),
"smile": smile,
"collection_name": generate_collection_name(opts.settings),
})

return create_job_mutation(opts.cookie, job_info, prop_info)


def add_jobs(opts: Options) -> None:
"""Add new jobs to the server."""
opts.cookie = fetch_cookie()
# Get the data to create the jobs
df_candidates = fetch_candidates(opts)
# Create the mutation to add the jobs in the server
rows = df_candidates[["_id", "smile"]].iterrows()
mutations = (create_mutations(row, opts) for _, row in rows)
logger.info("New Jobs:")
for query in mutations:
reply = query_server(opts.web, query)
logger.info(reply['createJob']['text'])


def format_settings(settings: Options) -> str:
"""Format the settings as string."""
string = json.dumps(settings.to_dict())
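A side note on the defaultdict(lambda: "null") pattern that create_mutations relies on: any field that is never set renders as the literal string "null" when the GraphQL mutation is assembled, so unset job fields reach the server as GraphQL nulls. A minimal sketch of that behavior; the "user" key is illustrative, not taken from the commit:

from collections import defaultdict
from typing import Any, DefaultDict

job_info = defaultdict(lambda: "null")  # type: DefaultDict[str, Any]
job_info.update({"job_status": "AVAILABLE"})

print(job_info["job_status"])  # AVAILABLE
print(job_info["user"])        # null  (never set, so it falls back to the default)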
53 changes: 30 additions & 23 deletions ceibacli/actions/report.py
@@ -23,7 +23,7 @@
from ..client.mutations import (create_job_update_mutation,
create_property_mutation)
from ..swift_interface import SwiftAction
from ..utils import Options
from ..utils import Options, generate_smile_identifier

__all__ = ["report_properties"]

@@ -35,27 +35,34 @@ def report_properties(opts: Options) -> None:
# fetch authentication credentials
opts.cookie = fetch_cookie()

if opts.is_standalone:
report_standalone_properties(opts)
else:
if opts.has_metadata:
report_jobs_properties(opts)
elif opts.collection_name is None:
msg = "A collection name is required if the results don't have metadata"
raise RuntimeError(msg)
else:
# The results don't have associated jobs
report_standalone_properties(opts)


def report_standalone_properties(opts: Options) -> None:
"""Send standalone data to a given collection."""
data = read_result_from_folder(Path(opts.path_results), opts.output)
query = create_standalone_mutation(opts, data)
query_server(opts.web, query)
for output in Path(opts.path_results).glob(opts.output):
smile, data = read_properties_from_csv(output)
data = data.replace('\"', '\\"')
query = create_standalone_mutation(opts, smile, data)
query_server(opts.web, query)

logger.info(f"Standalone data has been sent to collection: {opts.collection_name}")


def report_jobs_properties(opts: Options) -> None:
"""Report properties coming from a server's job."""
path = Path(opts.path_results)
if not path.exists():
raise FileNotFoundError(f"There is not results folder:{path}")
raise FileNotFoundError(f"There are not results folders:{path}")
# Collect results folders
folders = collect_results(path)
folders = collect_results(path, pattern="job_*")

# Add metadata to the jobs
shared_data = {
@@ -106,8 +113,8 @@ def retrieve_data(path: Path, opts: Options) -> Tuple[Dict[str, Any], DefaultDic
"data": data,
"large_objects": large_objects,
"input": read_input_files(path, opts.input),
"geometry": read_optimized_geometry(path, opts.geometry)
})
"geometry": read_optimized_geometry(path, opts.geometry)}
)

job_metadata = {"job_id": metadata["job_id"], "status": status}
return job_metadata, prop_data
@@ -147,9 +154,9 @@ def read_data_and_job_status(path: Path, pattern: str) -> Tuple[str, str]:
return data, status


def collect_results(path_results: Path) -> List[Path]:
def collect_results(path_results: Path, pattern: str) -> List[Path]:
"""Gather all the results from the jobs."""
return [x for x in path_results.glob("job_*") if x.is_dir()]
return [x for x in path_results.glob(pattern) if x.is_dir()]


def read_result_from_folder(folder: Path, pattern: str) -> pd.DataFrame:
@@ -163,7 +170,7 @@ def read_result_from_folder(folder: Path, pattern: str) -> pd.DataFrame:
# Read the results from the file
suffix = result_file.suffix
if suffix == ".csv":
data = read_properties_from_csv(result_file)
_smile, data = read_properties_from_csv(result_file)
elif suffix == ".json":
data = read_properties_from_json(result_file)
else:
@@ -172,22 +179,23 @@

return data.replace('\"', '\\"')


def read_properties_from_json(path_results: Path) -> str:
"""Read JSON file."""
with open(path_results, 'r') as handler:
data = json.load(handler)
return json.dumps(data)


def read_properties_from_csv(path_results: Path) -> str:
def read_properties_from_csv(path_results: Path) -> Tuple[str, str]:
"""From a csv file to a pandas DataFrame."""
df = pd.read_csv(path_results).reset_index(drop=True)

smile, = df["smiles"]
# clean the data
columns_to_exclude = [x for x in df.columns if x in {"smiles"}]
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.drop(columns=columns_to_exclude, inplace=True)
return df.to_json()
return smile, df.to_json()


def read_metadata(path_job: Path) -> Dict[str, Any]:
@@ -200,16 +208,15 @@ def read_metadata(path_job: Path) -> Dict[str, Any]:
return yaml.load(handler.read(), Loader=yaml.FullLoader)


def create_standalone_mutation(opts: Options, data: str) -> str:
""""Create query to mutate standalone data."""
def create_standalone_mutation(opts: Options, smile: str, data: str) -> str:
"""Create query to mutate standalone data."""
info = defaultdict(lambda: "null")
info['data'] = data

# Read metadata from workdir
metadata = read_metadata(Path(opts.path_results))["property"]
info["smile_id"] = metadata["smile_id"]
info["smile"] = metadata["smile"]
info['collection_name'] = metadata["collection_name"]
info["smile_id"] = generate_smile_identifier(smile)
info["smile"] = smile
info['collection_name'] = opts.collection_name

return create_property_mutation(opts.cookie, info)

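To make the reworked csv path concrete, here is a rough, self-contained sketch of what read_properties_from_csv now returns; the column names gap and dipole are invented for illustration, and only the single-row layout with a "smiles" column reflects what the function expects:

import io

import pandas as pd

csv_file = io.StringIO("smiles,gap,dipole\nCCO,4.2,1.7\n")  # hypothetical results file

df = pd.read_csv(csv_file).reset_index(drop=True)
smile, = df["smiles"]  # tuple unpacking enforces exactly one row
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=["smiles"])

print(smile)         # CCO
print(df.to_json())  # {"gap":{"0":4.2},"dipole":{"0":1.7}}

The tuple unpacking raises a ValueError if the file holds more or fewer than one row, which is worth keeping in mind when preparing standalone result files.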
28 changes: 7 additions & 21 deletions ceibacli/cli.py
@@ -12,22 +12,13 @@
from .actions import (add_jobs, compute_jobs, login_insilico, manage_jobs, query_properties,
report_properties)
from .input_validation import DEFAULT_WEB, validate_input
from .utils import Options
from .utils import Options, exists

logger = logging.getLogger(__name__)

VERSION = pkg_resources.get_distribution('ceibacli').version


def exists(input_file: str) -> Path:
"""Check if the input file exists."""
path = Path(input_file)
if not path.exists():
raise argparse.ArgumentTypeError(f"{input_file} doesn't exist!")

return path


def configure_logger(workdir: Path) -> None:
"""Set the logging infrasctucture."""
file_log = workdir / 'ceibacli_output.log'
@@ -54,12 +45,9 @@ def parse_user_arguments() -> Tuple[str, Options]:
# Common arguments
parent_parser = argparse.ArgumentParser(add_help=False)

# you should provide either the input file with the arguments
# or each argument in the command line
group = parent_parser.add_mutually_exclusive_group()
# Common collection argument
group.add_argument("-i", "--input", type=exists, help="Yaml input file")
group.add_argument("-w", "--web", default=DEFAULT_WEB, help="Web Service URL")
parent_parser.add_argument("-i", "--input", type=exists, help="Yaml input file")
parent_parser.add_argument("-w", "--web", default=DEFAULT_WEB, help="Web Service URL")

# Command line arguments share
collection_parser = argparse.ArgumentParser(add_help=False)
@@ -74,8 +62,7 @@
subparsers.add_parser("compute", help="Compute available jobs", parents=[parent_parser, collection_parser])

# Report properties to the database
subparsers.add_parser(
"report", help="Report the results back to the server", parents=[parent_parser])
subparsers.add_parser("report", help="Report the results back to the server", parents=[parent_parser, collection_parser])

# Request data from the database
subparsers.add_parser(
@@ -102,10 +89,9 @@

def handle_input(args: argparse.Namespace) -> Options:
"""Check user input."""
if getattr(args, "input", None) is not None:
input_file = args.input
else:
user_input = {key: value for key, value in vars(args).items() if key not in {"command", "input"}}
input_file = getattr(args, "input", None)
if input_file is None:
user_input = {key: value for key, value in vars(args).items() if key not in {"command"}}
input_file = Path(tempfile.gettempdir()) / "user_input.yml"
with open(input_file, 'w') as handler:
yaml.dump(user_input, handler)
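For context, the mutually exclusive group deleted here is what used to make argparse reject -i combined with -w. A minimal reproduction of the old behavior, with a placeholder URL standing in for DEFAULT_WEB:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-i", "--input")
group.add_argument("-w", "--web", default="http://example.org/graphql")  # placeholder default

# parser.parse_args(["-i", "input.yml", "-w", "http://example.org"])
# -> error: argument -w/--web: not allowed with argument -i/--input

After this change both flags hang off parent_parser directly, so an input file and a web service URL can be given together.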
7 changes: 5 additions & 2 deletions ceibacli/input_validation.py
@@ -109,8 +109,11 @@ def is_in_array_uppercase(array: Iterable[str]) -> Schema:
# Pattern to search for the optimized geometry
Optional("geometry", default="geometry*xyz"): str,

# The data to report is not associated to a job
Optional("is_standalone", default=False): bool,
# The data to report is associated to a job
Optional("has_metadata", default=True): bool,

# If there is no metadata, a collection name is required
Optional("collection_name", default=None): Or(None, str),

# If the data is already in server you can either:
# KEEP the old data
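For reference, the new keys follow the schema library's Optional-with-default pattern, so a report input that omits them validates to has_metadata=True and collection_name=None. A minimal sketch of just this fragment; the full schema in the commit contains many more keys:

from schema import Optional, Or, Schema

report_fragment = Schema({
    Optional("has_metadata", default=True): bool,
    Optional("collection_name", default=None): Or(None, str),
})

print(report_fragment.validate({}))
# {'has_metadata': True, 'collection_name': None}
print(report_fragment.validate({"has_metadata": False, "collection_name": "functional/basisset"}))
# {'has_metadata': False, 'collection_name': 'functional/basisset'}

This is why report_properties only has to guard against a missing collection_name when has_metadata is False.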
22 changes: 22 additions & 0 deletions ceibacli/utils.py
@@ -1,9 +1,15 @@
"""Utility functions."""

import argparse
import hashlib
import json
from pathlib import Path
from typing import Any, Dict, List, TypeVar

import pandas as pd

__all__ = ["Options", "exists", "generate_smile_identifier", "json_properties_to_dataframe"]

T = TypeVar('T')


@@ -42,6 +48,15 @@ def converter(var):
return {k: converter(v) for k, v in self.items()}


def exists(input_file: str) -> Path:
"""Check if the input file exists."""
path = Path(input_file)
if not path.exists():
raise argparse.ArgumentTypeError(f"{input_file} doesn't exist!")

return path


def json_properties_to_dataframe(properties: List[Dict[str, Any]]) -> pd.DataFrame:
"""Transform a JSON list of dictionaries into a pandas DataFrame."""
df = pd.DataFrame(properties)
@@ -51,3 +66,10 @@ def json_properties_to_dataframe(properties: List[Dict[str, Any]]) -> pd.DataFra
df['data'] = df['data'].apply(lambda x: json.loads(x))

return df


def generate_smile_identifier(smile: str) -> str:
"""Generate a (hopefully) for an smile that doesn't have a unique identifier."""
obj = hashlib.md5(smile.encode())
dig = obj.hexdigest()
return str(int(dig[-12:], 16))
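A quick illustration of what generate_smile_identifier produces: the last 12 hex digits of the MD5 digest are reinterpreted as a decimal string, so the identifier is deterministic but only 48 bits wide, hence the docstring's "hopefully unique" hedge. A self-contained sketch, repeating the function above so it runs standalone:

import hashlib

def generate_smile_identifier(smile: str) -> str:
    """Generate a (hopefully unique) identifier for a smile that doesn't have one."""
    obj = hashlib.md5(smile.encode())
    dig = obj.hexdigest()
    return str(int(dig[-12:], 16))

assert generate_smile_identifier("CCO") == generate_smile_identifier("CCO")  # deterministic
print(generate_smile_identifier("CCO"))  # a decimal string of at most 15 digits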
42 changes: 31 additions & 11 deletions docs/report.rst
@@ -2,9 +2,9 @@
Report
======
The ``report`` command sends the results of the jobs computed by the user to
the web service. You can also send "standalone" data to the server. Where standalone
means data that is not associated to a job in the server, for example because it
has been previously computed.
the web service. You can also send data that is not associated with any job to the server.
In the latter case, the results don't have all the metadata associated with a job in the server,
for example because they were previously computed or computed at another facility.

To report the results you need to type in the terminal:
::
@@ -18,11 +18,34 @@ Or if you want to have more control over what is reported you can provide an inp

Where the *input_compute.yml* is a file in `YAML format <https://en.wikipedia.org/wiki/YAML>`_ containing the :ref:`report input` metadata.

To report results without associated jobs, follow the :ref:`report stand alone results`.

.. _report stand alone results:

Report results without associated jobs
**************************************
If you have results in *csv* format that were not computed with the
``ceibacli compute`` command, you need to specify
the following options in the YAML input file
::

# It states that the data doesn't have associated jobs
has_metadata: False
path_results: Path to the csv files containing the results

# Pattern to search for the result files (default "results*csv")
output: "result*csv*"

# Where the data is going to be stored
collection_name: simulation/name

.. _report input:

Report Input File
*****************
The input file contains the following optional keywords:
Report results from a job
*************************
If the results that you want to report were computed with the ``ceibacli compute`` command, you need
to provide the following input:
::

# Path to the Folder where the jobs run (default "workdir_ceibacli")
@@ -37,15 +60,12 @@ The input file contains the following optional keywords:
# Pattern to search for the optimized molecular geometry
geometry: "geometry*xyz"

# The data to report is not associated to a job (default False)
is_standalone: True

# If the data is already in server you can either:
# KEEP the old data
# OVERWRITE and discard the old data
# MERGE the new and the old data
# MERGE the new and the old data (Default)
# APPEND new data at the end of the old data array
# Default = KEEP
# Default = KEEP
duplication_policy: "KEEP"

Check the :ref:`large objects data storage` for further information on
5 changes: 4 additions & 1 deletion tests/files/input_test_report_standalone.yml
@@ -2,7 +2,10 @@
path_results: "tests/files/mocked_standalone"

# The data is not associated with a job
is_standalone: True
has_metadata: False

# Collection to upload the results to
collection_name: "functional/basisset"

# Pattern to search for the result files
output: "result*csv"
