Merge pull request #25 from nlesc-nano/standalone
improve report of results without metadata
felipeZ committed Feb 12, 2021
2 parents 3e6686e + f8b43bd commit 0572936
Showing 10 changed files with 150 additions and 89 deletions.
36 changes: 18 additions & 18 deletions ceibacli/actions/add.py
@@ -19,7 +19,8 @@
from ..client import query_server
from ..client.mutations import create_job_mutation
from ..client.queries import create_properties_query
from ..utils import Options, json_properties_to_dataframe
from ..utils import (Options, generate_smile_identifier,
json_properties_to_dataframe)

logger = logging.getLogger(__name__)

@@ -31,7 +32,20 @@ def fetch_candidates(opts: Options) -> pd.DataFrame:
return json_properties_to_dataframe(reply["properties"])


def create_mutations(row: pd.Series, opts: Options) -> str:
def add_jobs(opts: Options) -> None:
"""Add new jobs to the server."""
opts.cookie = fetch_cookie()
# Get the data to create the jobs
df_candidates = fetch_candidates(opts)
# Create the mutation to add the jobs in the server
mutations = (create_mutations(opts, smile) for smile in df_candidates["smile"])
logger.info("New Jobs:")
for query in mutations:
reply = query_server(opts.web, query)
logger.info(reply['createJob']['text'])


def create_mutations(opts: Options, smile: str) -> str:
"""Create a list of mutations with the new jobs."""
job_info = defaultdict(lambda: "null") # type: DefaultDict[str, Any]
prop_info = defaultdict(lambda: "null") # type: DefaultDict[str, Any]
@@ -41,28 +55,14 @@ def create_mutations(row: pd.Series, opts: Options) -> str:
"settings": format_settings(opts.settings)})

prop_info.update({
"smile_id": row._id,
"smile": row.smile,
"smile_id": generate_smile_identifier(smile),
"smile": smile,
"collection_name": generate_collection_name(opts.settings),
})

return create_job_mutation(opts.cookie, job_info, prop_info)


def add_jobs(opts: Options) -> None:
"""Add new jobs to the server."""
opts.cookie = fetch_cookie()
# Get the data to create the jobs
df_candidates = fetch_candidates(opts)
# Create the mutation to add the jobs in the server
rows = df_candidates[["_id", "smile"]].iterrows()
mutations = (create_mutations(row, opts) for _, row in rows)
logger.info("New Jobs:")
for query in mutations:
reply = query_server(opts.web, query)
logger.info(reply['createJob']['text'])


def format_settings(settings: Options) -> str:
"""Format the settings as string."""
string = json.dumps(settings.to_dict())
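A side note on the defaultdict(lambda: "null") pattern that create_mutations relies on: any field that is never set renders as the literal string "null" when the GraphQL mutation is assembled, so unset job fields reach the server as GraphQL nulls. A minimal sketch of that behavior; the "user" key is illustrative, not taken from the commit:

from collections import defaultdict
from typing import Any, DefaultDict

job_info = defaultdict(lambda: "null")  # type: DefaultDict[str, Any]
job_info.update({"job_status": "AVAILABLE"})

print(job_info["job_status"])  # AVAILABLE
print(job_info["user"])        # null  (never set, so it falls back to the default)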
53 changes: 30 additions & 23 deletions ceibacli/actions/report.py
@@ -23,7 +23,7 @@
from ..client.mutations import (create_job_update_mutation,
create_property_mutation)
from ..swift_interface import SwiftAction
from ..utils import Options
from ..utils import Options, generate_smile_identifier

__all__ = ["report_properties"]

@@ -35,27 +35,34 @@ def report_properties(opts: Options) -> None:
# fetch authentication credentials
opts.cookie = fetch_cookie()

if opts.is_standalone:
report_standalone_properties(opts)
else:
if opts.has_metadata:
report_jobs_properties(opts)
elif opts.collection_name is None:
msg = "A collection name is required if the results don't have metadata"
raise RuntimeError(msg)
else:
# The results don't have associated jobs
report_standalone_properties(opts)


def report_standalone_properties(opts: Options) -> None:
"""Send standalone data to a given collection."""
data = read_result_from_folder(Path(opts.path_results), opts.output)
query = create_standalone_mutation(opts, data)
query_server(opts.web, query)
for output in Path(opts.path_results).glob(opts.output):
smile, data = read_properties_from_csv(output)
data = data.replace('\"', '\\"')
query = create_standalone_mutation(opts, smile, data)
query_server(opts.web, query)

logger.info(f"Standalone data has been sent to collection: {opts.collection_name}")


def report_jobs_properties(opts: Options) -> None:
"""Report properties coming from a server's job."""
path = Path(opts.path_results)
if not path.exists():
raise FileNotFoundError(f"There is not results folder:{path}")
raise FileNotFoundError(f"There are not results folders:{path}")
# Collect results folders
folders = collect_results(path)
folders = collect_results(path, pattern="job_*")

# Add metadata to the jobs
shared_data = {
@@ -106,8 +113,8 @@ def retrieve_data(path: Path, opts: Options) -> Tuple[Dict[str, Any], DefaultDic
"data": data,
"large_objects": large_objects,
"input": read_input_files(path, opts.input),
"geometry": read_optimized_geometry(path, opts.geometry)
})
"geometry": read_optimized_geometry(path, opts.geometry)}
)

job_metadata = {"job_id": metadata["job_id"], "status": status}
return job_metadata, prop_data
@@ -147,9 +154,9 @@ def read_data_and_job_status(path: Path, pattern: str) -> Tuple[str, str]:
return data, status


def collect_results(path_results: Path) -> List[Path]:
def collect_results(path_results: Path, pattern: str) -> List[Path]:
"""Gather all the results from the jobs."""
return [x for x in path_results.glob("job_*") if x.is_dir()]
return [x for x in path_results.glob(pattern) if x.is_dir()]


def read_result_from_folder(folder: Path, pattern: str) -> pd.DataFrame:
@@ -163,7 +170,7 @@ def read_result_from_folder(folder: Path, pattern: str) -> pd.DataFrame:
# Read the results from the file
suffix = result_file.suffix
if suffix == ".csv":
data = read_properties_from_csv(result_file)
_smile, data = read_properties_from_csv(result_file)
elif suffix == ".json":
data = read_properties_from_json(result_file)
else:
@@ -172,22 +179,23 @@

return data.replace('\"', '\\"')


def read_properties_from_json(path_results: Path) -> str:
"""Read JSON file."""
with open(path_results, 'r') as handler:
data = json.load(handler)
return json.dumps(data)


def read_properties_from_csv(path_results: Path) -> str:
def read_properties_from_csv(path_results: Path) -> Tuple[str, str]:
"""From a csv file to a pandas DataFrame."""
df = pd.read_csv(path_results).reset_index(drop=True)

smile, = df["smiles"]
# clean the data
columns_to_exclude = [x for x in df.columns if x in {"smiles"}]
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.drop(columns=columns_to_exclude, inplace=True)
return df.to_json()
return smile, df.to_json()


def read_metadata(path_job: Path) -> Dict[str, Any]:
@@ -200,16 +208,15 @@ def read_metadata(path_job: Path) -> Dict[str, Any]:
return yaml.load(handler.read(), Loader=yaml.FullLoader)


def create_standalone_mutation(opts: Options, data: str) -> str:
""""Create query to mutate standalone data."""
def create_standalone_mutation(opts: Options, smile: str, data: str) -> str:
"""Create query to mutate standalone data."""
info = defaultdict(lambda: "null")
info['data'] = data

# Read metadata from workdir
metadata = read_metadata(Path(opts.path_results))["property"]
info["smile_id"] = metadata["smile_id"]
info["smile"] = metadata["smile"]
info['collection_name'] = metadata["collection_name"]
info["smile_id"] = generate_smile_identifier(smile)
info["smile"] = smile
info['collection_name'] = opts.collection_name

return create_property_mutation(opts.cookie, info)

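To make the reworked csv path concrete, here is a rough, self-contained sketch of what read_properties_from_csv now returns; the column names gap and dipole are invented for illustration, and only the single-row layout with a "smiles" column reflects what the function expects:

import io

import pandas as pd

csv_file = io.StringIO("smiles,gap,dipole\nCCO,4.2,1.7\n")  # hypothetical results file

df = pd.read_csv(csv_file).reset_index(drop=True)
smile, = df["smiles"]  # tuple unpacking enforces exactly one row
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=["smiles"])

print(smile)         # CCO
print(df.to_json())  # {"gap":{"0":4.2},"dipole":{"0":1.7}}

The tuple unpacking raises a ValueError if the file holds more or fewer than one row, which is worth keeping in mind when preparing standalone result files.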
28 changes: 7 additions & 21 deletions ceibacli/cli.py
@@ -12,22 +12,13 @@
from .actions import (add_jobs, compute_jobs, login_insilico, manage_jobs, query_properties,
report_properties)
from .input_validation import DEFAULT_WEB, validate_input
from .utils import Options
from .utils import Options, exists

logger = logging.getLogger(__name__)

VERSION = pkg_resources.get_distribution('ceibacli').version


def exists(input_file: str) -> Path:
"""Check if the input file exists."""
path = Path(input_file)
if not path.exists():
raise argparse.ArgumentTypeError(f"{input_file} doesn't exist!")

return path


def configure_logger(workdir: Path) -> None:
"""Set the logging infrasctucture."""
file_log = workdir / 'ceibacli_output.log'
@@ -54,12 +45,9 @@ def parse_user_arguments() -> Tuple[str, Options]:
# Common arguments
parent_parser = argparse.ArgumentParser(add_help=False)

# you should provide either the input file with the arguments
# or each argument in the command line
group = parent_parser.add_mutually_exclusive_group()
# Common collection argument
group.add_argument("-i", "--input", type=exists, help="Yaml input file")
group.add_argument("-w", "--web", default=DEFAULT_WEB, help="Web Service URL")
parent_parser.add_argument("-i", "--input", type=exists, help="Yaml input file")
parent_parser.add_argument("-w", "--web", default=DEFAULT_WEB, help="Web Service URL")

# Command line arguments share
collection_parser = argparse.ArgumentParser(add_help=False)
@@ -74,8 +62,7 @@
subparsers.add_parser("compute", help="Compute available jobs", parents=[parent_parser, collection_parser])

# Report properties to the database
subparsers.add_parser(
"report", help="Report the results back to the server", parents=[parent_parser])
subparsers.add_parser("report", help="Report the results back to the server", parents=[parent_parser, collection_parser])

# Request data from the database
subparsers.add_parser(
@@ -102,10 +89,9 @@

def handle_input(args: argparse.Namespace) -> Options:
"""Check user input."""
if getattr(args, "input", None) is not None:
input_file = args.input
else:
user_input = {key: value for key, value in vars(args).items() if key not in {"command", "input"}}
input_file = getattr(args, "input", None)
if input_file is None:
user_input = {key: value for key, value in vars(args).items() if key not in {"command"}}
input_file = Path(tempfile.gettempdir()) / "user_input.yml"
with open(input_file, 'w') as handler:
yaml.dump(user_input, handler)
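For context, the mutually exclusive group deleted here is what used to make argparse reject -i combined with -w. A minimal reproduction of the old behavior, with a placeholder URL standing in for DEFAULT_WEB:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("-i", "--input")
group.add_argument("-w", "--web", default="http://example.org/graphql")  # placeholder default

# parser.parse_args(["-i", "input.yml", "-w", "http://example.org"])
# -> error: argument -w/--web: not allowed with argument -i/--input

After this change both flags hang off parent_parser directly, so an input file and a web service URL can be given together.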
7 changes: 5 additions & 2 deletions ceibacli/input_validation.py
@@ -109,8 +109,11 @@ def is_in_array_uppercase(array: Iterable[str]) -> Schema:
# Pattern to search for the optimized geometry
Optional("geometry", default="geometry*xyz"): str,

# The data to report is not associated to a job
Optional("is_standalone", default=False): bool,
# The data to report is associated to a job
Optional("has_metadata", default=True): bool,

# If there is no metadata, a collection name is required
Optional("collection_name", default=None): Or(None, str),

# If the data is already in server you can either:
# KEEP the old data
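For reference, the new keys follow the schema library's Optional-with-default pattern, so a report input that omits them validates to has_metadata=True and collection_name=None. A minimal sketch of just this fragment; the full schema in the commit contains many more keys:

from schema import Optional, Or, Schema

report_fragment = Schema({
    Optional("has_metadata", default=True): bool,
    Optional("collection_name", default=None): Or(None, str),
})

print(report_fragment.validate({}))
# {'has_metadata': True, 'collection_name': None}
print(report_fragment.validate({"has_metadata": False, "collection_name": "functional/basisset"}))
# {'has_metadata': False, 'collection_name': 'functional/basisset'}

This is why report_properties only has to guard against a missing collection_name when has_metadata is False.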
22 changes: 22 additions & 0 deletions ceibacli/utils.py
@@ -1,9 +1,15 @@
"""Utility functions."""

import argparse
import hashlib
import json
from pathlib import Path
from typing import Any, Dict, List, TypeVar

import pandas as pd

__all__ = ["Options", "exists", "generate_smile_identifier", "json_properties_to_dataframe"]

T = TypeVar('T')


@@ -42,6 +48,15 @@ def converter(var):
return {k: converter(v) for k, v in self.items()}


def exists(input_file: str) -> Path:
"""Check if the input file exists."""
path = Path(input_file)
if not path.exists():
raise argparse.ArgumentTypeError(f"{input_file} doesn't exist!")

return path


def json_properties_to_dataframe(properties: List[Dict[str, Any]]) -> pd.DataFrame:
"""Transform a JSON list of dictionaries into a pandas DataFrame."""
df = pd.DataFrame(properties)
@@ -51,3 +66,10 @@ def json_properties_to_dataframe(properties: List[Dict[str, Any]]) -> pd.DataFra
df['data'] = df['data'].apply(lambda x: json.loads(x))

return df


def generate_smile_identifier(smile: str) -> str:
"""Generate a (hopefully) for an smile that doesn't have a unique identifier."""
obj = hashlib.md5(smile.encode())
dig = obj.hexdigest()
return str(int(dig[-12:], 16))
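A quick illustration of what generate_smile_identifier produces: the last 12 hex digits of the MD5 digest are reinterpreted as a decimal string, so the identifier is deterministic but only 48 bits wide, hence the docstring's "hopefully unique" hedge. A self-contained sketch, repeating the function above so it runs standalone:

import hashlib

def generate_smile_identifier(smile: str) -> str:
    """Generate a (hopefully unique) identifier for a smile that doesn't have one."""
    obj = hashlib.md5(smile.encode())
    dig = obj.hexdigest()
    return str(int(dig[-12:], 16))

assert generate_smile_identifier("CCO") == generate_smile_identifier("CCO")  # deterministic
print(generate_smile_identifier("CCO"))  # a decimal string of at most 15 digits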
42 changes: 31 additions & 11 deletions docs/report.rst
@@ -2,9 +2,9 @@
Report
======
The ``report`` command sends the results of the jobs computed by the user to
the web service. You can also send "standalone" data to the server. Where standalone
means data that is not associated to a job in the server, for example because it
has been previously computed.
the web service. You can also send data that is not associated with any job to the server.
In the latter case, the results don't have all the metadata associated with a job in the server,
for example because they were previously computed or computed at another facility.

To report the results you need to type in the terminal:
::
@@ -18,11 +18,34 @@ Or if you want to have more control over what is reported you can provide an inp

Where the *input_compute.yml* is a file in `YAML format <https://en.wikipedia.org/wiki/YAML>`_ containing the :ref:`report input` metadata.

To report results without associated jobs, follow the :ref:`report stand alone results`.

.. _report stand alone results:

Report results without associated jobs
**************************************
If you have results in *csv* format that were not computed with the
``ceibacli compute`` command, you need to specify
the following options in the YAML input file
::

# It states that the data doesn't have associated jobs
has_metadata: False
path_results: Path to the csv files containing the results

# Pattern to search for the result files (default "results*csv")
output: "result*csv*"

# Where the data is going to be stored
collection_name: simulation/name

.. _report input:

Report Input File
*****************
The input file contains the following optional keywords:
Report results from a job
*************************
If the results that you want to report were computed with the ``ceibacli compute`` command, you need
to provide the following input:
::

# Path to the Folder where the jobs run (default "workdir_ceibacli")
@@ -37,15 +60,12 @@ The input file contains the following optional keywords:
# Pattern to search for the optimized molecular geometry
geometry: "geometry*xyz"

# The data to report is not associated to a job (default False)
is_standalone: True

# If the data is already in server you can either:
# KEEP the old data
# OVERWRITE and discard the old data
# MERGE the new and the old data
# MERGE the new and the old data (Default)
# APPEND new data at the end of the old data array
# Default = KEEP
# Default = KEEP
duplication_policy: "KEEP"

Check the :ref:`large objects data storage` for further information on
5 changes: 4 additions & 1 deletion tests/files/input_test_report_standalone.yml
@@ -2,7 +2,10 @@
path_results: "tests/files/mocked_standalone"

# The data is not associated with a job
is_standalone: True
has_metadata: False

# Collection to upload the results to
collection_name: "functional/basisset"

# Pattern to search for the result files
output: "result*csv"
