diff --git a/.travis.yml b/.travis.yml index 6c9aadd0..d1221fc9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,5 +13,6 @@ install: script: pytest branches: only: + - 0.6-rc2 - dev - master diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index f6f0c2b4..d44606ca 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -27,6 +27,7 @@ Changelog - Various small bug fixes and dev improvements. + - Require `setuptools` for installation, and `pandas 0.20.2`. If `numexpr` is installed, version `2.6.2` is required. - **v0.5** (*2017-03-01*): diff --git a/doc/source/conf.py b/doc/source/conf.py index b7f13245..2037083e 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -140,6 +140,7 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +html_static_path = [] # it's empty; suppress warning # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied diff --git a/doc/source/config-files.rst b/doc/source/config-files.rst index 95264447..4c5d2d08 100644 --- a/doc/source/config-files.rst +++ b/doc/source/config-files.rst @@ -19,7 +19,7 @@ If you are planning to submit jobs to a cluster, then you need to know about a s That should be all you need to worry about as a pipeline user. If you need to adjust compute resources or want to develop a pipeline or have more advanced project-level control over pipelines, then you'll need to know about a few others: Pipeline developers -***************** +********************** If you want to add a new pipeline to looper, tweak the way looper interacts with a pipeline for a given project, or change the default cluster resources requested by a pipeline, then you need to know about a configuration file that coordinates linking your pipeline in to your looper project. diff --git a/doc/source/define-your-project.rst b/doc/source/define-your-project.rst index af9d543d..4a6e884e 100644 --- a/doc/source/define-your-project.rst +++ b/doc/source/define-your-project.rst @@ -40,8 +40,8 @@ For example, by default, your jobs will run serially on your local computer, whe Let's go through the more advanced details of both annotation sheets and project config files: -.. include:: sample-annotation-sheet.rst +.. include:: sample-annotation-sheet.rst.inc -.. include:: project-config.rst +.. include:: project-config.rst.inc diff --git a/doc/source/inputs.rst b/doc/source/inputs.rst deleted file mode 100644 index 07c26983..00000000 --- a/doc/source/inputs.rst +++ /dev/null @@ -1,3 +0,0 @@ -Required Inputs -============================================= - diff --git a/doc/source/pipeline-interface-mapping.rst b/doc/source/pipeline-interface-mapping.rst.inc similarity index 98% rename from doc/source/pipeline-interface-mapping.rst rename to doc/source/pipeline-interface-mapping.rst.inc index 114fd199..61a2be74 100644 --- a/doc/source/pipeline-interface-mapping.rst +++ b/doc/source/pipeline-interface-mapping.rst.inc @@ -1,4 +1,6 @@ -.. _pipeline-interface-mapping: +:orphan: + +.. 
_pi_mapping: Pipeline interface section: protocol_mapping ******************************************** diff --git a/doc/source/pipeline-interface-pipelines.rst b/doc/source/pipeline-interface-pipelines.rst.inc similarity index 99% rename from doc/source/pipeline-interface-pipelines.rst rename to doc/source/pipeline-interface-pipelines.rst.inc index 4cceeb3f..0e23cc7b 100644 --- a/doc/source/pipeline-interface-pipelines.rst +++ b/doc/source/pipeline-interface-pipelines.rst.inc @@ -1,3 +1,5 @@ +:orphan: + .. _pipeline-interface-pipelines: Pipeline interface section: pipelines diff --git a/doc/source/pipeline-interface.rst b/doc/source/pipeline-interface.rst index f7bae756..0a7888ef 100644 --- a/doc/source/pipeline-interface.rst +++ b/doc/source/pipeline-interface.rst @@ -31,7 +31,7 @@ Let's start with a very simple example. A basic ``pipeline_interface.yaml`` file The first section specifies that samples of protocol ``RRBS`` will be mapped to the pipeline specified by key ``rrbs_pipeline``. The second section describes where the pipeline named ``rrbs_pipeline`` is located and what command-line arguments it requires. Pretty simple. Let's go through each of these sections in more detail: -.. include:: pipeline-interface-mapping.rst +.. include:: pipeline-interface-mapping.rst.inc -.. include:: pipeline-interface-pipelines.rst +.. include:: pipeline-interface-pipelines.rst.inc diff --git a/doc/source/project-config.rst b/doc/source/project-config.rst.inc similarity index 99% rename from doc/source/project-config.rst rename to doc/source/project-config.rst.inc index 10695637..62e76b15 100644 --- a/doc/source/project-config.rst +++ b/doc/source/project-config.rst.inc @@ -1,3 +1,5 @@ +:orphan: + Project config file *************************************************** diff --git a/doc/source/sample-annotation-sheet.rst b/doc/source/sample-annotation-sheet.rst.inc similarity index 99% rename from doc/source/sample-annotation-sheet.rst rename to doc/source/sample-annotation-sheet.rst.inc index f6ea5c5d..a3464595 100644 --- a/doc/source/sample-annotation-sheet.rst +++ b/doc/source/sample-annotation-sheet.rst.inc @@ -1,3 +1,4 @@ +:orphan: Sample annotation sheet ************************************************** diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 425e1644..0b918829 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -41,7 +41,7 @@ Inside there will be two directories: - ``submissions`` [2]_ - which holds yaml representations of the samples and log files of the submited jobs. -The sample-specific output of each pipeline type varies and is described in :doc:`pipelines`. +The sample-specific output of each pipeline type varies. To use pre-made pipelines with your project, all you have to do is :doc:`define your project ` using looper's standard format. To link your own, custom built pipelines, you can :doc:`connect your pipeline to looper with a pipeline interface `. diff --git a/examples/microtest_project_config.yaml b/examples/microtest_project_config.yaml index 1109d790..1cc64b65 100644 --- a/examples/microtest_project_config.yaml +++ b/examples/microtest_project_config.yaml @@ -1,94 +1,35 @@ -# This project config file describes all *project-specific variables* -# Its primary purpose as as input to Looper, which will submit jobs as appropriate -# for each sample in the project. 
-# But it is also read by other tools, including: -# - project sample loop (primary purpose) -# - make_trackhubs scripts to produce web accessible results -# - stats summary scripts -# - analysis scripts requiring pointers to metadata, results, and other options. - metadata: - # output_dir: ABSOLUTE PATH to the parent, shared space where project results go output_dir: /scratch/lab_bock/shared/projects/microtest - # results and submission subdirs are subdirectors directories under parent output_dir - # results: where output sample folders will go - # submission: where cluster submit scripts and log files will go results_subdir: results_pipeline submission_subdir: submission - # pipelines_dir: ABSOLUTE PATH the directory where the Looper will find pipeline - # scripts (and accompanying pipeline config files) for submission. pipelines_dir: $CODEBASE/pipelines - # Elements in this section can be absolute or relative. - # Typically, this project config file is stored with the project metadata, so - # relative paths are considered relative to this project config file. - # sample_annotation: one-row-per-sample metadata sample_annotation: microtest_sample_annotation.csv - # merge_table: input for samples with more than one input file merge_table: microtest_merge_table.csv - # compare_table: comparison pairs or groups, like normalization samples - compare_table: null.csv - -# a list of annotation sheet columns that are "derived" -# the values in these are constructed using a regex-like expression -# of variables (defined in the next section). derived_columns: [data_source] - data_sources: - # specify the ABSOLUTE PATH of input files using variable path expressions - # entries correspond to values in the data_source column in sample_annotation table - # {variable} can be used to replace environment variables or other sample_annotation columns - # If you use {variable} codes, you should quote the field so python can parse it. bsf_samples: "{RAWDATA}{flowcell}/{flowcell}_{lane}_samples/{flowcell}_{lane}#{BSF_name}.bam" microtest: "/data/groups/lab_bock/shared/resources/microtest/{sample_name}.bam" microtest_merge: "/data/groups/lab_bock/shared/resources/microtest/{sample_name}{file_number}.bam" - subprojects: config_test: pipeline_config: wgbs.py: wgbs_ds.yaml - -genomes: - human: hg19 - mouse: mm10 - -transcriptomes: - human: hg19_cdna - mouse: mm10_cdna - +implied_columns: + organism: + human: + genomes: hg19 + transcriptome: hg19_cdna + mouse: + genome: mm10 + transcriptome: mm10_cdna pipeline_config: - # pipeline configuration files used in project. - # Key string must match the _name of the pipeline script_ (including extension) - # Relative paths are relative to this project config file. - # Default (null) means use the generic config for the pipeline. - # wgbs.py: null - # Or you can point to a specific config to be used in this project: - # rrbs.py: rrbs_config.yaml - # wgbs.py: wgbs_config.yaml - # cgps: cpgs_config.yaml - + rrbs.py: rrbs_config.yaml pipeline_args: rnaBitSeq.py: - "-w": 50 - - -trackhubs: - trackhub_dir: /data/groups/lab_bock/public_html/arendeiro/microtest/ - # url: if you include this, the make_trackhubs will produce a link to your track hub in the project folder. 
- url: http://www.whatever.com/ - matrix_x: cell_type - matrix_y: cell_count - sort_order: cell_type=+ - parent_track_name: ews_rrbs - visibility: dense - hub_name: ews_hub - short_label_column: sample_name - email: arendeiro@cemm.oeaw.ac.at - -username: user -email: user@email.com + "-w": 50 \ No newline at end of file diff --git a/looper/__init__.py b/looper/__init__.py index 19e32a81..8be0cf76 100644 --- a/looper/__init__.py +++ b/looper/__init__.py @@ -19,14 +19,18 @@ # Default user logging format is simple DEFAULT_LOGGING_FMT = "%(message)s" # Developer logger format is more information-rich -DEV_LOGGING_FMT = "%(module)s:%(lineno)d [%(levelname)s] > %(message)s " +DEV_LOGGING_FMT = "%(module)s:%(lineno)d (%(funcName)s) [%(levelname)s] > %(message)s " def setup_looper_logger(level, additional_locations=None, devmode=False): """ - Called by test configuration via `pytest`'s `conftest`. - All arguments are optional and have suitable defaults. + Establish a logger for a looper CLI program. + + This configures a logger to provide information about a looper program's + execution. Verbosity, destination(s) for messages, and message text + format are controlled by the arguments' values. This is also used by + looper's test suite. :param int | str level: logging level :param tuple(str | FileIO[str]) additional_locations: supplementary diff --git a/looper/_version.py b/looper/_version.py index 29c83aa3..76d80dda 100644 --- a/looper/_version.py +++ b/looper/_version.py @@ -1 +1 @@ -__version__ = "0.6.0-rc1" +__version__ = "0.6.0-rc2" diff --git a/looper/looper.py b/looper/looper.py index d676d17f..604261bc 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -14,25 +14,19 @@ import pandas as _pd from . import setup_looper_logger, LOGGING_LEVEL, __version__ from .loodels import Project -from .models import COMPUTE_SETTINGS_VARNAME -from .utils import VersionInHelpParser +from .models import Sample, COMPUTE_SETTINGS_VARNAME, SAMPLE_EXECUTION_TOGGLE +from .utils import alpha_cased, VersionInHelpParser try: - from .models import \ - InterfaceManager, PipelineInterface, \ - ProtocolMapper + from .models import PipelineInterface, ProtocolMapper except: sys.path.append(os.path.join(os.path.dirname(__file__), "looper")) - from models import \ - InterfaceManager, PipelineInterface, \ - ProtocolMapper + from models import PipelineInterface, ProtocolMapper from colorama import init init() from colorama import Fore, Style -SAMPLE_EXECUTION_TOGGLE = "toggle" - # Descending by severity for correspondence with logic inversion. # That is, greater verbosity setting corresponds to lower logging level. _LEVEL_BY_VERBOSITY = [logging.ERROR, logging.CRITICAL, logging.WARN, @@ -131,7 +125,7 @@ def parse_arguments(): destroy_subparser, check_subparser, clean_subparser]: subparser.add_argument( "config_file", - help="Project YAML config file.") + help="Project configuration file (YAML).") subparser.add_argument( "--file-checks", action="store_false", @@ -140,11 +134,12 @@ def parse_arguments(): "-d", "--dry-run", action="store_true", - help="Don't actually submit.") + help="Don't actually submit the project/subproject.") subparser.add_argument( "--sp", dest="subproject", - help="Supply subproject") + help="Name of subproject to use, as designated in the " + "project's configuration file") # To enable the loop to pass args directly on to the pipelines... 
args, remaining_args = parser.parse_known_args() @@ -175,7 +170,7 @@ def parse_arguments(): -def run(prj, args, remaining_args, interface_manager): +def run(prj, args, remaining_args): """ Main Looper function: Submit jobs for samples in project. @@ -184,105 +179,126 @@ def run(prj, args, remaining_args, interface_manager): :param Iterable[str] remaining_args: arguments given to this module's parser that were not defined as options it should parse, to be passed on to parser(s) elsewhere - :param InterfaceManager interface_manager: aggregator and manager of - pipeline interfaces and protocol mappings """ - # Easier change later, especially likely for library --> protocol. - _read_type = "read_type" - _protocol = "library" - - _start_counter(len(prj.samples)) - + num_samples = prj.num_samples + _start_counter(num_samples) valid_read_types = ["single", "paired"] # Keep track of how many jobs have been submitted. - submit_count = 0 - job_count = 0 + job_count = 0 # Some job templates will be skipped. + submit_count = 0 # Some jobs won't be submitted. processed_samples = set() # Create a problem list so we can keep track and show them at the end. failures = [] + _LOGGER.info("Building submission bundle(s) for protocol(s): {}". + format(list(prj.protocols))) + submission_bundle_by_protocol = { + alpha_cased(p): prj.build_submission_bundles( + alpha_cased(p)) for p in prj.protocols} + for sample in prj.samples: - _LOGGER.debug(sample) _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) - pipeline_outfolder = os.path.join( + sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) - _LOGGER.debug("Pipeline output folder: '%s'", pipeline_outfolder) + _LOGGER.debug("Sample output folder: '%s'", sample_output_folder) skip_reasons = [] # Don't submit samples with duplicate names. if sample.sample_name in processed_samples: - skip_reasons.append("Duplicate sample name.") + skip_reasons.append("Duplicate sample name") # Check if sample should be run. - if hasattr(sample, SAMPLE_EXECUTION_TOGGLE): - if sample[SAMPLE_EXECUTION_TOGGLE] != "1": - skip_reasons.append("Column '{}' deselected.".format(SAMPLE_EXECUTION_TOGGLE)) - - # Check if single_or_paired value is recognized. - if hasattr(sample, _read_type): - # Drop "-end", "_end", or just "end" from end of the column value. - sample.read_type = re.sub( - '[_\\-]?end$', '', str(sample.read_type)).lower() - if sample.read_type not in valid_read_types: - skip_reasons.append("{} must be in {}.".\ - format(_read_type, valid_read_types)) + if sample.is_dormant(): + skip_reasons.append("Inactive status (via {})". 
+ format(SAMPLE_EXECUTION_TOGGLE)) # Get the base protocol-to-pipeline mappings - if hasattr(sample, _protocol): - protocol = sample.library.upper() - pipelines = interface_manager.build_pipelines(protocol) - if len(pipelines) == 0: - skip_reasons.append( - "No pipeline found for protocol {}.".format(protocol)) + try: + protocol = alpha_cased(sample.library) + except AttributeError: + skip_reasons.append("Missing 'library' attribute") else: - skip_reasons.append("Missing '{}' attribute.".format(_protocol)) - + protocol = protocol.upper() + _LOGGER.debug("Fetching submission bundle") + try: + _LOGGER.debug("Using '%s' as protocol key", protocol) + submission_bundles = submission_bundle_by_protocol[protocol] + except KeyError: + skip_reasons.append("No pipeline found for protocol") + if not submission_bundles: + skip_reasons.append("No submission bundle for protocol") if skip_reasons: _LOGGER.warn("> Not submitted: {}".format(skip_reasons)) failures.append([skip_reasons, sample.sample_name]) continue + # TODO: determine what to do with subtype(s) here. # Processing preconditions have been met. processed_samples.add(sample.sample_name) - sample.to_yaml() + + # At this point, we have a generic Sample; write that to disk + # for reuse in case of many jobs (pipelines) using base Sample. + # Do a single overwrite here, then any subsequent Sample can be sure + # that the file is fresh, with respect to this run of looper. + sample.to_yaml(subs_folder_path=prj.metadata.submission_subdir) + + # Store the base Sample data for reuse in creating subtype(s). + sample_data = sample.as_series() # Go through all pipelines to submit for this protocol. # Note: control flow doesn't reach this point if variable "pipelines" # cannot be assigned (library/protocol missing). - for pipeline_interface, pipeline_key, pipeline_job in pipelines: + # pipeline_key (previously pl_id) is no longer necessarily + # script name, it's more flexible. + for pipeline_interface, sample_subtype, pipeline_key, pipeline_job \ + in submission_bundles: + job_count += 1 + + _LOGGER.debug("Creating %s instance: '%s'", + sample_subtype.__name__, sample.sample_name) + sample = sample_subtype(sample_data) - # pipeline_key (previously pl_id) is no longer necessarily script name, it's more flexible. # The current sample is active. # For each pipeline submission consideration, start fresh. skip_reasons = [] - _LOGGER.debug("Setting pipeline attributes for job '{}' (PL_ID: '{}')". - format(pipeline_job, pipeline_key)) - + _LOGGER.debug("Setting pipeline attributes for job '{}' " + "(PL_ID: '{}')".format(pipeline_job, pipeline_key)) try: # Add pipeline-specific attributes. sample.set_pipeline_attributes( pipeline_interface, pipeline_name=pipeline_key) except AttributeError: # TODO: inform about WHICH missing attribute(s). - fail_message = "Pipeline required attribute(s) missing." + fail_message = "Pipeline required attribute(s) missing" _LOGGER.warn("> Not submitted: %s", fail_message) skip_reasons.append(fail_message) - try: - # Check for any required inputs before submitting. - _LOGGER.debug("Confirming required inputs") - sample.confirm_required_inputs() - except IOError: - # TODO: inform about WHICH missing file(s). - fail_message = "Required input file(s) not found." - _LOGGER.warn("> Not submitted: %s", fail_message) - skip_reasons.append(fail_message) + # Check for any missing requirements before submitting. 
+ _LOGGER.debug("Determining missing requirements") + error_type, missing_reqs_msg = \ + sample.determine_missing_requirements() + if missing_reqs_msg: + if prj.permissive: + _LOGGER.warn(missing_reqs_msg) + else: + raise error_type(missing_reqs_msg) + _LOGGER.warn("> Not submitted: %s", missing_reqs_msg) + skip_reasons.append(missing_reqs_msg) + + # Check if single_or_paired value is recognized. + if hasattr(sample, "read_type"): + # Drop "-end", "_end", or "end" from end of the column value. + sample.read_type = re.sub( + '[_\\-]?end$', '', str(sample.read_type)).lower() + if sample.read_type not in valid_read_types: + skip_reasons.append("read_type must be in {}". + format(valid_read_types)) # Identify cluster resources required for this submission. submit_settings = pipeline_interface.choose_resource_package( @@ -303,14 +319,17 @@ def run(prj, args, remaining_args, interface_manager): # Append arguments for this pipeline # Sample-level arguments are handled by the pipeline interface. try: - argstring = pipeline_interface.get_arg_string(pipeline_key, sample) - argstring += " " + argstring = pipeline_interface.get_arg_string( + pipeline_name=pipeline_key, sample=sample, + submission_folder_path=prj.metadata.submission_subdir) except AttributeError: # TODO: inform about which missing attribute(s). fail_message = "Required attribute(s) missing " \ - "for pipeline arguments string." + "for pipeline arguments string" _LOGGER.warn("> Not submitted: %s", fail_message) skip_reasons.append(fail_message) + else: + argstring += " " if skip_reasons: # Sample is active, but we've at least 1 pipeline skip reason. @@ -335,7 +354,8 @@ def run(prj, args, remaining_args, interface_manager): # because we don't care about parameters here. if hasattr(prj.pipeline_config, pipeline_key): # First priority: pipeline config in project config - pl_config_file = getattr(prj.pipeline_config, pipeline_key) + pl_config_file = getattr(prj.pipeline_config, + pipeline_key) # Make sure it's a file (it could be provided as null.) if pl_config_file: if not os.path.isfile(pl_config_file): @@ -350,47 +370,50 @@ def run(prj, args, remaining_args, interface_manager): cmd += " -C " + pl_config_file cmd += " -O " + prj.metadata.results_subdir - if submit_settings.setdefault("cores", 1) > 1: + if int(submit_settings.setdefault("cores", 1)) > 1: cmd += " -P " + submit_settings["cores"] try: - if submit_settings["mem"] > 1: + if float(submit_settings["mem"]) > 1: cmd += " -M " + submit_settings["mem"] except KeyError: _LOGGER.warn("Submission settings " "lack memory specification") # Add the command string and job name to the submit_settings object - submit_settings["JOBNAME"] = sample.sample_name + "_" + pipeline_key + submit_settings["JOBNAME"] = \ + sample.sample_name + "_" + pipeline_key submit_settings["CODE"] = cmd # Submit job! - job_count += 1 + _LOGGER.debug("Attempting job submission: '%s' ('%s')", + sample.sample_name, pl_name) submitted = cluster_submit( sample, prj.compute.submission_template, prj.compute.submission_command, submit_settings, - prj.metadata.submission_subdir, pipeline_outfolder, + prj.metadata.submission_subdir, sample_output_folder, pl_name, args.time_delay, submit=True, dry_run=args.dry_run, ignore_flags=args.ignore_flags, remaining_args=remaining_args) if submitted: + _LOGGER.debug("SUBMITTED") submit_count += 1 - - msg = "\nLooper finished. {} of {} job(s) submitted.".\ - format(submit_count, job_count) + else: + _LOGGER.debug("NOT SUBMITTED") + + # Report what went down. 
+ _LOGGER.info("Looper finished") + _LOGGER.info("Samples generating jobs: %d of %d", + len(processed_samples), num_samples) + _LOGGER.info("Jobs submitted: %d of %d", submit_count, job_count) if args.dry_run: - msg += " Dry run. No jobs were actually submitted." - - _LOGGER.info(msg) - + _LOGGER.info("Dry run. No jobs were actually submitted.") if failures: _LOGGER.info("%d sample(s) with submission failure.", len(failures)) - sample_count_pairs_by_reason = aggregate_exec_skip_reasons(failures) + sample_by_reason = aggregate_exec_skip_reasons(failures) _LOGGER.info("{} unique reasons for submission failure: {}".format( - len(sample_count_pairs_by_reason), - sample_count_pairs_by_reason.keys())) - _LOGGER.info("Per-sample submission failure count for each reason:") - for reason, sample_nfail_pairs in sample_count_pairs_by_reason.items(): - _LOGGER.info("> {}: {}".format(reason, sample_nfail_pairs)) + len(sample_by_reason), + list(sample_by_reason.keys()))) + _LOGGER.info("Samples by failure: {}".format(dict(sample_by_reason))) @@ -404,7 +427,7 @@ def aggregate_exec_skip_reasons(skip_reasons_sample_pairs): :return Mapping[str, Iterable[str]]: mapping from explanation to collection of names of samples to which it pertains """ - from collections import Counter, defaultdict + from collections import defaultdict samples_by_skip_reason = defaultdict(list) for skip_reasons, sample in skip_reasons_sample_pairs: for reason in set(skip_reasons): @@ -423,11 +446,11 @@ def summarize(prj): columns = [] stats = [] - _start_counter(len(prj.samples)) + _start_counter(prj.num_samples) for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) - pipeline_outfolder = os.path.join( + sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) # Grab the basic info from the annotation sheet for this sample. @@ -435,7 +458,7 @@ def summarize(prj): sample_stats = sample.get_sheet_dict() columns.extend(sample_stats.keys()) # Version 0.3 standardized all stats into a single file - stats_file = os.path.join(pipeline_outfolder, "stats.tsv") + stats_file = os.path.join(sample_output_folder, "stats.tsv") if os.path.isfile(stats_file): _LOGGER.info("Found stats file: '%s'", stats_file) else: @@ -485,17 +508,17 @@ def destroy(prj, args, preview_flag=True): _LOGGER.info("Results to destroy:") - _start_counter(len(prj.samples)) + _start_counter(prj.num_samples) for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) - pipeline_outfolder = os.path.join( + sample_output_folder = os.path.join( prj.metadata.results_subdir, sample.sample_name) if preview_flag: # Preview: Don't actually delete, just show files. 
- _LOGGER.info(str(pipeline_outfolder)) + _LOGGER.info(str(sample_output_folder)) else: - destroy_sample_results(pipeline_outfolder, args) + destroy_sample_results(sample_output_folder, args) if not preview_flag: _LOGGER.info("Destroy complete.") @@ -522,13 +545,13 @@ def clean(prj, args, preview_flag=True): _LOGGER.info("Files to clean:") - _start_counter(len(prj.samples)) + _start_counter(prj.num_samples) for sample in prj.samples: _LOGGER.info(_COUNTER.show(sample.sample_name, sample.library)) - pipeline_outfolder = os.path.join(prj.metadata.results_subdir, - sample.sample_name) - cleanup_files = glob.glob(os.path.join(pipeline_outfolder, + sample_output_folder = os.path.join( + prj.metadata.results_subdir, sample.sample_name) + cleanup_files = glob.glob(os.path.join(sample_output_folder, "*_cleanup.sh")) if preview_flag: # Preview: Don't actually clean, just show what will be cleaned. @@ -597,13 +620,13 @@ def _submission_status_text(curr, total, sample_name, sample_library): def cluster_submit( - sample, submit_template, submission_command, variables_dict, - submission_folder, pipeline_outfolder, pipeline_name, time_delay, - submit=False, dry_run=False, ignore_flags=False, remaining_args=None): + sample, submit_template, submission_command, variables_dict, + submission_folder, sample_output_folder, pipeline_name, time_delay, + submit=False, dry_run=False, ignore_flags=False, remaining_args=None): """ - Submit job to cluster manager. - - :param models.Sample sample: the sample object for submission + Write cluster submission script to disk and submit job for given Sample. + + :param models.Sample sample: the Sample object for submission :param str submit_template: path to submission script template :param str submission_command: actual command with which to execute the submission of the cluster job for the given sample @@ -611,7 +634,7 @@ def cluster_submit( the submission template :param str submission_folder: path to the folder in which to place submission files - :param str pipeline_outfolder: path to folder into which the pipeline + :param str sample_output_folder: path to folder into which the pipeline will write file(s), and where to search for flag file to check if a sample's already been submitted :param str pipeline_name: name of the pipeline that the job will run @@ -642,12 +665,10 @@ def cluster_submit( if not os.path.exists(submit_script_dirpath): os.makedirs(submit_script_dirpath) + # Add additional arguments, populate template fields, and write to disk. with open(submit_template, 'r') as handle: filedata = handle.read() - - # Update variable dict with any additional arguments. variables_dict["CODE"] += " " + str(" ".join(remaining_args or [])) - # Fill in submit_template with variables. for key, value in variables_dict.items(): # Here we add brackets around the key names and use uppercase because # this is how they are encoded as variables in the submit templates. @@ -655,16 +676,32 @@ def cluster_submit( with open(submit_script, 'w') as handle: handle.write(filedata) - # Prepare and write sample yaml object - sample.to_yaml() + # Ensure existence of on-disk representation of this sample. + if type(sample) is Sample: + # run() writes base Sample to disk for each non-skipped sample. 
+ expected_filepath = os.path.join( + submission_folder, "{}.yaml".format(sample.name)) + _LOGGER.debug("Base Sample, to reuse file: '%s'", + expected_filepath) + if not os.path.exists(expected_filepath): + _LOGGER.warn("Missing expected Sample file; creating") + sample.to_yaml(subs_folder_path=submission_folder) + else: + _LOGGER.debug("Base Sample file exists") + else: + # Serialize Sample, generate data for disk, and write. + name_sample_subtype = sample.__class__.__name__ + _LOGGER.debug("Writing %s representation to disk: '%s'", + name_sample_subtype, sample.name) + sample.to_yaml(subs_folder_path=submission_folder) # Check if job is already submitted (unless ignore_flags is set to True) if not ignore_flags: flag_files = glob.glob(os.path.join( - pipeline_outfolder, pipeline_name + "*.flag")) + sample_output_folder, pipeline_name + "*.flag")) if len(flag_files) > 0: - flags = [os.path.basename(f) for f in flag_files] - _LOGGER.info("> Not submitting, flag(s) found: {}".format(flags)) + _LOGGER.info("> Not submitting, flag(s) found: {}". + format(flag_files)) submit = False else: pass @@ -672,7 +709,8 @@ def cluster_submit( if not submit: return False if dry_run: - _LOGGER.info("> DRY RUN: I would have submitted this") + _LOGGER.info("> DRY RUN: I would have submitted this: '%s'", + submit_script) else: subprocess.call(submission_command + " " + submit_script, shell=True) time.sleep(time_delay) # Delay next job's submission. @@ -783,7 +821,6 @@ def main(): # Parse command-line arguments and establish logger. args, remaining_args = parse_arguments() - _LOGGER.info("Command: {} (Looper version: {})". format(args.command, __version__)) # Initialize project @@ -801,20 +838,19 @@ def main(): # TODO split here, spawning separate run process for each # pipelines directory in project metadata pipelines directory. - try: - pipedirs = prj.metadata.pipelines_dir - _LOGGER.info("Pipelines path(s): {}".format(pipedirs)) - except AttributeError: - _LOGGER.error("Looper requires a metadata.pipelines_dir") - raise - if len(pipedirs) == 0: - _LOGGER.error("Looper requires a metadata.pipelines_dir") - raise AttributeError + if not hasattr(prj.metadata, "pipelines_dir") or \ + len(prj.metadata.pipelines_dir) == 0: + raise AttributeError( + "Looper requires at least one pipeline(s) location.") - interface_manager = InterfaceManager(prj.metadata.pipelines_dir) + if not prj.interfaces_by_protocol: + _LOGGER.error( + "The Project knows no protocols. Does it point " + "to at least one pipelines location that exists?") + return try: - run(prj, args, remaining_args, interface_manager=interface_manager) + run(prj, args, remaining_args) except IOError: _LOGGER.error("{} pipelines_dir: '{}'".format( prj.__class__.__name__, prj.metadata.pipelines_dir)) diff --git a/looper/models.py b/looper/models.py index 78f76078..6f7ed895 100644 --- a/looper/models.py +++ b/looper/models.py @@ -49,9 +49,11 @@ # TODO: the examples changes would involve library and output_dir. 
from collections import \ - defaultdict, Iterable, Mapping, MutableMapping, OrderedDict as _OrderedDict + defaultdict, Iterable, Mapping, MutableMapping, namedtuple, \ + OrderedDict as _OrderedDict from functools import partial import glob +import inspect import itertools import logging import os as _os @@ -65,7 +67,16 @@ import yaml from .utils import \ - parse_ftype, check_bam, check_fastq, get_file_size, partition + alpha_cased, check_bam, check_fastq, expandpath, \ + get_file_size, import_from_source, parse_ftype, partition, \ + standard_stream_redirector + + +# TODO: decide if we want to denote functions for export. +__functions__ = [] +__classes__ = ["AttributeDict", "PipelineInterface", "Project", + "ProtocolInterface", "ProtocolMapper", "Sample"] +__all__ = __functions__ + __classes__ COMPUTE_SETTINGS_VARNAME = "PEPENV" @@ -74,6 +85,8 @@ SAMPLE_NAME_COLNAME = "sample_name" SAMPLE_ANNOTATIONS_KEY = "sample_annotation" IMPLICATIONS_DECLARATION = "implied_columns" +DATA_SOURCES_SECTION = "data_sources" +SAMPLE_EXECUTION_TOGGLE = "toggle" COL_KEY_SUFFIX = "_key" ATTRDICT_METADATA = {"_force_nulls": False, "_attribute_identity": False} @@ -84,6 +97,27 @@ +def check_sheet(sample_file, dtype=str): + """ + Check if csv file exists and has all required columns. + + :param str sample_file: path to sample annotations file. + :param type dtype: data type for CSV read. + :raises IOError: if given annotations file can't be read. + :raises ValueError: if required column(s) is/are missing. + """ + df = _pd.read_table(sample_file, sep=None, dtype=dtype, + index_col=False, engine="python") + req = [SAMPLE_NAME_COLNAME] + missing = set(req) - set(df.columns) + if len(missing) != 0: + raise ValueError( + "Annotation sheet ('{}') is missing column(s): {}; has: {}". + format(sample_file, missing, df.columns)) + return df + + + def copy(obj): def copy(self): """ @@ -97,17 +131,187 @@ def copy(self): +def include_in_repr(attr, klazz): + """ + Determine whether to include attribute in an object's text representation. + + :param str attr: attribute to include/exclude from object's representation + :param str | type klazz: name of type or type itself of which the object + to be represented is an instance + :return bool: whether to include attribute in an object's + text representation + """ + classname = klazz.__name__ if isinstance(klazz, type) else klazz + return attr not in \ + {"Project": ["sheet", "interfaces_by_protocol"]}[classname] + + + def is_url(maybe_url): + """ + Determine whether a path is a URL. + + :param str maybe_url: path to investigate as URL + :return bool: whether path appears to be a URL + """ return urlparse(maybe_url).scheme != "" +def merge_sample(sample, merge_table, data_sources, derived_columns): + """ + Use merge table data to augment/modify Sample. + + :param Sample sample: sample to modify via merge table data + :param merge_table: data with which to alter Sample + :param Mapping data_sources: collection of named paths to data locations + :param Iterable[str] derived_columns: names of column for which + corresponding Sample attribute's value is data-derived + :return Set[str]: names of columns that were merged + """ + + merged_cols = {} + + if merge_table is None: + _LOGGER.log(5, "No data for sample merge, skipping") + return merged_cols + + if SAMPLE_NAME_COLNAME not in merge_table.columns: + raise KeyError( + "Merge table requires a column named '{}'.". + format(SAMPLE_NAME_COLNAME)) + + _LOGGER.debug("Merging Sample with data sources: {}". 
+ format(data_sources)) + _LOGGER.debug("Merging Sample with derived columns: {}". + format(derived_columns)) + + sample_indexer = merge_table[SAMPLE_NAME_COLNAME] == \ + getattr(sample, SAMPLE_NAME_COLNAME) + merge_rows = merge_table[sample_indexer] + + if len(merge_rows) == 0: + _LOGGER.debug("No merge rows for sample '%s', skipping", sample.name) + return merged_cols + + # Hash derived columns for faster lookup in case of many samples/columns. + derived_columns = set(derived_columns) + _LOGGER.log(5, "%d rows to merge", len(merge_rows)) + + + # For each row in the merge table of this sample: + # 1) populate any derived columns + # 2) derived columns --> space-delimited strings + # 3) update the sample values with the merge table + # Keep track of merged cols, + # so we don't re-derive them later. + merged_cols = {key: "" for key in merge_rows.columns} + for _, row in merge_rows.iterrows(): + row_dict = row.to_dict() + for col in merge_rows.columns: + if col == SAMPLE_NAME_COLNAME or \ + col not in derived_columns: + _LOGGER.log(5, "Skipping column: '%s'", col) + continue + # Initialize key in parent dict. + col_key = col + COL_KEY_SUFFIX + merged_cols[col_key] = "" + row_dict[col_key] = row_dict[col] + row_dict[col] = sample.locate_data_source( + data_sources, col, row_dict[col], row_dict) # 1) + + _LOGGER.log(5, "Adding derived columns") + # Also add in any derived cols present. + for col in derived_columns: + # Skip over attributes that the sample + # either lacks, and those covered by the + # data from the current (row's) data. + if not hasattr(sample, col) or \ + col in row_dict: + _LOGGER.log(5, "Skipping column: '%s'", col) + continue + # Map column name key to sample's value + # for the attribute given by column name. + col_key = col + COL_KEY_SUFFIX + row_dict[col_key] = getattr(sample, col) + # Map the column name itself to the + # populated data source template string. + row_dict[col] = sample.locate_data_source( + data_sources, col, getattr(sample, col), row_dict) + _LOGGER.debug("PROBLEM adding derived column: " + "{}, {}, {}".format(col, row_dict[col], + getattr(sample, col))) + + # Since we are now jamming multiple (merged) + # entries into a single attribute, we have to + # join them into a space-delimited string + # and then set to sample attribute. + for key, val in row_dict.items(): + if key == SAMPLE_NAME_COLNAME or not val: + _LOGGER.log(5, "Skipping KV: {}={}".format(key, val)) + continue + _LOGGER.log(5, "merge: sample '%s'; %s=%s", + str(sample.name), str(key), str(val)) + if not key in merged_cols: + new_val = str(val).rstrip() + else: + new_val = "{} {}".format( + merged_cols[key], str(val)).strip() + merged_cols[key] = new_val # 2) + + # Don't update sample_name. + merged_cols.pop(SAMPLE_NAME_COLNAME, None) + + sample.update(merged_cols) # 3) + sample.merged_cols = merged_cols + sample.merged = True + + return sample + + + +def process_pipeline_interfaces(pipeline_interface_locations): + """ + Create a ProtocolInterface for each pipeline location given. + + :param Iterable[str] pipeline_interface_locations: locations, each of + which should be either a directory path or a filepath, that specifies + pipeline interface and protocol mappings information. Each such file + should be have a pipelines section and a protocol mappings section + whereas each folder should have a file for each of those sections. 
+ :return Mapping[str, Iterable[ProtocolInterface]]: mapping from protocol + name to interface(s) for which that protocol is mapped + """ + interface_by_protocol = defaultdict(list) + for pipe_iface_location in pipeline_interface_locations: + if not _os.path.exists(pipe_iface_location): + _LOGGER.warn("Ignoring nonexistent pipeline interface " + "location '%s'", pipe_iface_location) + continue + proto_iface = ProtocolInterface(pipe_iface_location) + for proto_name in proto_iface.protomap: + _LOGGER.log(5, "Adding protocol name: '%s'", proto_name) + interface_by_protocol[alpha_cased(proto_name)].append(proto_iface) + return interface_by_protocol + + + +# Collect PipelineInterface, Sample type, pipeline path, and script with flags. +SubmissionBundle = namedtuple( + "SubmissionBundle", + field_names=["interface", "subtype", "pipeline", "pipeline_with_flags"]) + + + @copy class Paths(object): """ A class to hold paths as attributes. """ - def __str__(self): - return "Paths object." + def __getitem__(self, key): + """ + Provides dict-style access to attributes + """ + return getattr(self, key) def __iter__(self): """ @@ -120,11 +324,8 @@ def __iter__(self): """ return iter(self.__dict__.values()) - def __getitem__(self, key): - """ - Provides dict-style access to attributes - """ - return getattr(self, key) + def __repr__(self): + return "Paths object." @@ -185,8 +386,7 @@ def __setattr__(self, key, value): def __getattr__(self, item, default=None): """ - Fetch the value associated with the provided identifier. Unlike an - ordinary object, `AttributeDict` supports fetching + Fetch the value associated with the provided identifier. :param int | str item: identifier for value to fetch :return object: whatever value corresponds to the requested key/item @@ -198,6 +398,11 @@ def __getattr__(self, item, default=None): anyway. More specifically, respect attribute naming that appears to be indicative of the intent of protection. """ + try: + return super(AttributeDict, self).__getattribute__(item) + except (AttributeError, TypeError): + # Handle potential property and non-string failures. + pass try: # Fundamentally, this is still a mapping; # route object notation access pattern accordingly. @@ -241,21 +446,21 @@ def __setitem__(self, key, value): if isinstance(value, Mapping): try: # Combine AttributeDict instances. - _LOGGER.debug("Updating key: '{}'".format(key)) + _LOGGER.log(5, "Updating key: '{}'".format(key)) self.__dict__[key].add_entries(value) except (AttributeError, KeyError): # Create new AttributeDict, replacing previous value. self.__dict__[key] = AttributeDict(value) - _LOGGER.debug("'{}' now has keys {}". + _LOGGER.log(5, "'{}' now has keys {}". format(key, self.__dict__[key].keys())) elif value is not None or \ key not in self.__dict__ or self.__dict__["_force_nulls"]: _LOGGER.log(5, "Setting '{}' to {}".format(key, value)) self.__dict__[key] = value else: - _LOGGER.debug("Not setting {k} to {v}; _force_nulls: {nulls}". - format(k=key, v=value, - nulls=self.__dict__["_force_nulls"])) + _LOGGER.log(5, "Not setting {k} to {v}; _force_nulls: {nulls}". 
+ format(k=key, v=value, + nulls=self.__dict__["_force_nulls"])) def __getitem__(self, item): @@ -305,6 +510,9 @@ def __len__(self): def __repr__(self): return repr(self.__dict__) + def __str__(self): + return "{}: {}".format(self.__class__.__name__, repr(self)) + @copy @@ -339,6 +547,10 @@ class Project(AttributeDict): settings can't be established, optional; if null (the default), a warning message will be logged, and no exception will be raised. :type no_compute_exception: type + :param defer_sample_construction: whether to wait to build this Project's + Sample objects until they're needed, optional; by default, the basic + Sample is created during Project construction + :type defer_sample_construction: bool :Example: @@ -356,21 +568,21 @@ class Project(AttributeDict): def __init__(self, config_file, subproject=None, default_compute=None, dry=False, permissive=True, file_checks=False, compute_env_file=None, - no_environment_exception=None, no_compute_exception=None): + no_environment_exception=None, no_compute_exception=None, + defer_sample_construction=False): - _LOGGER.info("Creating %s from file: '%s'", + _LOGGER.debug("Creating %s from file: '%s'", self.__class__.__name__, config_file) super(Project, self).__init__() - default_compute = default_compute or self.default_cmpenv_file - # Initialize local, serial compute as default (no cluster submission) # Start with default environment settings. _LOGGER.debug("Establishing default environment settings") self.environment, self.environment_file = None, None try: - self.update_environment(default_compute) + self.update_environment( + default_compute or self.default_compute_envfile) except Exception as e: _LOGGER.error("Can't load environment config file '%s'", str(default_compute)) @@ -413,11 +625,14 @@ def __init__(self, config_file, subproject=None, self.config_file = _os.path.abspath(config_file) # Parse config file - _LOGGER.info("Parsing %s config file", self.__class__.__name__) + _LOGGER.debug("Parsing %s config file", self.__class__.__name__) if subproject: _LOGGER.info("Using subproject: '{}'".format(subproject)) self.parse_config_file(subproject) + # Ensure data_sources is at least set if it wasn't parsed. + self.setdefault("data_sources", None) + self.name = self.infer_name(self.config_file) self.subproject = subproject @@ -435,37 +650,114 @@ def __init__(self, config_file, subproject=None, except AttributeError: self.derived_columns = self.DERIVED_COLUMNS_DEFAULT - # Sheet will be set to non-null value by call to add_sample_sheet(). - # That call also sets the samples (list) attribute for the instance - # and adds default derived columns. - self.sheet = None - self.samples = list() - self.add_sample_sheet() - self.finalize_pipelines_directory() + # SampleSheet creation populates project's samples, adds the + # sheet itself, and adds any derived columns. + _LOGGER.debug("Processing {} pipeline location(s): {}". 
+ format(len(self.metadata.pipelines_dir), + self.metadata.pipelines_dir)) + self.interfaces_by_protocol = \ + process_pipeline_interfaces(self.metadata.pipelines_dir) + + path_anns_file = self.metadata.sample_annotation + _LOGGER.debug("Reading sample annotations sheet: '%s'", path_anns_file) + try: + self.sheet = check_sheet(path_anns_file) + except IOError: + _LOGGER.error("Alleged annotations file doesn't exist: '%s'", + path_anns_file) + anns_folder_path = _os.path.dirname(path_anns_file) + try: + annotations_file_folder_contents = \ + _os.listdir(anns_folder_path) + except OSError: + _LOGGER.error("Annotations file folder doesn't exist either: " + "'%s'", anns_folder_path) + else: + _LOGGER.error("Annotations file folder's contents: {}". + format(annotations_file_folder_contents)) + raise + + self.merge_table = None + self._samples = None if defer_sample_construction \ + else self._make_basic_samples() + + + def __repr__(self): + include = partial(include_in_repr, klazz=self.__class__) + return repr({k: v for k, v in self.__dict__.items() if include(k)}) + + + @property + def compute_env_var(self): + """ + Environment variable through which to access compute settings. + + :return str: name of the environment variable to pointing to + compute settings + """ + return COMPUTE_SETTINGS_VARNAME + @property - def default_cmpenv_file(self): + def default_compute_envfile(self): """ Path to default compute environment settings file. """ return _os.path.join( self.templates_folder, "default_compute_settings.yaml") @property - def templates_folder(self): - return _os.path.join(_os.path.dirname(__file__), "submit_templates") + def num_samples(self): + """ Number of samples available in this Project. """ + return sum(1 for _ in self.sample_names) @property - def compute_env_var(self): + def output_dir(self): """ - Environment variable through which to access compute settings. - - :return str: name of the environment variable to pointing to - compute settings + Directory in which to place results and submissions folders. + + By default, assume that the project's configuration file specifies + an output directory, and that this is therefore available within + the project metadata. If that assumption does not hold, though, + consider the folder in which the project configuration file lives + to be the project's output directory. + + :return str: path to the project's output directory, either as + specified in the configuration file or the folder that contains + the project's configuration file. """ - return COMPUTE_SETTINGS_VARNAME + try: + return self.metadata.output_dir + except AttributeError: + return _os.path.dirname(self.config_file) + + + @property + def project_folders(self): + """ + Names of folders to nest within a project output directory. + + :return Iterable[str]: names of output-nested folders + """ + return ["results_subdir", "submission_subdir"] + + + @property + def protocols(self): + """ + Determine this Project's unique protocol names. + + :return Set[str]: collection of this Project's unique protocol names + """ + protos = set() + for s in self.samples: + try: + protos.add(s.library) + except AttributeError: + _LOGGER.debug("Sample '%s' lacks protocol", s.sample_name) + return protos @property @@ -484,34 +776,36 @@ def required_metadata(self): @property - def project_folders(self): + def sample_names(self): + """ Names of samples of which this Project is aware. 
""" + return iter(self.sheet[SAMPLE_NAME_COLNAME]) + + + @property + def samples(self): """ - Names of folders to nest within a project output directory. - - :return Iterable[str]: names of output-nested folders + Generic/base Sample instance for each of this Project's samples. + + :return Iterable[Sample]: Sample instance for each + of this Project's samples """ - return ["results_subdir", "submission_subdir"] + if self._samples is None: + _LOGGER.debug("Building basic Sample(s) for %s", + self.__class__.__name__) + self._samples = self._make_basic_samples() + _LOGGER.debug("%s has %d basic Sample(s)", + self.__class__.__name__, len(self._samples)) + return self._samples @property - def output_dir(self): + def templates_folder(self): """ - Directory in which to place results and submissions folders. - - By default, assume that the project's configuration file specifies - an output directory, and that this is therefore available within - the project metadata. If that assumption does not hold, though, - consider the folder in which the project configuration file lives - to be the project's output directory. - - :return str: path to the project's output directory, either as - specified in the configuration file or the folder that contains - the project's configuration file. + Path to folder with default submission templates. + + :return str: path to folder with default submission templates """ - try: - return self.metadata.output_dir - except AttributeError: - return _os.path.dirname(self.config_file) + return _os.path.join(_os.path.dirname(__file__), "submit_templates") @staticmethod @@ -530,27 +824,135 @@ def infer_name(path_config_file): return config_folder - def _handle_missing_env_attrs(self, env_settings_file, when_missing): - """ Default environment settings aren't required; warn, though. """ - missing_env_attrs = \ - [attr for attr in ["environment", "environment_file"] - if not hasattr(self, attr) or getattr(self, attr) is None] - if not missing_env_attrs: - return - message = "'{}' lacks environment attributes: {}".\ - format(env_settings_file, missing_env_attrs) - if when_missing is None: - _LOGGER.warn(message) + def build_submission_bundles(self, protocol, priority=True): + """ + Create pipelines to submit for each sample of a particular protocol. + + With the argument (flag) to the priority parameter, there's control + over whether to submit pipeline(s) from only one of the project's + known pipeline locations with a match for the protocol, or whether to + submit pipelines created from all locations with a match for the + protocol. + + :param str protocol: name of the protocol/library for which to + create pipeline(s) + :param bool priority: to only submit pipeline(s) from the first of the + pipelines location(s) (indicated in the project config file) that + has a match for the given protocol; optional, default True + :return Iterable[(PipelineInterface, str, str)]: + :raises AssertionError: if there's a failure in the attempt to + partition an interface's pipeline scripts into disjoint subsets of + those already mapped and those not yet mapped + """ + + # Pull out the collection of interfaces (potentially one from each of + # the locations indicated in the project configuration file) as a + # sort of pool of information about possible ways in which to submit + # pipeline(s) for sample(s) of the indicated protocol. 
+ try: + protocol_interfaces = \ + self.interfaces_by_protocol[protocol] + except KeyError: + _LOGGER.warn("Unknown protocol: '{}'".format(protocol)) + return [] + + job_submission_bundles = [] + pipeline_keys_used = set() + _LOGGER.debug("Building pipelines for {} PIs...". + format(len(protocol_interfaces))) + for proto_iface in protocol_interfaces: + # Short-circuit if we care only about the highest-priority match + # for pipeline submission. That is, if the intent is to submit + # pipeline(s) from a single location for each sample of the given + # protocol, we can stop searching the pool of pipeline interface + # information once we've found a match for the protocol. + if priority and len(job_submission_bundles) > 0: + return job_submission_bundles[0] + + this_protocol_pipelines = proto_iface.fetch_pipelines(protocol) + if not this_protocol_pipelines: + _LOGGER.warn("No mapping for protocol '%s' in %s", + protocol, proto_iface) + continue + + # TODO: update once dependency-encoding logic is in place. + # The proposed dependency-encoding format uses a semicolon + # between pipelines for which the dependency relationship is + # serial. For now, simply treat those as multiple independent + # pipelines by replacing the semicolon with a comma, which is the + # way in which multiple independent pipelines for a single protocol + # are represented in the mapping declaration. + pipeline_keys = \ + this_protocol_pipelines.replace(";", ",")\ + .strip(" ()\n")\ + .split(",") + # These cleaned pipeline keys are what's used to resolve the path + # to the pipeline to run. + pipeline_keys = [pk.strip() for pk in pipeline_keys] + + # Skip over pipelines already mapped by another location. + already_mapped, new_scripts = \ + partition(pipeline_keys, + partial(_is_member, items=pipeline_keys_used)) + pipeline_keys_used |= set(pipeline_keys) + + # Attempt to validate that partition yielded disjoint subsets. + try: + disjoint_partition_violation = \ + set(already_mapped) & set(new_scripts) + except TypeError: + _LOGGER.debug("Unable to hash partitions for validation") + else: + assert not disjoint_partition_violation, \ + "Partitioning {} with membership in {} as " \ + "predicate produced intersection: {}".format( + pipeline_keys, pipeline_keys_used, + disjoint_partition_violation) + + if len(already_mapped) > 0: + _LOGGER.debug("Skipping {} already-mapped script name(s): {}". + format(len(already_mapped), already_mapped)) + _LOGGER.debug("{} new scripts for protocol {} from " + "pipeline(s) location '{}': {}". + format(len(new_scripts), protocol, + proto_iface.source, new_scripts)) + + # For each pipeline script to which this protocol will pertain, + # create the new jobs/submission bundles. + new_jobs = [] + for pipeline_key in new_scripts: + # Determine how to reference the pipeline and where it is. + strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ + proto_iface.finalize_pipeline_key_and_paths( + pipeline_key) + # Determine which interface and Sample subtype to use. + sample_subtype = \ + proto_iface.fetch_sample_subtype( + protocol, strict_pipe_key, full_pipe_path) + # Package the pipeline's interface, subtype, command, and key. + submission_bundle = SubmissionBundle( + proto_iface.pipe_iface, sample_subtype, + strict_pipe_key, full_pipe_path_with_flags) + # Add this bundle to the collection of ones relevant for the + # current ProtocolInterface. 
+ new_jobs.append(submission_bundle) + + job_submission_bundles.append(new_jobs) + + # Repeat logic check of short-circuit conditional to account for + # edge case in which it's satisfied during the final iteration. + if priority and len(job_submission_bundles) > 1: + return job_submission_bundles[0] else: - when_missing(message) + return list(itertools.chain(*job_submission_bundles)) def finalize_pipelines_directory(self, pipe_path=""): """ Finalize the establishment of a path to this project's pipelines. - - With the passed argument, override anything already set. - Otherwise, prefer path provided in this project's config, then + + With the passed argument, override anything already set. + Otherwise, prefer path provided in this project's config, then local pipelines folder, then a location set in project environment. :param str pipe_path: (absolute) path to pipelines @@ -586,6 +988,128 @@ def finalize_pipelines_directory(self, pipe_path=""): self.metadata.pipelines_dir = pipe_path + def get_arg_string(self, pipeline_name): + """ + For this project, given a pipeline, return an argument string + specified in the project config file. + """ + + def make_optarg_text(opt, arg): + """ Transform flag/option into CLI-ready text version. """ + return "{} {}".format(opt, _os.path.expandvars(arg)) \ + if arg else opt + + def create_argtext(name): + """ Create command-line argstring text from config section. """ + try: + optargs = getattr(self.pipeline_args, name) + except AttributeError: + return "" + # NS using __dict__ will add in the metadata from AttrDict (doh!) + _LOGGER.debug("optargs.items(): {}".format(optargs.items())) + optargs_texts = [make_optarg_text(opt, arg) + for opt, arg in optargs.items()] + _LOGGER.debug("optargs_texts: {}".format(optargs_texts)) + # TODO: may need to fix some spacing issues here. + return " ".join(optargs_texts) + + default_argtext = create_argtext(DEFAULT_COMPUTE_RESOURCES_NAME) + pipeline_argtext = create_argtext(pipeline_name) + + if not pipeline_argtext: + # The project config may not have an entry for this pipeline; + # no problem! There are no pipeline-specific args. Return text + # from default arguments, whether empty or not. + return default_argtext + elif default_argtext: + # Non-empty pipeline-specific and default argtext + return " ".join([default_argtext, pipeline_argtext]) + else: + # No default argtext, but non-empty pipeline-specific argtext + return pipeline_argtext + + + def build_sheet(self, *protocols): + """ + Create all Sample object for this project for the given protocol(s). + + :return pandas.core.frame.DataFrame: DataFrame with from base version + of each of this Project's samples, for indicated protocol(s) if + given, else all of this Project's samples + """ + # Use all protocols if none are explicitly specified. + samples = self.samples + protocols = {alpha_cased(p) for p in (protocols or self.protocols)} + return _pd.DataFrame( + [s.as_series() for s in samples if + hasattr(s, "library") and alpha_cased(s.library) in protocols]) + + + def make_project_dirs(self): + """ + Creates project directory structure if it doesn't exist. 
+ """ + for folder_name in self.project_folders: + folder_path = self.metadata[folder_name] + _LOGGER.debug("Ensuring project dir exists: '%s'", folder_path) + if not _os.path.exists(folder_path): + _LOGGER.debug("Attempting to create project folder: '%s'", + folder_path) + try: + _os.makedirs(folder_path) + except OSError as e: + _LOGGER.warn("Could not create project folder: '%s'", + str(e)) + + + def _make_basic_samples(self): + """ Build the base Sample objects from the annotations sheet data. """ + + # This should be executed just once, establishing the Project's + # base Sample objects if they don't already exist. + if hasattr(self.metadata, "merge_table"): + if self.merge_table is None: + if self.metadata.merge_table and \ + _os.path.isfile(self.metadata.merge_table): + self.merge_table = _pd.read_table( + self.metadata.merge_table, + sep=None, engine="python") + _LOGGER.debug("Merge table shape: {}". + format(self.merge_table.shape)) + else: + _LOGGER.debug( + "Alleged path to merge table data is not a " + "file: '%s'", self.metadata.merge_table) + else: + _LOGGER.debug("Already parsed merge table") + else: + _LOGGER.debug("No merge table") + + # Create the Sample(s). + samples = [] + for _, row in self.sheet.iterrows(): + sample = Sample(row.dropna()) + sample.set_genome(self.get("genomes")) + sample.set_transcriptome(self.get("transcriptomes")) + + merge_sample(sample, self.merge_table, + self.data_sources, self.derived_columns) + sample.set_file_paths(self) + # Hack for backwards-compatibility + # Pipelines should now use `data_source`) + _LOGGER.debug("Setting sample's data path") + try: + sample.data_path = sample.data_source + except AttributeError: + _LOGGER.log(5, "Sample '%s' lacks data source; skipping " + "data path assignment", sample.sample_name) + else: + _LOGGER.log(5, "Path to sample data: '%s'", sample.data_source) + samples.append(sample) + + return samples + + def parse_config_file(self, subproject=None): """ Parse provided yaml config file and check required fields exist. @@ -598,6 +1122,9 @@ def parse_config_file(self, subproject=None): with open(self.config_file, 'r') as conf_file: config = yaml.safe_load(conf_file) + _LOGGER.debug("{} config data: {}".format( + self.__class__.__name__, config)) + # Parse yaml into the project's attributes. _LOGGER.debug("Adding attributes for {}: {}".format( self.__class__.__name__, config.keys())) @@ -627,6 +1154,29 @@ def parse_config_file(self, subproject=None): _LOGGER.debug("Metadata: %s", str(self.metadata)) delattr(self, "paths") + # In looper 0.6, we added pipeline_interfaces to metadata + # For backwards compatibility, merge it with pipelines_dir + + if "metadata" in config: + if "pipelines_dir" in self.metadata: + _LOGGER.warning("Looper v0.6 suggests " + "switching from pipelines_dir to " + "pipeline_interfaces. See docs for details: " + "http://looper.readthedocs.io/en/latest/") + if "pipeline_interfaces" in self.metadata: + if "pipelines_dir" in self.metadata: + raise AttributeError( + "You defined both 'pipeline_interfaces' and " + "'pipelines_dir'. Please remove your " + "'pipelines_dir' definition.") + else: + self.metadata.pipelines_dir = \ + self.metadata.pipeline_interfaces + _LOGGER.debug("Adding pipeline_interfaces to " + "pipelines_dir. New value: {}". + format(self.metadata.pipelines_dir)) + + # Ensure required absolute paths are present and absolute. 
for var in self.required_metadata: if var not in self.metadata: @@ -661,12 +1211,12 @@ def parse_config_file(self, subproject=None): _LOGGER.debug("Parsing relative sections") for sect in relative_sections: if not hasattr(self, sect): - _LOGGER.debug("%s lacks relative section '%s', skipping", - self.__class__.__name__, sect) + _LOGGER.log(5, "%s lacks relative section '%s', skipping", + self.__class__.__name__, sect) continue relative_vars = getattr(self, sect) if not relative_vars: - _LOGGER.debug("No relative variables, continuing") + _LOGGER.log(5, "No relative variables, continuing") continue for var in relative_vars.keys(): if not hasattr(relative_vars, var) or \ @@ -677,18 +1227,17 @@ def parse_config_file(self, subproject=None): _LOGGER.debug("Ensuring absolute path(s) for '%s'", var) # Parsed from YAML, so small space of possible datatypes. if isinstance(relpath, list): - setattr(relative_vars, var, - [self._ensure_absolute(maybe_relpath) - for maybe_relpath in relpath]) + absolute = [self._ensure_absolute(maybe_relpath) + for maybe_relpath in relpath] else: - abs_path = self._ensure_absolute(relpath) - _LOGGER.debug("Setting '%s' to '%s'", var, abs_path) - setattr(relative_vars, var, abs_path) + absolute = self._ensure_absolute(relpath) + _LOGGER.debug("Setting '%s' to '%s'", var, absolute) + setattr(relative_vars, var, absolute) # Project config may have made compute.submission_template relative. # Make sure it's absolute. if self.compute is None: - _LOGGER.debug("No compute, no submission template") + _LOGGER.log(5, "No compute, no submission template") elif not _os.path.isabs(self.compute.submission_template): # Relative to environment config file. self.compute.submission_template = _os.path.join( @@ -703,24 +1252,57 @@ def parse_config_file(self, subproject=None): path_config_file=self.config_file) - def _ensure_absolute(self, maybe_relpath): - _LOGGER.debug("Ensuring absolute path for '%s'", maybe_relpath) - if _os.path.isabs(maybe_relpath) or is_url(maybe_relpath): - _LOGGER.debug("Already absolute") - return maybe_relpath - # Maybe we have env vars that make the path absolute? - expanded = _os.path.expandvars(maybe_relpath) - _LOGGER.debug("Expanded: '%s'", expanded) - if _os.path.isabs(expanded): - _LOGGER.debug("Expanded is absolute") - return expanded - _LOGGER.debug("Making non-absolute path '%s' be absolute", - maybe_relpath) - # Set path to an absolute path, relative to project config. - config_dirpath = _os.path.dirname(self.config_file) - _LOGGER.debug("config_dirpath: %s", config_dirpath) - abs_path = _os.path.join(config_dirpath, maybe_relpath) - return abs_path + def set_compute(self, setting): + """ + Set the compute attributes according to the + specified settings in the environment file. + + :param str setting: name for non-resource compute bundle, the name of + a subsection in an environment configuration file + :return bool: success flag for attempt to establish compute settings + """ + + # Hope that environment & environment compute are present. + if setting and self.environment and "compute" in self.environment: + # Augment compute, creating it if needed. + if self.compute is None: + _LOGGER.debug("Creating Project compute") + self.compute = AttributeDict() + _LOGGER.debug("Adding entries for setting '%s'", setting) + self.compute.add_entries(self.environment.compute[setting]) + + # Ensure submission template is absolute. 
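
# NOTE: a self-contained sketch (not part of the diff) of the resolution
# order _ensure_absolute applies to the relative sections handled above:
# keep absolute paths as-is (the real method also passes URLs through via
# is_url), accept $VAR expansions that become absolute, and otherwise anchor
# the path at the project config file's directory. The example path is
# hypothetical:
import os

def ensure_absolute(maybe_relpath, config_file):
    if os.path.isabs(maybe_relpath):
        return maybe_relpath
    expanded = os.path.expandvars(maybe_relpath)
    if os.path.isabs(expanded):
        return expanded
    return os.path.join(os.path.dirname(config_file), maybe_relpath)

# ensure_absolute("metadata/sheet.csv", "/home/me/proj/config.yaml")
# -> "/home/me/proj/metadata/sheet.csv"
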
+            if not _os.path.isabs(self.compute.submission_template):
+                try:
+                    self.compute.submission_template = _os.path.join(
+                        _os.path.dirname(self.environment_file),
+                        self.compute.submission_template)
+                except AttributeError as e:
+                    # Environment and environment compute should at least have been
+                    # set as null-valued attributes, so execution here is an error.
+                    _LOGGER.error(str(e))
+                # Compute settings have been established.
+                else:
+                    return True
+        else:
+            # Scenario in which environment and environment compute are
+            # both present but don't evaluate to True is fairly
+            # innocuous, even common if outside of the looper context.
+            _LOGGER.debug("Environment = {}".format(self.environment))
+
+        return False
+
+
+    def set_project_permissions(self):
+        """
+        Make the project's public_html folder executable.
+        """
+        try:
+            _os.chmod(self.trackhubs.trackhub_dir, 0o0755)
+        except OSError:
+            # Quietly ignore a failure to change the folder's mode.
+            pass
 
 
     def update_environment(self, env_settings_file):
@@ -730,13 +1312,11 @@ def update_environment(self, env_settings_file):
         :param str env_settings_file: path to file with new environment
             configuration data
         """
-        if not env_settings_file:
-            return
-        with open(env_settings_file, 'r') as handle:
+        with open(env_settings_file, 'r') as f:
             _LOGGER.info("Loading %s: %s",
                          self.compute_env_var, env_settings_file)
-            env_settings = yaml.load(handle)
+            env_settings = yaml.load(f)
             _LOGGER.debug("Parsed environment settings: %s",
                           str(env_settings))
 
@@ -761,432 +1341,40 @@ def update_environment(self, env_settings_file):
             self.environment_file = env_settings_file
 
 
-    def make_project_dirs(self):
-        """
-        Creates project directory structure if it doesn't exist.
-        """
-        for folder_name in self.project_folders:
-            folder_path = self.metadata[folder_name]
-            _LOGGER.debug("Ensuring project dir exists: '%s'", folder_path)
-            if not _os.path.exists(folder_path):
-                _LOGGER.debug("Attempting to create project folder: '%s'",
-                              folder_path)
-                try:
-                    _os.makedirs(folder_path)
-                except OSError as e:
-                    _LOGGER.warn("Could not create project folder: '%s'",
-                                 str(e))
-
-
-    def set_project_permissions(self):
-        """
-        Makes the project's public_html folder executable.
-        """
-        for d in [self.trackhubs.trackhub_dir]:
-            try:
-                _os.chmod(d, 0o0755)
-            except OSError:
-                # This currently does not fail now
-                # ("cannot change folder's mode: %s" % d)
-                continue
-
-
-    def set_compute(self, setting):
-        """
-        Set the compute attributes according to the
-        specified settings in the environment file.
-
-        :param str setting: name for non-resource compute bundle, the name of
-            a subsection in an environment configuration file
-        :return bool: success flag for attempt to establish compute settings
-        """
-
-        # Hope that environment & environment compute are present.
-        if setting and self.environment and "compute" in self.environment:
-
-            # Augment compute, creating it if needed
-            if self.compute is None:
-                _LOGGER.debug("Creating Project compute")
-                self.compute = AttributeDict()
-            _LOGGER.debug("Adding entries for setting '%s'", setting)
-            self.compute.add_entries(self.environment.compute[setting])
-
-            # Ensure submission template is absolute.
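
# NOTE: a sketch (not part of the diff) of the shape of environment data that
# set_compute expects: a "compute" mapping keyed by setting name. The setting
# names and paths here are hypothetical:
environment = {
    "compute": {
        "default": {"submission_template": "templates/localhost_template.sub",
                    "submission_command": "sh"},
        "slurm": {"submission_template": "templates/slurm_template.sub",
                  "submission_command": "sbatch"},
    },
}
# prj.set_compute("slurm") would merge the "slurm" entries into prj.compute,
# absolutizing submission_template relative to the environment file's folder.
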
- if not _os.path.isabs(self.compute.submission_template): - try: - self.compute.submission_template = _os.path.join( - _os.path.dirname(self.environment_file), - self.compute.submission_template) - except AttributeError as e: - # Environment and environment compute should at least have been - # set as null-valued attributes, so execution here is an error. - _LOGGER.error(str(e)) - # Compute settings have been established. - else: - return True - else: - # Scenario in which environment and environment compute are - # both present but don't evaluate to True is fairly - # innocuous, even common if outside of the looper context. - _LOGGER.debug("Environment = {}".format(self.environment)) - - return False - - - def get_arg_string(self, pipeline_name): - """ - For this project, given a pipeline, return an argument string - specified in the project config file. - """ - - def make_optarg_text(opt, arg): - """ Transform flag/option into CLI-ready text version. """ - return "{} {}".format(opt, _os.path.expandvars(arg)) \ - if arg else opt - - def create_argtext(name): - """ Create command-line argstring text from config section. """ - try: - optargs = getattr(self.pipeline_args, name) - except AttributeError: - return "" - # NS using __dict__ will add in the metadata from AttrDict (doh!) - _LOGGER.debug("optargs.items(): {}".format(optargs.items())) - optargs_texts = [make_optarg_text(opt, arg) - for opt, arg in optargs.items()] - _LOGGER.debug("optargs_texts: {}".format(optargs_texts)) - # TODO: may need to fix some spacing issues here. - return " ".join(optargs_texts) - - default_argtext, pipeline_argtext = \ - create_argtext(DEFAULT_COMPUTE_RESOURCES_NAME), create_argtext(pipeline_name) - - if not pipeline_argtext: - # The project config may not have an entry for this pipeline; - # no problem! There are no pipeline-specific args. Return text - # from default arguments, whether empty or not. - return default_argtext - elif default_argtext: - # Non-empty pipeline-specific and default argtext - return " ".join([default_argtext, pipeline_argtext]) - else: - # No default argtext, but non-empty pipeline-specific argtext - return pipeline_argtext - - - def add_sample_sheet(self, csv=None): - """ - Build a `SampleSheet` object from a csv file and - add it and its samples to the project. - - :param csv: Path to csv file. - :type csv: str - """ - - _LOGGER.debug("Adding sample sheet") - - # Make SampleSheet object - # By default read sample_annotation, but allow explict CSV arg. - self.sheet = SampleSheet(csv or self.metadata.sample_annotation) - - # Pair project and sheet. - self.sheet.prj = self - - # Generate sample objects from annotation sheet. - _LOGGER.debug("Creating samples from annotation sheet") - self.sheet.make_samples() - - # Add samples to Project - for sample in self.sheet.samples: - # Overwritten later if merged - sample.merged = False - self.add_sample(sample) # Appends sample to self.samples. - - # Merge sample files (!) using merge table if provided: - if hasattr(self.metadata, "merge_table"): - if self.metadata.merge_table is not None: - if _os.path.isfile(self.metadata.merge_table): - # read in merge table - - merge_table = _pd.read_table( - self.metadata.merge_table, - sep=None, index_col=False, engine="python") - - if SAMPLE_NAME_COLNAME not in merge_table.columns: - raise KeyError( - "Merge table requires a column named '{}'.". 
- format(SAMPLE_NAME_COLNAME)) - - for sample in self.sheet.samples: - sample_indexer = \ - merge_table[SAMPLE_NAME_COLNAME] == sample.name - merge_rows = merge_table[sample_indexer] - - # Check if there are rows in the - # merge table for this sample: - if len(merge_rows) > 0: - # For each row in the merge table of this sample: - # 1) populate any derived columns - # 2) derived columns --> space-delimited strings - # 3) update the sample values with the merge table - - # Keep track of merged cols, - # so we don't re-derive them later. - merged_cols = { - key: "" for key in merge_rows.columns} - for _, row in merge_rows.iterrows(): - row_dict = row.to_dict() - for col in merge_rows.columns: - if col == SAMPLE_NAME_COLNAME or \ - col not in self.derived_columns: - continue - # Initialize key in parent dict. - col_key = col + COL_KEY_SUFFIX - merged_cols[col_key] = "" - row_dict[col_key] = row_dict[col] - row_dict[col] = sample.locate_data_source( - col, row_dict[col], row_dict) # 1) - - # Also add in any derived cols present. - for col in self.derived_columns: - # Skip over attributes that the sample - # either lacks, and those covered by the - # data from the current (row's) data. - if not hasattr(sample, col) or \ - col in row_dict: - continue - # Map column name key to sample's value - # for the attribute given by column name. - col_key = col + COL_KEY_SUFFIX - row_dict[col_key] = getattr(sample, col) - # Map the column name itself to the - # populated data source template string. - row_dict[col] = sample.locate_data_source( - col, getattr(sample, col), row_dict) - _LOGGER.debug( - "PROBLEM adding derived column: " - "{}, {}, {}".format(col, - row_dict[col], getattr(sample, col))) - - # Since we are now jamming multiple (merged) - # entries into a single attribute, we have to - # join them into a space-delimited string - # and then set to sample attribute. - for key, val in row_dict.items(): - if key == SAMPLE_NAME_COLNAME or not val: - continue - _LOGGER.debug("merge: sample '%s'; %s=%s", - str(sample.name), - str(key), str(val)) - if not key in merged_cols: - new_val = str(val).rstrip() - else: - new_val = "{} {}".format( - merged_cols[key], str(val)).strip() - merged_cols[key] = new_val # 2) - - # Don't update sample_name. - merged_cols.pop(SAMPLE_NAME_COLNAME, None) - - sample.update(merged_cols) # 3) - sample.merged = True # mark sample as merged - sample.merged_cols = merged_cols - - # With all samples, prepare file paths. - for sample in self.sheet.samples: - if hasattr(sample, "organism"): - sample.get_genome_transcriptome() - sample.set_file_paths() - # Hack for backwards-compatibility - # Pipelines should now use `data_source`) - try: - sample.data_path = sample.data_source - except AttributeError: - _LOGGER.debug("Sample '%s' lacks data source --> skipping " - "data path assignment", sample.sample_name) - - - def add_sample(self, sample): - """ - Adds a sample to the project's `samples`. - """ - # Check sample is Sample object - if not isinstance(sample, Sample): - raise TypeError("Provided object is not a Sample object.") - - # Tie sample and project bilaterally - sample.prj = self - # Append - self.samples.append(sample) - - - -@copy -class SampleSheet(object): - """ - Class to model a sample annotation sheet. - - :param path: Path to sample file. - :type path: str - :param dtype: Data type to read sample file as. Default is str. - :type dtype: type - - :Example: - - .. 
code-block:: python - - from models import Project, SampleSheet - prj = Project("config.yaml") - sheet = SampleSheet("sheet.csv") - """ - - def __init__(self, path, dtype=str): - super(SampleSheet, self).__init__() - self.df = self.check_sheet(path, dtype) - self.path = path - self.samples = list() - - def __repr__(self): - if hasattr(self, "prj"): - return "SampleSheet for project '%s' with %i samples." % \ - (self.prj, len(self.df)) - else: - return "SampleSheet with %i samples." % len(self.df) - - - @staticmethod - def check_sheet(sample_file, dtype): - """ - Check if csv file exists and has all required columns. - - :param str sample_file: path to sample annotations file. - :param type dtype: data type for CSV read. - :raises IOError: if given annotations file can't be read. - :raises ValueError: if required column(s) is/are missing. - """ - - df = _pd.read_table(sample_file, sep=None, dtype=dtype, - index_col=False, engine="python") - req = [SAMPLE_NAME_COLNAME] - missing = set(req) - set(df.columns) - if len(missing) != 0: - raise ValueError( - "Annotation sheet ('{}') is missing column(s): {}; has: {}". - format(sample_file, missing, df.columns)) - return df - - - @staticmethod - def alpha_cased(text, lower=False): - """ - Filter text to just letters and homogenize case. - - :param str text: what to filter and homogenize. - :param bool lower: whether to convert to lowercase; default uppercase. - :return str: input filtered to just letters, with homogenized case. - """ - text = "".join(filter(lambda c: c.isalpha(), text)) - return text.lower() if lower else text.upper() - - - def make_samples(self): - """ - Create samples from annotation sheet (considering library), - and them to the project. - """ + def _ensure_absolute(self, maybe_relpath): + """ Ensure that a possibly relative path is absolute. """ + _LOGGER.log(5, "Ensuring absolute: '%s'", maybe_relpath) + if _os.path.isabs(maybe_relpath) or is_url(maybe_relpath): + _LOGGER.log(5, "Already absolute") + return maybe_relpath + # Maybe we have env vars that make the path absolute? + expanded = _os.path.expandvars(maybe_relpath) + _LOGGER.log(5, "Expanded: '%s'", expanded) + if _os.path.isabs(expanded): + _LOGGER.log(5, "Expanded is absolute") + return expanded + _LOGGER.log(5, "Making non-absolute path '%s' be absolute", + maybe_relpath) + # Set path to an absolute path, relative to project config. + config_dirpath = _os.path.dirname(self.config_file) + _LOGGER.log(5, "config_dirpath: %s", config_dirpath) + abs_path = _os.path.join(config_dirpath, maybe_relpath) + return abs_path - found_pipelines = False - try: - import pipelines # Use a pipelines package if installed. - except ImportError: - # pipelines_dir is optional. - pipeline_dirpaths = getattr( - self.prj.metadata, "pipelines_dir", None) - if pipeline_dirpaths: - if isinstance(pipeline_dirpaths, str): - pipeline_dirpaths = [pipeline_dirpaths] - sys.path.extend(pipeline_dirpaths) - _LOGGER.debug( - "Added {} pipelines path(s) to sys.path: {}". - format(len(pipeline_dirpaths), pipeline_dirpaths)) - try: - import pipelines - except ImportError: - pass - else: - found_pipelines = True - else: - found_pipelines = True - if not found_pipelines: - # Just return a basic Sample for each of the sheet's rows. - def make_sample(data): - return Sample(data) + def _handle_missing_env_attrs(self, env_settings_file, when_missing): + """ Default environment settings aren't required; warn, though. 
""" + missing_env_attrs = \ + [attr for attr in ["environment", "environment_file"] + if not hasattr(self, attr) or getattr(self, attr) is None] + if not missing_env_attrs: + return + message = "'{}' lacks environment attributes: {}".\ + format(env_settings_file, missing_env_attrs) + if when_missing is None: + _LOGGER.warn(message) else: - # Attempt creation of Sample subtype specific to protocol. - - # Get all pipelines package Sample subclasses. - import inspect - from utils import fetch_package_classes - sample_types = fetch_package_classes(pipelines, - lambda maybe_class: inspect.isclass(maybe_class) - and issubclass(maybe_class, Sample)) - - # TODO: perhaps modify or alter handling of need for __library__. - pairing = {self.alpha_cased(sample_class.__library__): sample_class - for sample_type, sample_class in sample_types} - - def make_sample(data): - try: - return pairing[self.alpha_cased(data.library)](data) - except (AttributeError, KeyError): - return Sample(data) - - for _, row in self.df.iterrows(): - self.samples.append(make_sample(row.dropna())) - - - def as_data_frame(self): - """ - Returns a `pandas.DataFrame` representation of self. - """ - return _pd.DataFrame([s.as_series() for s in self.samples]) - - - def write(self, path, sep=None): - """ - Saves an annotation sheet from the samples. - - :param path: Path to file to be written. - :type path: str - :param sep: Delimiter to use in the file written. - :type sep: str - - :Example: - - .. code-block:: python - - from models import SampleSheet - sheet = SampleSheet("/projects/example/sheet.csv") - sheet.write("~/projects/example/sheet2.csv") - """ - - valid_types = [".txt", ".tsv", ".csv"] - - # Infer delimiter if needed. - if sep is None: - file_type = _os.path.splitext(path)[1].lower() - if file_type not in valid_types: - help_msg = "Provide an argument for parameter 'sep' or pass a " \ - "filepath with an extension in: {}".\ - format(valid_types) - raise ValueError(help_msg) - sep = "," if file_type == ".csv" else "\t" - - # Convert to frame and write to disk. - with open(path, 'w') as sheetfile: - # TODO: decide which--if any--attributes to drop here. - self.as_data_frame().to_csv(sheetfile, sep=sep, index=False) + when_missing(message) @@ -1196,7 +1384,7 @@ class Sample(object): Class to model Samples based on a pandas Series. :param series: Sample's data. - :type series: pandas.core.series.Series + :type series: Mapping | pandas.core.series.Series :Example: @@ -1226,6 +1414,8 @@ def __init__(self, series): if isinstance(series, _pd.Series): series = series.to_dict() + elif isinstance(series, Sample): + series = series.as_series().to_dict() # Set series attributes on self. for key, value in series.items(): @@ -1247,17 +1437,21 @@ def __init__(self, series): self.required_paths = None self.yaml_file = None + # Not yet merged, potentially toggled when merge step is considered. + self.merged = False + # Sample dirs - self.paths = Paths() # Only when sample is added to project, can paths be added - # This is because sample-specific files will be created in a - # data root directory dependent on the project. - # The SampleSheet object, after being added to a project, will - # call Sample.set_file_paths(). 
+ self.paths = Paths() - def __repr__(self): - return "Sample '{}'".format(self.name) + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + + def __ne__(self, other): + return not self == other def __getitem__(self, item): @@ -1270,12 +1464,18 @@ def __getitem__(self, item): raise KeyError(item) - def update(self, newdata): + def __repr__(self): + return "Sample '{}'".format(self.name) + + + def as_series(self): """ - Update Sample object with attributes from a dict. + Returns a `pandas.Series` object with all the sample's attributes. + + :return pandas.core.series.Series: pandas Series representation + of this Sample, with its attributes. """ - for key, value in newdata.items(): - setattr(self, key, value) + return _pd.Series(self.__dict__) def check_valid(self, required=None): @@ -1295,6 +1495,87 @@ def check_valid(self, required=None): return lacking + def determine_missing_requirements(self): + """ + Determine which of this Sample's required attributes/files are missing. + + :return (type, str): hypothetical exception type along with message + about what's missing; null and empty if nothing exceptional + is detected + """ + + # set_pipeline_attributes must be run first. + if not hasattr(self, "required_inputs"): + _LOGGER.warn("You must run set_pipeline_attributes " + "before determine_missing_requirements") + return None, "" + + if not self.required_inputs: + _LOGGER.debug("No required inputs") + return None, "" + + # First, attributes + missing, empty = [], [] + for file_attribute in self.required_inputs_attr: + _LOGGER.log(5, "Checking '{}'".format(file_attribute)) + try: + attval = getattr(self, file_attribute) + except AttributeError: + _LOGGER.log(5, "Missing required input attribute '%s'", + file_attribute) + missing.append(file_attribute) + continue + if attval == "": + _LOGGER.log(5, "Empty required input attribute '%s'", + file_attribute) + empty.append(file_attribute) + else: + _LOGGER.log(5, "'{}' is valid: '{}'". + format(file_attribute, attval)) + + if missing or empty: + return AttributeError, \ + "Missing attributes: {}. Empty attributes: {}".\ + format(missing, empty) + + # Second, files + missing_files = [] + for paths in self.required_inputs: + _LOGGER.log(5, "Text to split and check paths: '%s'", paths) + # There can be multiple, space-separated values here. + for path in paths.split(" "): + _LOGGER.log(5, "Checking path: '{}'".format(path)) + if not _os.path.exists(path): + _LOGGER.log(5, "Missing required input file: '{}'". + format(path)) + missing_files.append(path) + + if not missing_files: + return None, "" + else: + missing_message = \ + "Missing file(s): {}".format(", ".join(missing_files)) + return IOError, missing_message + + + def generate_filename(self, delimiter="_"): + """ + Create a name for file in which to represent this Sample. + + This uses knowledge of the instance's subtype, sandwiching a delimiter + between the name of this Sample and the name of the subtype before the + extension. If the instance is a base Sample type, then the filename + is simply the sample name with an extension. 
+
+        :param str delimiter: what to place between sample name and name of
+            subtype; this is only relevant if the instance is of a subclass
+        :return str: name for file with which to represent this Sample on disk
+        """
+        base = self.name if type(self) is Sample else \
+            "{}{}{}".format(self.name, delimiter, self.__class__.__name__)
+        return "{}.yaml".format(base)
+
+
     def generate_name(self):
         """
         Generate name for the sample by joining some of its attribute strings.
@@ -1302,88 +1583,134 @@ def generate_name(self):
         raise NotImplementedError("Not implemented in new code base.")
 
 
-    def as_series(self):
+    def get_attr_values(self, attrlist):
         """
-        Returns a `pandas.Series` object with all the sample's attributes.
+        Get value corresponding to each given attribute.
 
-        :return pandas.core.series.Series: pandas Series representation
-            of this Sample, with its attributes.
+        :param str attrlist: name of an attribute storing a list of attr names
+        :return list | NoneType: value (or empty string) corresponding to
+            each named attribute; null if this Sample's value for the
+            attribute given by the argument to the "attrlist" parameter is
+            empty/null, or if this Sample lacks the indicated attribute
         """
-        return _pd.Series(self.__dict__)
+        # If attribute is None, then value is also None.
+        attribute_list = getattr(self, attrlist, None)
+        if not attribute_list:
+            return None
+
+        if not isinstance(attribute_list, list):
+            attribute_list = [attribute_list]
+
+        # Strings contained here are appended later so shouldn't be null.
+        return [getattr(self, attr, "") for attr in attribute_list]
 
 
-    def to_yaml(self, path=None):
+    def get_sheet_dict(self):
         """
-        Serializes itself in YAML format.
+        Create K-V pairs for items originally passed in via the sample sheet.
 
-        :param str path: A file path to write yaml to.
+        This is useful for summarizing; it provides a representation of the
+        sample that excludes things like config files and derived entries.
+
+        :return OrderedDict: mapping from name to value for data elements
+            originally provided via the sample sheet (i.e., a map-like
+            representation of the instance, excluding derived items)
         """
-        def obj2dict(obj, to_skip=("samples", "sheet", "sheet_attributes")):
-            """
-            Build representation of object as a dict, recursively
-            for all objects that might be attributes of self.
+        return _OrderedDict([[k, getattr(self, k)]
+                             for k in self.sheet_attributes])
 
-            :param object obj: what to serialize to write to YAML.
-            :param Iterable[str] to_skip: names of attributes to ignore.
-\            """
-            if isinstance(obj, list):
-                return [obj2dict(i) for i in obj]
-            if isinstance(obj, AttributeDict):
-                return {k: obj2dict(v) for k, v in obj.__dict__.items()
-                        if k not in to_skip and
-                        (k not in ATTRDICT_METADATA or
-                         v != ATTRDICT_METADATA[k])}
-            elif isinstance(obj, Mapping):
-                return {k: obj2dict(v)
-                        for k, v in obj.items() if k not in to_skip}
-            elif isinstance(obj, (Paths, Sample)):
-                return {k: obj2dict(v)
-                        for k, v in obj.__dict__.items() if k not in to_skip}
-            elif hasattr(obj, 'dtype'):  # numpy data types
-                # TODO: this fails with ValueError for multi-element array.
-                return obj.item()
-            elif _pd.isnull(obj):
-                # Missing values as evaluated by pd.isnull().
-                # This gets correctly written into yaml.
- return "NaN" - else: - return obj - # If path is not specified, use default: - # prj.metadata.submission_dir + sample_name + yaml - self.yaml_file = path or \ - _os.path.join(self.prj.metadata.submission_subdir, - self.sample_name + ".yaml") - serial = obj2dict(self) - with open(self.yaml_file, 'w') as outfile: - outfile.write(yaml.safe_dump(serial, default_flow_style=False)) + def infer_columns(self, implications): + """ + Infer value for additional field(s) from other field(s). + + Add columns/fields to the sample based on values in those already-set + that the sample's project defines as indicative of implications for + additional data elements for the sample. + + :param Mapping implications: Project's implied columns data + :return None: this function mutates state and is strictly for effect + """ + + _LOGGER.log(5, "Sample attribute implications: {}". + format(implications)) + if not implications: + return + + for implier_name, implied in implications.items(): + _LOGGER.debug( + "Setting Sample variable(s) implied by '%s'", implier_name) + try: + implier_value = self[implier_name] + except KeyError: + _LOGGER.debug("No '%s' for this sample", implier_name) + continue + try: + implied_value_by_column = implied[implier_value] + _LOGGER.debug("Implications for '%s' = %s: %s", + implier_name, implier_value, + str(implied_value_by_column)) + for colname, implied_value in \ + implied_value_by_column.items(): + _LOGGER.log(5, "Setting '%s'=%s", + colname, implied_value) + setattr(self, colname, implied_value) + except KeyError: + _LOGGER.log( + 5, "Unknown implied value for implier '%s' = '%s'", + implier_name, implier_value) + + + def is_dormant(self): + """ + Determine whether this Sample is inactive. + By default, a Sample is regarded as active. That is, if it lacks an + indication about activation status, it's assumed to be active. If, + however, and there's an indication of such status, it must be '1' + in order to be considered switched 'on.' - def locate_data_source(self, column_name=DATA_SOURCE_COLNAME, + :return bool: whether this Sample's been designated as dormant + """ + try: + flag = self[SAMPLE_EXECUTION_TOGGLE] + except KeyError: + # Regard default Sample state as active. + return False + # If specified, the activation flag must be set to '1'. + return flag != "1" + + + def locate_data_source(self, data_sources, column_name=DATA_SOURCE_COLNAME, source_key=None, extra_vars=None): """ - Uses the template path provided in the project config section - "data_sources" to piece together an actual path by substituting + Uses the template path provided in the project config section + "data_sources" to piece together an actual path by substituting variables (encoded by "{variable}"") with sample attributes. - :param str column_name: Name of sample attribute + :param Mapping data_sources: mapping from key name (as a value in + a cell of a tabular data structure) to, e.g., filepath + :param str column_name: Name of sample attribute (equivalently, sample sheet column) specifying a derived column. - :param str source_key: The key of the data_source, - used to index into the project config data_sources section. - By default, the source key will be taken as the value of - the specified column (as a sample attribute). - For cases where the sample doesn't have this attribute yet + :param str source_key: The key of the data_source, + used to index into the project config data_sources section. 
+ By default, the source key will be taken as the value of + the specified column (as a sample attribute). + For cases where the sample doesn't have this attribute yet (e.g. in a merge table), you must specify the source key. - :param dict extra_vars: By default, this will look to - populate the template location using attributes found in the - current sample; however, you may also provide a dict of extra - variables that can also be used for variable replacement. + :param dict extra_vars: By default, this will look to + populate the template location using attributes found in the + current sample; however, you may also provide a dict of extra + variables that can also be used for variable replacement. These extra variables are given a higher priority. :return str: regex expansion of data source specified in configuration, with variable substitutions made + :raises ValueError: if argument to data_sources parameter is null/empty """ - sources_section = "data_sources" + if not data_sources: + # TODO: should this be a null/empty-string return, or actual error? + raise ValueError("No data sources") if not source_key: try: @@ -1392,15 +1719,16 @@ def locate_data_source(self, column_name=DATA_SOURCE_COLNAME, reason = "'{attr}': to locate sample's data source, provide " \ "the name of a key from '{sources}' or ensure " \ "sample has attribute '{attr}'".format( - attr=column_name, sources=sources_section) + attr=column_name, sources=DATA_SOURCES_SECTION) raise AttributeError(reason) try: - regex = self.prj[sources_section][source_key] + regex = data_sources[source_key] except KeyError: _LOGGER.warn( "Config lacks entry for data_source key: '{}' " - "(in column: '{}')".format(source_key, column_name)) + "in column '{}'; known: {}".format( + source_key, column_name, data_sources.keys())) return "" # Populate any environment variables like $VAR with os.environ["VAR"] @@ -1429,125 +1757,94 @@ def locate_data_source(self, column_name=DATA_SOURCE_COLNAME, return val - def get_genome_transcriptome(self): + def make_sample_dirs(self): """ - Get genome and transcriptome, based on project config file. - If not available (matching config), genome and transcriptome will be set to sample.organism. + Creates sample directory structure if it doesn't exist. """ - try: - self.genome = getattr(self.prj.genomes, self.organism) - except AttributeError: - _LOGGER.debug("Project config lacks genome mapping for " - "organism '%s'", str(self.organism)) - try: - self.transcriptome = getattr(self.prj.transcriptomes, self.organism) - except AttributeError: - _LOGGER.debug("Project config lacks transcriptome mapping for " - "organism '%s'", str(self.organism)) + for path in self.paths: + if not _os.path.exists(path): + _os.makedirs(path) - def set_file_paths(self): + def set_file_paths(self, project): """ Sets the paths of all files for this sample. + + :param Project project: object with pointers to data paths and such """ # Any columns specified as "derived" will be constructed # based on regex in the "data_sources" section of project config. - for col in self.prj.derived_columns: + for col in project.derived_columns: # Only proceed if the specified column exists - # and was not already merged or derived. - if hasattr(self, col) and col not in self.merged_cols \ - and col not in self.derived_cols_done: - # Set a variable called {col}_key, so the - # original source can also be retrieved. 
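
# NOTE: a sketch (not part of the diff) of how a derived column is resolved
# by locate_data_source, given a data_sources mapping with a templated path;
# the key and template here are hypothetical:
data_sources = {"bsf_sample": "/scratch/lab/{flowcell}/{sample_name}.bam"}
# For a sample with flowcell="BSF_0123" and sample_name="frog_1", calling
#   sample.locate_data_source(data_sources, column_name="data_source",
#                             source_key="bsf_sample")
# yields "/scratch/lab/BSF_0123/frog_1.bam".
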
- setattr(self, col + COL_KEY_SUFFIX, getattr(self, col)) - setattr(self, col, self.locate_data_source(col)) - self.derived_cols_done.append(col) - - self.infer_columns() - - # Parent - self.results_subdir = self.prj.metadata.results_subdir - self.paths.sample_root = _os.path.join( - self.prj.metadata.results_subdir, self.sample_name) - - # Track url - bigwig_filename = self.name + ".bigWig" - try: - # Project's public_html folder - self.bigwig = _os.path.join( - self.prj.trackhubs.trackhub_dir, bigwig_filename) - self.track_url = \ - "{}/{}".format(self.prj.trackhubs.url, bigwig_filename) - except: - _LOGGER.debug("No trackhub/URL") - pass - - - def infer_columns(self): - """ - Infer value for additional field(s) from other field(s). - - Add columns/fields to the sample based on values in those already-set - that the sample's project defines as indicative of implications for - additional data elements for the sample. - - :return None: this function mutates state and is strictly for effect - """ - if not hasattr(self.prj, IMPLICATIONS_DECLARATION): - return - - impliers = self.prj[IMPLICATIONS_DECLARATION] - - _LOGGER.debug( - "Sample variable(s) that can imply others: %s", str(impliers)) - for implier_name, implied in impliers.items(): - _LOGGER.debug( - "Setting Sample variable(s) implied by '%s'", implier_name) - try: - implier_value = self[implier_name] - except KeyError: - _LOGGER.debug("No '%s' for this sample", implier_name) - continue - try: - implied_value_by_column = implied[implier_value] - _LOGGER.debug("Implications for '%s' = %s: %s", - implier_name, implier_value, - str(implied_value_by_column)) - for colname, implied_value in \ - implied_value_by_column.items(): - _LOGGER.log(5, "Setting '%s'=%s", - colname, implied_value) - setattr(self, colname, implied_value) - except KeyError: - _LOGGER.log( - 5, "Unknown implied value for implier '%s' = '%s'", - implier_name, implier_value) + # and was not already merged or derived. + if hasattr(self, col) and col not in self.merged_cols \ + and col not in self.derived_cols_done: + # Set a variable called {col}_key, so the + # original source can also be retrieved. + setattr(self, col + COL_KEY_SUFFIX, getattr(self, col)) + setattr(self, col, self.locate_data_source( + data_sources=project.get(DATA_SOURCES_SECTION), + column_name=col)) + self.derived_cols_done.append(col) + self.infer_columns(implications=project.get(IMPLICATIONS_DECLARATION)) - def make_sample_dirs(self): - """ - Creates sample directory structure if it doesn't exist. - """ - for path in self.paths: - if not _os.path.exists(path): - _os.makedirs(path) + # Parent + self.results_subdir = project.metadata.results_subdir + self.paths.sample_root = _os.path.join( + project.metadata.results_subdir, self.sample_name) + # Track url + bigwig_filename = self.name + ".bigWig" + try: + # Project's public_html folder + self.bigwig = _os.path.join( + project.trackhubs.trackhub_dir, bigwig_filename) + self.track_url = \ + "{}/{}".format(project.trackhubs.url, bigwig_filename) + except: + _LOGGER.debug("No trackhub/URL") + pass - def get_sheet_dict(self): + + def set_genome(self, genomes): """ - Create a K-V pairs for items originally passed in via the sample sheet. + Set the genome for this Sample. + + :param Mapping[str, str] genomes: genome assembly by organism name + """ + self._set_assembly("genome", genomes) - This is useful for summarizing; it provides a representation of the - sample that excludes things like config files and derived entries. 
- :return OrderedDict: mapping from name to value for data elements - originally provided via the sample sheet (i.e., the a map-like - representation of the instance, excluding derived items) + def set_transcriptome(self, transcriptomes): """ - return _OrderedDict([[k, getattr(self, k)] - for k in self.sheet_attributes]) + Set the transcriptome for this Sample. + :param Mapping[str, str] transcriptomes: transcriptome assembly by + organism name + """ + self._set_assembly("transcriptome", transcriptomes) + + + def _set_assembly(self, ome, assemblies): + if not assemblies: + _LOGGER.debug("Empty/null assemblies mapping: {} ({})". + format(assemblies, type(assemblies))) + return + try: + assembly = assemblies[self.organism] + except AttributeError: + _LOGGER.debug("Sample '%s' lacks organism attribute", self.name) + assembly = None + except KeyError: + _LOGGER.log(5, "Unknown {} value: '{}'". + format(ome, self.organism)) + assembly = None + _LOGGER.log(5, "Setting {} as {} on sample: '{}'". + format(assembly, ome, self.name)) + setattr(self, ome, assembly) + def set_pipeline_attributes( self, pipeline_interface, pipeline_name, permissive=True): @@ -1571,108 +1868,37 @@ def set_pipeline_attributes( # Settings ending in _attr are lists of attribute keys. # These attributes are then queried to populate values # for the primary entries. - self.ngs_inputs_attr = pipeline_interface.get_attribute( - pipeline_name, "ngs_input_files") - self.required_inputs_attr = pipeline_interface.get_attribute( - pipeline_name, "required_input_files") - self.all_inputs_attr = pipeline_interface.get_attribute( - pipeline_name, "all_input_files") - + req_attr_names = [("ngs_input_files", "ngs_inputs_attr"), + ("required_input_files", "required_inputs_attr"), + ("all_input_files", "all_inputs_attr")] + for name_src_attr, name_dst_attr in req_attr_names: + _LOGGER.log(5, "Value of '%s' will be assigned to '%s'", + name_src_attr, name_dst_attr) + value = pipeline_interface.get_attribute( + pipeline_name, name_src_attr) + _LOGGER.log(5, "Assigning '{}': {}".format(name_dst_attr, value)) + setattr(self, name_dst_attr, value) + + # Post-processing of input attribute assignments. + # Ensure that there's a valid all_inputs_attr. + if not self.all_inputs_attr: + self.all_inputs_attr = self.required_inputs_attr + # Convert attribute keys into values. if self.ngs_inputs_attr: + _LOGGER.log(5, "Handling NGS input attributes: '%s'", self.name) # NGS data inputs exit, so we can add attributes like # read_type, read_length, paired. self.ngs_inputs = self.get_attr_values("ngs_inputs_attr") self.set_read_type(permissive=permissive) + else: + _LOGGER.log(5, "No NGS inputs: '%s'", self.name) - # input_size - if not self.all_inputs_attr: - self.all_inputs_attr = self.required_inputs_attr - - # Convert attribute keys into values + # Assign values for actual inputs attributes. self.required_inputs = self.get_attr_values("required_inputs_attr") self.all_inputs = self.get_attr_values("all_inputs_attr") self.input_file_size = get_file_size(self.all_inputs) - def confirm_required_inputs(self, permissive=False): - - # set_pipeline_attributes must be run first. 
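
# NOTE: a hypothetical usage sketch (not part of the diff). The new
# determine_missing_requirements (added earlier) replaces the removed
# confirm_required_inputs below: rather than raising or returning a flag
# itself, it hands back an exception type and a message for the caller to
# act on:
def check_requirements(sample, permissive=True):
    exc_type, message = sample.determine_missing_requirements()
    if exc_type is not None:
        if not permissive:
            raise exc_type(message)
        print(message)  # or log a warning instead
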
- if not hasattr(self, "required_inputs"): - _LOGGER.warn("You must run set_pipeline_attributes " - "before confirm_required_inputs") - return True - - if not self.required_inputs: - _LOGGER.debug("No required inputs") - return True - - # First, attributes - for file_attribute in self.required_inputs_attr: - _LOGGER.debug("Checking '{}'".format(file_attribute)) - if not hasattr(self, file_attribute): - message = "Missing required input attribute '{}'".\ - format(file_attribute) - _LOGGER.warn(message) - if not permissive: - raise IOError(message) - else: - return False - if getattr(self, file_attribute) is "": - message = "Empty required input attribute '{}'".\ - format(file_attribute) - _LOGGER.warn(message) - if not permissive: - raise IOError(message) - else: - return False - - # Second, files - missing_files = [] - for paths in self.required_inputs: - # There can be multiple, space-separated values here. - for path in paths.split(" "): - _LOGGER.debug("Checking path: '{}'".format(path)) - if not _os.path.exists(path): - _LOGGER.warn("Missing required input file: '{}'".format(path)) - missing_files.append(path) - - if len(missing_files) > 0: - message = "Missing/unreadable file(s): {}".\ - format(", ".join(["'{}'".format(path) - for path in missing_files])) - if not permissive: - raise IOError(message) - else: - _LOGGER.error(message) - return False - - return True - - - def get_attr_values(self, attrlist): - """ - Get value corresponding to each given attribute. - - :param str attrlist: name of an attribute storing a list of attr names - :return list: value (or empty string) corresponding to each named attr - """ - if not hasattr(self, attrlist): - return None - - attribute_list = getattr(self, attrlist) - - # If attribute is None, then value is also None. - if not attribute_list: - return None - - if not isinstance(attribute_list, list): - attribute_list = [attribute_list] - - # Strings contained here are appended later so shouldn't be null. - return [getattr(self, attr) if hasattr(self, attr) else "" - for attr in attribute_list] - - def set_read_type(self, n=10, permissive=True): """ For a sample with attr `ngs_inputs` set, this sets the @@ -1769,6 +1995,95 @@ def set_read_type(self, n=10, permissive=True): feature, self.name) + def to_yaml(self, path=None, subs_folder_path=None, delimiter="_"): + """ + Serializes itself in YAML format. + + :param str path: A file path to write yaml to; provide this or + the subs_folder_path + :param str subs_folder_path: path to folder in which to place file + that's being written; provide this or a full filepath + :param str delimiter: text to place between the sample name and the + suffix within the filename; irrelevant if there's no suffix + :return str: filepath used (same as input if given, otherwise the + path value that was inferred) + :raises ValueError: if neither full filepath nor path to extant + parent directory is provided. + """ + + # Determine filepath, prioritizing anything given, then falling + # back to a default using this Sample's Project's submission_subdir. + # Use the sample name and YAML extension as the file name, + # interjecting a pipeline name as a subfolder within the Project's + # submission_subdir if such a pipeline name is provided. + if not path: + if not subs_folder_path: + raise ValueError( + "To represent {} on disk, provide a full path or a path " + "to a parent (submissions) folder". 
+                    format(self.__class__.__name__))
+            _LOGGER.debug("Creating filename for %s: '%s'",
+                          self.__class__.__name__, self.name)
+            filename = self.generate_filename(delimiter=delimiter)
+            _LOGGER.debug("Filename: '%s'", filename)
+            path = _os.path.join(subs_folder_path, filename)
+
+        _LOGGER.debug("Setting %s filepath: '%s'",
+                      self.__class__.__name__, path)
+        self.yaml_file = path
+
+
+        def obj2dict(obj,
+                     to_skip=("samples", "sheet", "sheet_attributes")):
+            """
+            Build representation of object as a dict, recursively
+            for all objects that might be attributes of self.
+
+            :param object obj: what to serialize to write to YAML.
+            :param Iterable[str] to_skip: names of attributes to ignore.
+            """
+            if isinstance(obj, list):
+                return [obj2dict(i) for i in obj]
+            if isinstance(obj, AttributeDict):
+                return {k: obj2dict(v) for k, v in obj.__dict__.items()
+                        if k not in to_skip and
+                        (k not in ATTRDICT_METADATA or
+                         v != ATTRDICT_METADATA[k])}
+            elif isinstance(obj, Mapping):
+                return {k: obj2dict(v)
+                        for k, v in obj.items() if k not in to_skip}
+            elif isinstance(obj, (Paths, Sample)):
+                return {k: obj2dict(v)
+                        for k, v in obj.__dict__.items() if
+                        k not in to_skip}
+            elif hasattr(obj, 'dtype'):  # numpy data types
+                # TODO: this fails with ValueError for multi-element array.
+                return obj.item()
+            elif _pd.isnull(obj):
+                # Missing values as evaluated by pd.isnull().
+                # This gets correctly written into yaml.
+                return "NaN"
+            else:
+                return obj
+
+        _LOGGER.debug("Serializing %s: '%s'",
+                      self.__class__.__name__, self.name)
+        serial = obj2dict(self)
+        with open(self.yaml_file, 'w') as outfile:
+            _LOGGER.debug("Generating YAML data for %s: '%s'",
+                          self.__class__.__name__, self.name)
+            yaml_data = yaml.safe_dump(serial, default_flow_style=False)
+            outfile.write(yaml_data)
+
+        # Hand back the filepath used, as promised in the docstring.
+        return path
+
+
+    def update(self, newdata):
+        """
+        Update Sample object with attributes from a dict.
+        """
+        for key, value in newdata.items():
+            setattr(self, key, value)
+
 
 @copy
 class PipelineInterface(object):
@@ -1791,28 +2106,64 @@ def __init__(self, config):
             self.pipe_iface_config = config
 
         else:
-            _LOGGER.debug("Parsing '%s' for %s config data",
-                          config, self.__class__.__name__)
+            _LOGGER.debug("Parsing '%s' for PipelineInterface config data",
+                          config)
            self.pipe_iface_file = config
            with open(config, 'r') as f:
                self.pipe_iface_config = yaml.load(f)
 
+        # Ensure that each pipeline path, if provided, is expanded.
+        self._expand_paths()
+
+
+    def __getitem__(self, item):
+        try:
+            return self._select_pipeline(item)
+        except _MissingPipelineConfigurationException:
+            raise KeyError("{} is not a known pipeline; known: {}".
+                           format(item, self.pipe_iface_config.keys()))
+
 
     def __iter__(self):
        return iter(self.pipe_iface_config.items())
 
 
     def __repr__(self):
-        return repr(self.pipe_iface_config)
+        source = self.pipe_iface_file or "Mapping"
+        num_pipelines = len(self.pipe_iface_config)
+        pipelines = ", ".join(self.pipe_iface_config.keys())
+        return "{} from {}, with {} pipeline(s): {}".format(
+            self.__class__.__name__, source, num_pipelines, pipelines)
+
+
+    def _expand_paths(self):
+        for pipe_data in self.pipe_iface_config.values():
+            if "path" in pipe_data:
+                pipe_path = pipe_data["path"]
+                _LOGGER.log(5, "Expanding path: '%s'", pipe_path)
+                pipe_path = expandpath(pipe_path)
+                _LOGGER.log(5, "Expanded: '%s'", pipe_path)
+                pipe_data["path"] = pipe_path
 
 
     @property
     def pipeline_names(self):
+        """
+        Names of pipelines about which this interface is aware.
+ + :return Iterable[str]: names of pipelines about which this + interface is aware + """ return self.pipe_iface_config.keys() @property def pipelines(self): + """ + Keyed collection of pipeline interface data. + + :return Mapping: pipeline interface configuration data + """ return self.pipe_iface_config.values() @@ -1895,15 +2246,28 @@ def file_size_ante(name, data): return rp_data - def get_arg_string(self, pipeline_name, sample): + def get_arg_string(self, pipeline_name, sample, + submission_folder_path="", **null_replacements): """ For a given pipeline and sample, return the argument string :param str pipeline_name: Name of pipeline. :param Sample sample: current sample for which job is being built + :param str submission_folder_path: path to folder in which files + related to submission of this sample will be placed. + :param dict null_replacements: mapping from name of Sample attribute + name to value to use in arg string if Sample attribute's value + is null :return str: command-line argument string for pipeline """ + # It's undesirable to put a null value in the argument string. + default_filepath = _os.path.join( + submission_folder_path, sample.generate_filename()) + _LOGGER.debug("Default sample filepath: '%s'", default_filepath) + proxies = {"yaml_file": default_filepath} + proxies.update(null_replacements) + _LOGGER.debug("Building arguments string") config = self._select_pipeline(pipeline_name) argstring = "" @@ -1917,8 +2281,7 @@ def get_arg_string(self, pipeline_name, sample): for key, value in args.iteritems(): if value is None: - _LOGGER.debug("Null value for opt arg key '%s'", - str(key)) + _LOGGER.debug("Null value for opt arg key '%s'", str(key)) continue try: arg = getattr(sample, value) @@ -1930,7 +2293,19 @@ def get_arg_string(self, pipeline_name, sample): pipeline_name, value, key) raise - _LOGGER.debug("Adding '{}' from attribute '{}' for argument '{}'".format(arg, value, key)) + # It's undesirable to put a null value in the argument string. + if arg is None: + _LOGGER.debug("Null value for Sample attribute: '%s'", value) + try: + arg = proxies[value] + except KeyError: + raise ValueError("No default for null " + "Sample attribute: '{}'".format(value)) + _LOGGER.debug("Found default for '{}': '{}'". + format(value, arg)) + + _LOGGER.debug("Adding '{}' from attribute '{}' for argument '{}'". + format(arg, value, key)) argstring += " " + str(key) + " " + str(arg) # Add optional arguments @@ -1959,14 +2334,19 @@ def get_arg_string(self, pipeline_name, sample): return argstring - def get_attribute(self, pipeline_name, attribute_key): - """ Return value of given attribute for named pipeline. """ + def get_attribute(self, pipeline_name, attribute_key, path_as_list=True): + """ + Return the value of the named attribute for the pipeline indicated. + + :param str pipeline_name: name of the pipeline of interest + :param str attribute_key: name of the pipeline attribute of interest + :param bool path_as_list: whether to ensure that a string attribute + is returned as a list; this is useful for safe iteration over + the returned value. 
+ """ config = self._select_pipeline(pipeline_name) - try: - value = config[attribute_key] - except KeyError: - value = None - return [value] if isinstance(value, str) else value + value = config.get(attribute_key) + return [value] if isinstance(value, str) and path_as_list else value def get_pipeline_name(self, pipeline): @@ -2024,190 +2404,236 @@ def _select_pipeline(self, pipeline_name): -@copy -class InterfaceManager(object): - """ Manage pipeline use for multiple locations and protocols. +class ProtocolInterface(object): + """ PipelineInterface and ProtocolMapper for a single pipelines location. - This is done by aggregating protocol interface instances, - allowing one Project to use pipelines from multiple locations. + This class facilitates use of pipelines from multiple locations by a + single project. Also stored are path attributes with information about + the location(s) from which the PipelineInterface and ProtocolMapper came. - :param pipeline_dirs: locations containing pipelines and configuration - information; specifically, a directory with a 'pipelines' folder and - a 'config' folder, within which there is a pipeline interface file - and a protocol mappings file. - :type pipeline_dirs: Iterable[str] + :param interface_data_source: location (e.g., code repository) of pipelines + :type interface_data_source: str """ - def __init__(self, pipeline_dirs): - # Collect interface/mappings pairs by protocol name. - interfaces_and_protocols = \ - [ProtocolInterfaces(pipedir) for pipedir in pipeline_dirs] - self.ifproto_by_proto_name = defaultdict(list) - for ifproto in interfaces_and_protocols: - for proto_name in ifproto.protomap: - _LOGGER.debug("Protocol name: {}".format(proto_name)) - self.ifproto_by_proto_name[proto_name].append(ifproto) + SUBTYPE_MAPPING_SECTION = "sample_subtypes" - def build_pipelines(self, protocol_name, priority=True): - """ - Build up a sequence of scripts to execute for this protocol. - :param str protocol_name: name for the protocol for which to build - pipelines - :param bool priority: should only the top priority mapping be used? - :return Sequence[(PipelineInterface, str, str)]: sequence of jobs - (script paths) to execute for the given protocol; if priority - flag is set (as is the default), this is a single-element list, - the sequence of jobs built is interpreted as descending priority - """ + def __init__(self, interface_data_source): + super(ProtocolInterface, self).__init__() - try: - ifprotos = self.ifproto_by_proto_name[protocol_name] - except KeyError: - _LOGGER.warn("Unknown protocol: '{}'".format(protocol_name)) - return [] + if isinstance(interface_data_source, Mapping): + # TODO: for implementation, we need to determine pipelines_path. + raise NotImplementedError( + "Raw Mapping as source of {} data is not yet supported". + format(self.__class__.__name__)) + _LOGGER.debug("Creating %s from raw Mapping", + self.__class__.__name__) + self.source = None + self.pipe_iface_path = None + for name, value in self._parse_iface_data(interface_data_source): + setattr(self, name, value) - jobs = [] - pipeline_keys_used = set() - _LOGGER.debug("Building pipelines for {} PIs...".format(len(ifprotos))) - for ifproto in ifprotos: - try: - this_protocol_pipelines = \ - ifproto.protomap.mappings[protocol_name] - except KeyError: - _LOGGER.debug("Protocol {} missing mapping in '{}'". - format(protocol_name, ifproto.protomaps_path)) - else: - # TODO: update once dependency-encoding logic is in place. 
- _LOGGER.debug("Protocol: {}".format(protocol_name)) - pipeline_keys = this_protocol_pipelines.replace(";", ",")\ - .strip(" ()\n")\ - .split(",") - pipeline_keys = [pk.strip() for pk in pipeline_keys] - already_mapped, new_scripts = \ - partition(pipeline_keys, - partial(_is_member, items=pipeline_keys_used)) - pipeline_keys_used |= set(pipeline_keys) - - if len(pipeline_keys) != (len(already_mapped) + len(new_scripts)): - _LOGGER.error("{} --> {} + {}".format( - pipeline_keys, already_mapped, new_scripts)) - - raise RuntimeError( - "Partitioned {} script names into allegedly " - "disjoint sets of {} and {} elements.". - format(len(pipeline_keys), - len(already_mapped), - len(new_scripts))) - - _LOGGER.debug("Skipping {} already-mapped script names: {}". - format(len(already_mapped), - ", ".join(already_mapped))) - _LOGGER.debug("{} new scripts for protocol {} from " - "pipelines warehouse '{}': {}". - format(len(new_scripts), protocol_name, - ifproto.pipedir, ", ".join(new_scripts))) - - jobs.append([(ifproto.interface, ) + - ifproto.pipeline_key_to_path(pipeline_key) - for pipeline_key in pipeline_keys]) - - return jobs[0] if priority and len(jobs) > 1 else list(itertools.chain(*jobs)) + elif _os.path.isfile(interface_data_source): + # Secondary version that passes combined yaml file directly, + # instead of relying on separate hard-coded config names. + _LOGGER.debug("Creating %s from file: '%s'", + self.__class__.__name__, interface_data_source) + self.source = interface_data_source + self.pipe_iface_path = self.source + self.pipelines_path = _os.path.dirname(self.source) + + with open(interface_data_source, 'r') as interface_file: + iface = yaml.load(interface_file) + for name, value in self._parse_iface_data(iface): + setattr(self, name, value) + elif _os.path.isdir(interface_data_source): + _LOGGER.debug("Creating %s from files in directory: '%s'", + self.__class__.__name__, interface_data_source) + self.source = interface_data_source + self.pipe_iface_path = _os.path.join( + self.source, "config", "pipeline_interface.yaml") + self.pipelines_path = _os.path.join(self.source, "pipelines") + self.pipe_iface = PipelineInterface(self.pipe_iface_path) + self.protomap = ProtocolMapper(_os.path.join( + self.source, "config", "protocol_mappings.yaml")) -def _is_member(item, items): - return item in items + else: + raise ValueError("Alleged pipelines location '{}' exists neither " + "as a file nor as a folder.". + format(interface_data_source)) + def __repr__(self): + return "ProtocolInterface from '{}'".format(self.source or "Mapping") -# TODO: rename. -class ProtocolInterfaces: - """ PipelineInterface and ProtocolMapper for a single pipelines location. - Instances of this class are used by InterfaceManager to facilitate - multi-location pipelines use by a single project. Here also are stored - path attributes to retain information about the location from which the - interface and mapper came. + def fetch_pipelines(self, protocol): + """ + Fetch the mapping for a particular protocol, null if unmapped. 
- :param pipedir: location (e.g., code repository) of pipelines - :type pipedir: str + :param str protocol: name/key for the protocol for which to fetch the + pipeline(s) + :return str | Iterable[str] | NoneType: pipeline(s) to which the given + protocol is mapped, otherwise null + """ + return self.protomap.mappings.get(alpha_cased(protocol)) - """ - def __init__(self, pipedir): - if _os.path.isdir(pipedir): - self.pipedir = pipedir - self.config_path = _os.path.join(pipedir, "config") - self.interface_path = _os.path.join(self.config_path, - "pipeline_interface.yaml") - self.protomaps_path = _os.path.join(self.config_path, - "protocol_mappings.yaml") - self.interface = PipelineInterface(self.interface_path) - self.protomap = ProtocolMapper(self.protomaps_path) - self.pipelines_path = _os.path.join(pipedir, "pipelines") - elif _os.path.isfile(pipedir): - # Secondary version that passes combined yaml file directly, - # instead of relying on separate hard-coded config names as above - self.pipedir = None - self.interface_file = pipedir - self.pipelines_path = _os.path.dirname(pipedir) + def fetch_sample_subtype( + self, protocol, strict_pipe_key, full_pipe_path): + """ + Determine the interface and Sample subtype for a protocol and pipeline. - with open(self.interface_file, 'r') as interface_file: - iface = yaml.load(interface_file) - try: - if "protocol_mapping" in iface: - self.protomap = ProtocolMapper(iface["protocol_mapping"]) - else: - raise Exception("pipeline_interface file is missing " - "a 'protocol_mapping' section.") - if "pipelines" in iface: - self.interface = PipelineInterface(iface["pipelines"]) - else: - raise Exception("pipeline_interface file is missing " - "a 'pipelines' section.") - except Exception as e: - _LOGGER.error(str(iface)) - raise e + :param str protocol: name of the relevant protocol + :param str strict_pipe_key: key for specific pipeline in a pipeline + interface mapping declaration; this must exactly match a key in + the PipelineInterface (or the Mapping that represent it) + :param str full_pipe_path: (absolute, expanded) path to the + pipeline script + :return type: Sample subtype to use for jobs for the given protocol, + that use the pipeline indicated + :raises KeyError: if given a pipeline key that's not mapped in this + ProtocolInterface instance's PipelineInterface + """ + subtype = None - def pipeline_key_to_path(self, pipeline_key): - """ - Given a pipeline_key, return the path to the script for that pipeline - specified in this pipeline interface config file. + this_pipeline_data = self.pipe_iface[strict_pipe_key] + + try: + subtypes = this_pipeline_data[self.SUBTYPE_MAPPING_SECTION] + except KeyError: + _LOGGER.debug("%s from '%s' doesn't define section '%s' " + "for pipeline '%s'", + self.pipe_iface.__class__.__name__, self.source, + self.SUBTYPE_MAPPING_SECTION, strict_pipe_key) + # Without a subtypes section, if pipeline module defines a single + # Sample subtype, we'll assume that type is to be used when in + # this case, when the interface section for this pipeline lacks + # an explicit subtypes section specification. + subtype_name = None + else: + if subtypes is None: + # Designate lack of need for import attempt and provide + # class with name to format message below. 
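A small sketch of how fetch_pipelines tolerates case and punctuation differences in protocol names, using the same kind of alpha_cased normalization defined in looper/utils.py later in this diff. The protocol-to-pipeline mappings below are invented for illustration.

def alpha_cased(text, lower=False):
    # Keep letters only and homogenize case, as in looper.utils.alpha_cased.
    text = "".join(filter(lambda c: c.isalpha(), text))
    return text.lower() if lower else text.upper()

# Hypothetical protocol_mapping section, keyed after normalization.
mappings = {alpha_cased(k): v for k, v in {
    "RRBS": "rrbs.py",
    "ATAC-seq": "atacseq.py",
}.items()}

def fetch_pipelines(protocol):
    # A null result means the protocol has no mapping, as in ProtocolInterface.
    return mappings.get(alpha_cased(protocol))

print(fetch_pipelines("atacseq"))    # 'atacseq.py'  ("ATAC-seq" matches "atacseq")
print(fetch_pipelines("RRBS"))       # 'rrbs.py'
print(fetch_pipelines("ChIP-seq"))   # None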
+ subtype = Sample + _LOGGER.debug("Null %s subtype(s) section specified for " + "pipeline: '%s'; using base %s type", + subtype.__name__, strict_pipe_key, + subtype.__name__) + elif isinstance(subtypes, str): + subtype_name = subtypes + _LOGGER.debug("Single subtype name for pipeline '%s' " + "in interface from '%s': '%s'", subtype_name, + strict_pipe_key, self.source) + else: + temp_subtypes = { + alpha_cased(p): st for p, st in subtypes.items()} + try: + subtype_name = temp_subtypes[alpha_cased(protocol)] + except KeyError: + # Designate lack of need for import attempt and provide + # class with name to format message below. + subtype = Sample + _LOGGER.debug("No %s subtype specified in interface from " + "'%s': '%s', '%s'; known: %s", + subtype.__name__, self.source, + strict_pipe_key, protocol, + ", ".join(temp_subtypes.keys())) + + # subtype_name is defined if and only if subtype remained null. + subtype = subtype or \ + _import_sample_subtype(full_pipe_path, subtype_name) or \ + Sample + _LOGGER.debug("Using Sample subtype: %s", subtype.__name__) + return subtype + + + def finalize_pipeline_key_and_paths(self, pipeline_key): + """ + Determine pipeline's full path, arguments, and strict key. + + This handles multiple ways in which to refer to a pipeline (by key) + within the mapping that contains the data that defines a + PipelineInterface. It also ensures proper handling of the path to the + pipeline (i.e., ensuring that it's absolute), and that the text for + the arguments are appropriately dealt parsed and passed. :param str pipeline_key: the key in the pipeline interface file used for the protocol_mappings section. Previously was the script name. - :return (str, str): more restrictive version of input key, along with - absolute path for pipeline script. + :return (str, str, str): more precise version of input key, along with + absolute path for pipeline script, and full script path + options """ - # key may contain extra command-line flags; split key from flags. + # The key may contain extra command-line flags; split key from flags. 
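The sample_subtypes lookup sketched above admits three shapes. The YAML fragment below (embedded in Python only to keep the examples in one language) is a hypothetical pipelines section showing all three; the pipeline and class names are illustrative, not from looper's shipped interfaces.

import yaml

pipelines_section = yaml.safe_load("""
wgbs.py:
  sample_subtypes: WGBSSample      # one subtype name used for every protocol
rrbs.py:
  sample_subtypes:                 # per-protocol subtype names
    RRBS: RRBSSample
    EG: EGSample
atacseq.py:
  sample_subtypes: null            # explicit null: fall back to base Sample
""")

# For a (pipeline, protocol) pair, this mirrors the resolution order in
# fetch_sample_subtype: missing section or null -> base Sample; a string ->
# that class name; a mapping -> look up by normalized protocol name.
print(pipelines_section["rrbs.py"]["sample_subtypes"]["RRBS"])   # 'RRBSSample'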
+ # The strict key is the script name itself, something like "ATACseq.py" strict_pipeline_key, _, pipeline_key_args = pipeline_key.partition(' ') - if self.interface.get_attribute(strict_pipeline_key, "path"): - script_path_only = self.interface.get_attribute( - strict_pipeline_key, "path")[0] - script_path_with_flags = " ".join([script_path_only, pipeline_key_args]) + if self.pipe_iface.get_attribute(strict_pipeline_key, "path"): + script_path_only = self.pipe_iface.get_attribute( + strict_pipeline_key, "path")[0].strip() + script_path_with_flags = \ + " ".join([script_path_only, pipeline_key_args]) else: # backwards compatibility w/ v0.5 script_path_only = strict_pipeline_key script_path_with_flags = pipeline_key - if _os.path.isabs(script_path_only): - if not _os.path.exists(script_path_only.strip()): - _LOGGER.warn("Missing script command: '{}'".format(script_path_only)) - return strict_pipeline_key, script_path_with_flags - else: - abs_script_path_only = _os.path.join(self.pipelines_path, script_path_only) - abs_script_path_with_flags = _os.path.join(self.pipelines_path, script_path_with_flags) + if not _os.path.isabs(script_path_only): + _LOGGER.log(5, "Expanding non-absolute script path: '%s'", + script_path_only) + script_path_only = _os.path.join( + self.pipelines_path, script_path_only) + _LOGGER.log(5, "Absolute script path: '%s'", script_path_only) + script_path_with_flags = _os.path.join( + self.pipelines_path, script_path_with_flags) + _LOGGER.log(5, "Absolute script path with flags: '%s'", + script_path_with_flags) + if not _os.path.exists(script_path_only): + _LOGGER.warn( + "Missing pipeline script: '%s'", script_path_only) + + return strict_pipeline_key, script_path_only, script_path_with_flags + + + @classmethod + def _parse_iface_data(cls, pipe_iface_data): + """ + Parse data from mappings to set instance attributes. + + The data that define a ProtocolInterface are a "protocol_mapping" + Mapping and a "pipelines" Mapping, which are used to create a + ProtocolMapper and a PipelineInterface, representing the configuration + data for pipeline(s) from a single location. There are a couple of + different ways (file, folder, and eventually, raw Mapping) to provide + this data, and this function provides some standardization to how + those data are processed, independent of input type/format. + + :param Mapping[str, Mapping] pipe_iface_data: mapping from section + name to section data mapping; more specifically, the protocol + mappings Mapping and the PipelineInterface mapping + :return list[(str, ProtocolMapper | PipelineInterface)]: pairs of + attribute name for the ProtocolInterface being created, and the + value for that attribute, + """ + assignments = [("protocol_mapping", ProtocolMapper, "protomap"), + ("pipelines", PipelineInterface, "pipe_iface")] + attribute_values = [] + for section_name, data_type, attr_name in assignments: + try: + data = pipe_iface_data[section_name] + except KeyError: + _LOGGER.error("Error creating %s from data: %s", + cls.__name__, str(pipe_iface_data)) + raise Exception("PipelineInterface file lacks section: '{}'". + format(section_name)) + attribute_values.append((attr_name, data_type(data))) + return attribute_values - if not _os.path.isfile(abs_script_path_only.strip()): - _LOGGER.warn("Missing script command: '{}'". 
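As a standalone sketch of the key/flag splitting and path expansion that finalize_pipeline_key_and_paths performs (without the logging, existence warnings, or the PipelineInterface "path" lookup), under invented paths:

import os

def finalize_key_and_paths(pipeline_key, pipelines_path):
    # Split the strict key (the script name) from any trailing flags.
    strict_key, _, flags = pipeline_key.partition(" ")
    script = strict_key
    # Relative script paths are resolved against the pipelines folder.
    if not os.path.isabs(script):
        script = os.path.join(pipelines_path, script)
    script_with_flags = " ".join([script, flags]).strip()
    return strict_key, script, script_with_flags

print(finalize_key_and_paths("rrbs.py --use-epilog", "/opt/pipelines"))
# ('rrbs.py', '/opt/pipelines/rrbs.py', '/opt/pipelines/rrbs.py --use-epilog')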
- format(abs_script_path_only)) - return strict_pipeline_key, abs_script_path_with_flags @copy @@ -2222,15 +2648,32 @@ class ProtocolMapper(Mapping): """ def __init__(self, mappings_input): if isinstance(mappings_input, Mapping): - # Pre-parsed mappings data - self.mappings_file = None mappings = mappings_input + self.filepath = None else: # Parse file mapping protocols to pipeline(s). - self.mappings_file = mappings_input - with open(self.mappings_file, 'r') as mapfile: + with open(mappings_input, 'r') as mapfile: mappings = yaml.load(mapfile) - self.mappings = {k.upper(): v for k, v in mappings.items()} + self.filepath = mappings_input + self.mappings = {alpha_cased(k): v for k, v in mappings.items()} + + + def __getitem__(self, protocol_name): + return self.mappings[protocol_name] + + def __iter__(self): + return iter(self.mappings) + + def __len__(self): + return len(self.mappings) + + + def __repr__(self): + source = self.filepath or "mapping" + num_protocols = len(self.mappings) + protocols = ", ".join(self.mappings.keys()) + return "{} from {}, with {} protocol(s): {}".format( + self.__class__.__name__, source, num_protocols, protocols) def build_pipeline(self, protocol): @@ -2261,7 +2704,7 @@ def build_pipeline(self, protocol): self.parse_parallel_jobs(split_jobs[i], split_jobs[i - 1]) """ - # TODO: incorporate into the InterfaceManager? + def parse_parallel_jobs(self, job, dep): job = job.replace("(", "").replace(")", "") split_jobs = [x.strip() for x in job.split(',')] @@ -2271,24 +2714,11 @@ def parse_parallel_jobs(self, job, dep): else: self.register_job(job, dep) - # TODO: incorporate into InterfaceManager? + def register_job(self, job, dep): _LOGGER.info("Register Job Name: %s\tDep: %s", str(job), str(dep)) - def __getitem__(self, item): - return self.mappings[item] - - def __iter__(self): - return iter(self.mappings) - - def __len__(self): - return len(self.mappings) - - def __repr__(self): - return repr(self.__dict__) - - class _InvalidResourceSpecificationException(Exception): """ Pipeline interface resources--if present--needs default. """ @@ -2335,3 +2765,122 @@ class _MissingPipelineConfigurationException(Exception): """ A selected pipeline needs configuration data. """ def __init__(self, pipeline): super(_MissingPipelineConfigurationException, self).__init__(pipeline) + + + +def _import_sample_subtype(pipeline_filepath, subtype_name=None): + """ + Import a particular Sample subclass from a Python module. + + :param str pipeline_filepath: path to file to regard as Python module + :param str subtype_name: name of the target class (which must derive from + the base Sample class in order for it to be used), optional; if + unspecified, if the module defines a single subtype, then that will + be used; otherwise, the base Sample type will be used. + :return type: the imported class, defaulting to base Sample in case of + failure with the import or other logic + """ + base_type = Sample + + try: + _LOGGER.debug("Attempting to import module defined by {}". + format(pipeline_filepath)) + + # TODO: consider more fine-grained control here. What if verbose + # TODO: logging is only to file, not to stdout/err? + + # Redirect standard streams during the import to prevent noisy + # error messaging in the shell that may distract or confuse a user. 
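ProtocolMapper now behaves as a read-only Mapping over normalized protocol names. This stripped-down stand-in (not the real class) shows the dict-backed behavior it exposes through __getitem__, __iter__, and __len__, with invented mappings.

try:
    from collections.abc import Mapping   # Python 3
except ImportError:
    from collections import Mapping       # Python 2

class MiniProtocolMapper(Mapping):
    # Dict-backed, keyed by letters-only, uppercased protocol names.
    def __init__(self, mappings):
        self.mappings = {
            "".join(c for c in k if c.isalpha()).upper(): v
            for k, v in mappings.items()}

    def __getitem__(self, protocol_name):
        return self.mappings[protocol_name]

    def __iter__(self):
        return iter(self.mappings)

    def __len__(self):
        return len(self.mappings)

pm = MiniProtocolMapper({"ATAC-seq": "atacseq.py", "RRBS": "rrbs.py"})
print(len(pm), "ATACSEQ" in pm, dict(pm))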
+ if _LOGGER.getEffectiveLevel() > logging.DEBUG: + with open(_os.devnull, 'w') as temp_standard_streams: + with standard_stream_redirector(temp_standard_streams): + pipeline_module = import_from_source(pipeline_filepath) + else: + pipeline_module = import_from_source(pipeline_filepath) + + except SystemExit: + # SystemExit would be caught as BaseException, but SystemExit is + # particularly suggestive of an a script without a conditional + # check on __main__, and as such warrant a tailored message. + _LOGGER.warn("'%s' appears to attempt to run on import; " + "does it lack a conditional on '__main__'? " + "Using base type: %s", + pipeline_filepath, base_type.__name__) + return base_type + + except (BaseException, Exception) as e: + _LOGGER.warn("Using base %s because of failure in attempt to " + "import pipeline module '%s': %r", + base_type.__name__, pipeline_filepath, e) + return base_type + + else: + _LOGGER.debug("Successfully imported pipeline module '%s', " + "naming it '%s'", pipeline_filepath, + pipeline_module.__name__) + + def class_names(cs): + return ", ".join([c.__name__ for c in cs]) + + # Find classes from pipeline module and determine which derive from Sample. + classes = _fetch_classes(pipeline_module) + _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) + + # Base Sample could be imported; we want the true subtypes. + proper_subtypes = _proper_subtypes(classes, base_type) + _LOGGER.debug("%d proper %s subtype(s): %s", len(proper_subtypes), + base_type.__name__, class_names(proper_subtypes)) + + # Determine course of action based on subtype request and number found. + if not subtype_name: + _LOGGER.debug("No specific subtype is requested from '%s'", + pipeline_filepath) + if len(proper_subtypes) == 1: + # No specific request and single subtype --> use single subtype. + subtype = proper_subtypes[0] + _LOGGER.debug("Single %s subtype found in '%s': '%s'", + base_type.__name__, pipeline_filepath, + subtype.__name__) + return subtype + else: + # We can't arbitrarily select from among 0 or multiple subtypes. + _LOGGER.debug("%s subtype cannot be selected from %d found in " + "'%s'; using base type", base_type.__name__, + len(proper_subtypes), pipeline_filepath) + return base_type + else: + # Specific subtype request --> look for match. + for st in proper_subtypes: + if st.__name__ == subtype_name: + _LOGGER.debug("Successfully imported %s from '%s'", + subtype_name, pipeline_filepath) + return st + raise ValueError( + "'{}' matches none of the {} {} subtype(s) defined " + "in '{}': {}".format(subtype_name, len(proper_subtypes), + base_type.__name__, pipeline_filepath, + class_names(proper_subtypes))) + + + +def _fetch_classes(mod): + """ Return the classes defined in a module. """ + try: + _, classes = zip(*inspect.getmembers( + mod, lambda o: inspect.isclass(o))) + except ValueError: + return [] + return list(classes) + + + +def _proper_subtypes(types, supertype): + """ Determine the proper subtypes of a supertype. """ + return list(filter( + lambda t: issubclass(t, supertype) and t != supertype, types)) + + + +def _is_member(item, items): + """ Determine whether an iterm is a member of a collection. 
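The subtype discovery above boils down to two inspect-based helpers. Here is the same pattern in a self-contained form, with a throwaway in-memory module standing in for an imported pipeline script; the Sample base class below is a local stand-in, not looper's Sample.

import inspect
import types

# A throwaway module standing in for a dynamically imported pipeline module.
mod = types.ModuleType("fake_pipeline")
exec("class Sample(object): pass\n"
     "class RRBSSample(Sample): pass\n"
     "CONSTANT = 42\n", mod.__dict__)

def fetch_classes(module):
    # Collect every class defined in (or imported into) the module.
    return [obj for _, obj in inspect.getmembers(module, inspect.isclass)]

def proper_subtypes(types_found, supertype):
    # Keep strict subclasses only; the supertype itself is excluded.
    return [t for t in types_found if issubclass(t, supertype) and t is not supertype]

classes = fetch_classes(mod)
subtypes = proper_subtypes(classes, mod.Sample)
print([c.__name__ for c in classes])   # ['RRBSSample', 'Sample']
print([c.__name__ for c in subtypes])  # ['RRBSSample']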
""" + return item in items diff --git a/looper/utils.py b/looper/utils.py index d092dc88..6f74ef5f 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -2,8 +2,11 @@ from argparse import ArgumentParser from collections import Counter, defaultdict, Iterable +import contextlib import logging import os +import random +import string import subprocess as sp import yaml from ._version import __version__ @@ -21,22 +24,170 @@ def format_help(self): +def alpha_cased(text, lower=False): + """ + Filter text to just letters and homogenize case. + + :param str text: what to filter and homogenize. + :param bool lower: whether to convert to lowercase; default uppercase. + :return str: input filtered to just letters, with homogenized case. + """ + text = "".join(filter(lambda c: c.isalpha(), text)) + return text.lower() if lower else text.upper() + + + +def check_bam(bam, o): + """ + Check reads in BAM file for read type and lengths. + + :param str bam: BAM file path. + :param int o: Number of reads to look at for estimation. + """ + try: + p = sp.Popen(['samtools', 'view', bam], stdout=sp.PIPE) + # Count paired alignments + paired = 0 + read_length = Counter() + while o > 0: # Count down number of lines + line = p.stdout.readline().decode().split("\t") + flag = int(line[1]) + read_length[len(line[9])] += 1 + if 1 & flag: # check decimal flag contains 1 (paired) + paired += 1 + o -= 1 + p.kill() + except OSError: + reason = "Note (samtools not in path): For NGS inputs, " \ + "looper needs samtools to auto-populate " \ + "'read_length' and 'read_type' attributes; " \ + "these attributes were not populated." + raise OSError(reason) + + _LOGGER.debug("Read lengths: {}".format(read_length)) + _LOGGER.debug("paired: {}".format(paired)) + return read_length, paired + + + +def check_fastq(fastq, o): + raise NotImplementedError("Detection of read type/length for " + "fastq input is not yet implemented.") + + + +def expandpath(path): + """ + Expand a filesystem path that may or may not contain user/env vars. + + :param str path: path to expand + :return str: expanded version of input path + """ + return os.path.expandvars(os.path.expanduser(path)).replace("//", "/") + + + def fetch_package_classes(pkg, predicate=None): """ Enable single-depth fetch of package's classes if not exported. - + :param module pkg: the package of interest. - :param function(type) -> bool predicate: condition each class must + :param function(type) -> bool predicate: condition each class must satisfy in order to be returned. - :return Iterable(type): classes one layer deep within the package, that + :return Iterable(type): classes one layer deep within the package, that satisfy the condition if given. """ import inspect import itertools + + modules = [pkg] if inspect.ismodule(pkg) else \ + [obj for obj in inspect.getmembers( + pkg, lambda member: inspect.ismodule(member))] return list(itertools.chain( - *[inspect.getmembers(mod, predicate) - for mod in inspect.getmembers( - pkg, lambda obj: inspect.ismodule(obj))])) + *[inspect.getmembers(mod, predicate) for mod in modules])) + + + +def get_file_size(filename): + """ + Get size of all files in gigabytes (Gb). + + :param str | collections.Iterable[str] filename: A space-separated + string or list of space-separated strings of absolute file paths. + :return float: size of file(s), in gigabytes. 
+ """ + if filename is None: + return float(0) + if type(filename) is list: + return float(sum([get_file_size(x) for x in filename])) + try: + total_bytes = sum([float(os.stat(f).st_size) + for f in filename.split(" ") if f is not '']) + except OSError: + # File not found + return 0.0 + else: + return float(total_bytes) / (1024 ** 3) + + + +def import_from_source(module_filepath): + """ + Import a module from a particular filesystem location. + + :param str module_filepath: path to the file that constitutes the module + to import + :return module: module imported from the given location, named as indicated + :raises ValueError: if path provided does not point to an extant file + """ + import sys + + if not os.path.exists(module_filepath): + raise ValueError("Path to alleged module file doesn't point to an " + "extant file: '{}'".format(module_filepath)) + + # Randomly generate module name. + fname_chars = string.ascii_letters + string.digits + name = "".join(random.choice(fname_chars) for _ in range(20)) + + # Import logic is version-dependent. + if sys.version_info >= (3, 5): + from importlib import util as _il_util + modspec = _il_util.spec_from_file_location( + name, module_filepath) + mod = _il_util.module_from_spec(modspec) + modspec.loader.exec_module(mod) + elif sys.version_info < (3, 3): + import imp + mod = imp.load_source(name, module_filepath) + else: + # 3.3 or 3.4 + from importlib import machinery as _il_mach + loader = _il_mach.SourceFileLoader(name, module_filepath) + mod = loader.load_module() + + return mod + + + +def parse_ftype(input_file): + """ + Checks determine filetype from extension. + + :param str input_file: String to check. + :return str: filetype (extension without dot prefix) + :raises TypeError: if file does not appear of a supported type + """ + if input_file.endswith(".bam"): + return "bam" + elif input_file.endswith(".fastq") or \ + input_file.endswith(".fq") or \ + input_file.endswith(".fq.gz") or \ + input_file.endswith(".fastq.gz"): + return "fastq" + else: + raise TypeError("Type of input file ends in neither '.bam' " + "nor '.fastq' [file: '" + input_file + "']") @@ -82,33 +233,42 @@ def partition(items, test): assume that the argument is not terribly large and that the function is cheap to compute and use a simpler single-pass approach. - :param collections.Iterable[object] items: items to partition + :param Sized[object] items: items to partition :param function(object) -> bool test: test to apply to each item to perform the partitioning procedure :return: list[object], list[object]: partitioned items sequences """ passes, fails = [], [] - _LOGGER.debug("Testing {} items: {}".format(len(items), items)) + _LOGGER.log(5, "Testing {} items: {}".format(len(items), items)) for item in items: - _LOGGER.debug("Testing item {}".format(item)) + _LOGGER.log(5, "Testing item {}".format(item)) group = passes if test(item) else fails group.append(item) return passes, fails -# TODO: -# It appears that this isn't currently used. -# It could be included as a validation stage in Project instantiation. -# If Project instance being validated lacked specific relevant -# configuration section the call here would either need to be skipped, -# or this would need to pass in such a scenario. That would not be -# a challenge, but it just needs to be noted. +@contextlib.contextmanager +def standard_stream_redirector(stream): + """ + Temporarily redirect stdout and stderr to another stream. 
+ + This can be useful for capturing messages for easier inspection, or + for rerouting and essentially ignoring them, with the destination as + something like an opened os.devnull. + + :param FileIO[str] stream: temporary proxy for standard streams + """ + import sys + genuine_stdout, genuine_stderr = sys.stdout, sys.stderr + sys.stdout, sys.stderr = stream, stream + try: + yield + finally: + sys.stdout, sys.stderr = genuine_stdout, genuine_stderr + + -# TODO: -# Test this with additional pipeline config file, -# pointed to in relevant section of project config file: -# http://looper.readthedocs.io/en/latest/define-your-project.html#project-config-section-pipeline-config class CommandChecker(object): """ Validate PATH availability of executables referenced by a config file. @@ -124,8 +284,10 @@ class CommandChecker(object): :param sections_to_skip: analogous to the check names parameter, but for specific sections to skip. :type sections_to_skip: Iterable[str] - + """ + + def __init__(self, path_conf_file, sections_to_check=None, sections_to_skip=None): @@ -142,9 +304,9 @@ def __init__(self, path_conf_file, # Determine which sections to validate. sections = {sections_to_check} if isinstance(sections_to_check, str) \ - else set(sections_to_check or conf_data.keys()) + else set(sections_to_check or conf_data.keys()) excl = {sections_to_skip} if isinstance(sections_to_skip, str) \ - else set(sections_to_skip or []) + else set(sections_to_skip or []) sections -= excl self._logger.info("Validating %d sections: %s", @@ -154,8 +316,8 @@ def __init__(self, path_conf_file, # Store per-command mapping of status, nested under section. self.section_to_status_by_command = defaultdict(dict) # Store only information about the failures. - self.failures_by_section = defaultdict(list) # Access by section. - self.failures = set() # Access by command. + self.failures_by_section = defaultdict(list) # Access by section. + self.failures = set() # Access by command. for s in sections: # Fetch section data or skip. @@ -244,86 +406,3 @@ def is_command_callable(command, name=""): _LOGGER.debug("Command{0}is not callable: {1}". format(alias_value, command)) return not bool(code) - - - -def parse_ftype(input_file): - """ - Checks determine filetype from extension. - - :param str input_file: String to check. - :return str: filetype (extension without dot prefix) - :raises TypeError: if file does not appear of a supported type - """ - if input_file.endswith(".bam"): - return "bam" - elif input_file.endswith(".fastq") or \ - input_file.endswith(".fq") or \ - input_file.endswith(".fq.gz") or \ - input_file.endswith(".fastq.gz"): - return "fastq" - else: - raise TypeError("Type of input file ends in neither '.bam' " - "nor '.fastq' [file: '" + input_file + "']") - - - -def check_bam(bam, o): - """ - Check reads in BAM file for read type and lengths. - - :param str bam: BAM file path. - :param int o: Number of reads to look at for estimation. - """ - try: - p = sp.Popen(['samtools', 'view', bam], stdout=sp.PIPE) - # Count paired alignments - paired = 0 - read_length = Counter() - while o > 0: # Count down number of lines - line = p.stdout.readline().decode().split("\t") - flag = int(line[1]) - read_length[len(line[9])] += 1 - if 1 & flag: # check decimal flag contains 1 (paired) - paired += 1 - o -= 1 - p.kill() - except OSError: - reason = "Note (samtools not in path): For NGS inputs, " \ - "looper needs samtools to auto-populate " \ - "'read_length' and 'read_type' attributes; " \ - "these attributes were not populated." 
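Usage of the stream redirector is straightforward. This sketch captures prints into a StringIO instead of os.devnull, re-implementing the context manager locally so the snippet stands alone; looper's own helper behaves the same way.

import contextlib
import io
import sys

@contextlib.contextmanager
def standard_stream_redirector(stream):
    # Same idea as the looper.utils helper: swap both std streams, restore on exit.
    genuine_stdout, genuine_stderr = sys.stdout, sys.stderr
    sys.stdout, sys.stderr = stream, stream
    try:
        yield
    finally:
        sys.stdout, sys.stderr = genuine_stdout, genuine_stderr

buffer = io.StringIO()
with standard_stream_redirector(buffer):
    print("noisy import-time output")
    sys.stderr.write("a warning\n")
print("captured: %r" % buffer.getvalue())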
- raise OSError(reason) - - _LOGGER.debug("Read lengths: {}".format(read_length)) - _LOGGER.debug("paired: {}".format(paired)) - return read_length, paired - - - -def check_fastq(fastq, o): - raise NotImplementedError("Detection of read type/length for " - "fastq input is not yet implemented.") - - - -def get_file_size(filename): - """ - Get size of all files in gigabytes (Gb). - - :param str | collections.Iterable[str] filename: A space-separated - string or list of space-separated strings of absolute file paths. - :return float: size of file(s), in gigabytes. - """ - if filename is None: - return float(0) - if type(filename) is list: - return float(sum([get_file_size(x) for x in filename])) - try: - total_bytes = sum([float(os.stat(f).st_size) - for f in filename.split(" ") if f is not '']) - except OSError: - # File not found - return 0.0 - else: - return float(total_bytes) / (1024 ** 3) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index aa459e2c..cc4117e1 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,3 +1,3 @@ colorama==0.3.9 -pandas==0.20.1 +pandas==0.20.2 pyyaml==3.12 diff --git a/scripts/cleanFailed.sh b/scripts/cleanFailed.sh deleted file mode 100755 index 49842138..00000000 --- a/scripts/cleanFailed.sh +++ /dev/null @@ -1,10 +0,0 @@ -# Deletes all directories with a failed flag -ls -d */*failed* - -read -p "Are you sure? " -n 1 -r -echo # (optional) move to a new line -if [[ $REPLY =~ ^[Yy]$ ]] -then - ls -d */*failed* | cut -d'/' -f1 | xargs rm -rfv -fi - diff --git a/scripts/convertBismarkReport.R b/scripts/convertBismarkReport.R deleted file mode 100755 index e2bc8da6..00000000 --- a/scripts/convertBismarkReport.R +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env Rscript -options(echo=FALSE) -library("data.table") -suppressPackageStartupMessages(library("optparse")) -#d <- fread("/data/groups/lab_bock/fhalbrit/projects/hema_precursors//results_pipeline//results_pipeline/MPP_10_D1_1_R1//bismark_hg38/extractor/MPP_10_D1_1_R1.aln.dedup.filt.CpG_report_filt.txt") - - -optionList <- list( - make_option( c("-i", "--input"), type="character", help="Input file. A Bismark CpG report (CHR START STRAND HITCOUNT MISSCOUNT DINUCLEOTIDE CONTEXT)"), - make_option( c("-f", "--formats"), type="character", default="cov,min", help="A comma-separated list of output formats. Supported formats are: cov (Bismark coverage file: CHR START END METHPERCENT HITCOUNT MISSCOUNT), min (minimal coverage file: CHR START HITS TOTAL). Default: cov,min"), - make_option( c("-c", "--noCovFilter"), default=FALSE,type="logical", action="store_true", help="Disable coverage filter. If not set, CpG's without any coverage will be removed"), - make_option( c("-s", "--noChromFilter"), default=FALSE, type="logical", action="store_true", help="Disable chromosome filter. If not set, non-standard chromosomes (everything with an underscore in the name) will be removed"), - make_option( c("-a", "--noAdjustMinusStrand"), default=FALSE, type="logical", action="store_true", help="Disable reverse strand adjustment. 
If not set, the coordiantes of all sites on the reverse strand (-) will be adjusted by subtracting 1") -) -opts <- parse_args(OptionParser(option_list=optionList)) - - -if (is.null(opts$input)) { - print_help(OptionParser(option_list=optionList)) - stop("No input file provided") -} else { - cpgReport <- opts$input - filterUncovered <- !opts$noCovFilter - removeNonStandardChroms <- !opts$noChromFilter - adjustMinusStrand <- !opts$noAdjustMinusStrand - outputFormats <- strsplit(tolower(opts$formats),",")[[1]] - - message("+ Starting to convert Bismark CpG report file: ", cpgReport) - - # read in data: - message("\tReading and modifying data...") - d <- fread(cpgReport) - setnames(d, paste0("V", 1:7), c("chr", "start", "strand", "hitCount", "missCount", "dinucleotide", "context")) - - # calculate total read count: - d[, readCount:=hitCount+missCount] - - # remove unnecessary columns: - d[, c("dinucleotide", "context", "missCount"):=NULL] - - # remove uncovered regions: - if(filterUncovered) { - message("\tRemove uncovered CpG's...") - d <- d[ readCount>0,] - } - - # adjust the coordinate of C's on the (-)-strand: - if(adjustMinusStrand) { - message("\tAdjusting reverse strand coordinates...") - d[strand=="-",start := as.integer(start-1)] - } - d[, strand:=NULL] - - # aggregate all regions with identical coordinates: - message("\tAggregating regions by coordinate...") - d <- d[,list(hitCount= sum(hitCount), readCount=sum(readCount)), by=list(chr, start)] - setcolorder(d,c("chr", "start", "hitCount", "readCount")); - - # remove non-standard chromosomes (_random, unintegrated contiqs, etc.) - if(removeNonStandardChroms) { - message("\tFiltering chromosomes...") - d <- d[ !grep("_",chr),]; - } - - # write output file(s): - for(outputFormat in outputFormats) { - outName <- paste0(gsub(".txt$", "", cpgReport, perl=TRUE, ignore.case=TRUE), ".", outputFormat) - if(outputFormat == "cov") { - message("\tWriting Bismark coverage format (CHR START END METHPERCENT HITCOUNT MISSCOUNT): ", outName) - d[, methPerc:= hitCount/readCount*100] - d[, missCount:= readCount-hitCount] - write.table(d[,list(chr,start,start,methPerc,hitCount,missCount)], file=outName, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE) - } - else if(outputFormat == "min") { - message("\tWriting minimal coverage output format (CHR START HITS TOTAL): ", outName) - write.table(d[,list(chr,start,hitCount,readCount)], file=outName, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE) - } - else { - warning("\tUnrecognized output format: ", outputFormat) - } - } - - message("+ Finished conversion: ", cpgReport) -} diff --git a/scripts/fastqcSummary.py b/scripts/fastqcSummary.py deleted file mode 100755 index b5649c2d..00000000 --- a/scripts/fastqcSummary.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python - -import os, inspect, ConfigParser, subprocess, sys, errno, glob, zipfile, csv -from argparse import ArgumentParser -#import csv -#import -#import glob -#import -#import re - -# constants: -nThreadsPerCpu = 4 -nMemPerThread = 1024 -nCpusSlurm = 8 -defaultRawDataPath = "/fhgfs/groups/lab_bsf/samples/" - -# parse user-supplied arguments: -parser = ArgumentParser(description='FASTQC') -parser.add_argument('-c', '--config-file', dest='confFile', help="Supply config file with [-c]. The path of the sample annotation sheet will be parsed from this. 
Example: /fhgfs/groups/lab_bock/shared/COREseq/config.txt") -parser.add_argument('-a', '--annot-file', dest='annotFile', help="Specify a sample annotation sheet directly") -parser.add_argument('-o', '--output-dir', dest='outputDir', help="Directory to write results to") -parser.add_argument('-f', '--fastqc', dest='fastqcPath', help="Full path of FASTQC exectuable", default="/cm/shared/apps/FastQC/0.11.2/fastqc") -parser.add_argument('-s', '--slurm', dest='useSlurm', action='store_true', help="Execute script on SLURM cluster.", default=False) -parser.add_argument('-q', '--quick-summary', dest='quickSummary', action='store_true', help="Skip FastQC, just write the summary report", default=False) -parser.add_argument('-p', '--parallel', dest='nCpus', help="Number of CPUs to use (going to start 4 threads per CPU)", default=1) -parser.add_argument('-d', '--raw-path', dest='rawPath', help="Raw data path") -args, remaining_args = parser.parse_known_args() - -# get input directory either directly as command line argument (highest priority) or from a config file: -annotFile = None -outputDir = None -rawDataPath = defaultRawDataPath -if args.annotFile: - annotFile = args.annotFile -elif args.confFile: - #get configurations - config = ConfigParser.ConfigParser({"results": None, "raw_data_path": defaultRawDataPath}) - config.readfp(open(os.path.abspath(args.confFile))) - annotFile = config.get("paths","psa") - if annotFile is None: - print "The config file provided does not define an annotation sheet (parameter name: 'psa')" - raise SystemExit - outputDir = config.get("paths","project_root") - if outputDir is not None: - outputDir = outputDir + "/fastqc" - rawDataPath = config.get("paths","raw_data_path") -else: - print "Supply either a config file (--config-file=X) or the full path of the annotation sheet (--annot-file=X)" - raise SystemExit - -# define relevant paths: -scriptPath = os.path.abspath(inspect.getfile(inspect.currentframe())) -fastqcPath = os.path.abspath(args.fastqcPath) -annotFile = os.path.abspath(annotFile) -if args.outputDir: - outputDir = args.outputDir -if outputDir is None: - print "No output directory specified (--output-dir=X)" - raise SystemExit -outputDir = os.path.abspath(args.outputDir) -if args.rawPath: - rawDataPath = args.rawPath -nCpus = args.nCpus - -# print some basic information: -print "FASTQC Summary" -print "----" -print "Full script path:\t" + scriptPath -print "Full FASTQC path:\t" + fastqcPath -print "Raw data root directory:\t" + rawDataPath -print "Sample sheet:\t" + annotFile -print "Output root directory:\t:" + outputDir -print "#CPUs:\t:" + str(nCpus) -print "#treads/CPU:\t:" + str(nThreadsPerCpu) -print "#mem/thread:\t:" + str(nMemPerThread) -print "----" - -# create results directory if it doesn't exist yet: -try: - os.makedirs(outputDir) -except OSError as exception: - if exception.errno != errno.EEXIST: - raise - -### MAIN JOB EXECUTION ### - -# if desired, submit the job for execution on the cluster: -if args.useSlurm: - slurmScript = outputDir + "/fastqc_slurm.sub" - slurmLog = outputDir + "/fastqc_slurm.log" - - with open(slurmScript, "w") as fout: - fout.write("#!/bin/bash\n") - fout.write("#SBATCH --job-name=fastqc\n") - fout.write("#SBATCH --mem-per-cpu=" + str(nThreadsPerCpu * nMemPerThread) + "\n") - fout.write("#SBATCH --cpus-per-task=" + str(nCpus) + "\n") - fout.write("#SBATCH -m block\n") - fout.write("#SBATCH --partition=mediumq\n") - fout.write("#SBATCH --time=24:00:00\n") - fout.write("#SBATCH --output " + slurmLog + "\n") - 
fout.write("echo 'Compute node:' `hostname`\n") - fout.write("echo 'Start time:' `date +'%Y-%m-%d %T'`\n") - fout.write("python " + scriptPath + " --raw-path=" + rawDataPath + " --annot-file=" + annotFile + " --parallel=" + str(nCpusSlurm) + " --output-dir=" + outputDir + "\n") - fout.write("echo 'End time:' `date +'%Y-%m-%d %T'`\n") - - subprocess.check_call(["sbatch", slurmScript]) - -# otherwise, just execute the command directly on the current machine: -# (this is what the SLURM-based execution mode will do once the job has been allocated to a specific node) -else: - # execute FastQC on all BAM files: - if not args.quickSummary: - subprocess.check_call([fastqcPath, "--version"]) - - bamFolders = {} - - with open(annotFile, "rb") as annotF: - annotDict = csv.DictReader(annotF) - for row in annotDict: - bamDir = rawDataPath + row["flowcell"] + "/" + row["flowcell"] + "_" + row["lane"] + "_samples/" - bamFile = bamDir + row["flowcell"] + "_" + row["lane"] + "#" + row["BSF_name"] + ".bam" - - if os.path.isfile(bamFile): - bamFolders[bamDir] = True - - for bamFolder in bamFolders: - subprocess.check_call(fastqcPath + " " + bamFolder+"/*.bam --threads="+str(int(nCpus) * nThreadsPerCpu) + " --noextract --outdir="+outputDir, shell=True) # N.B. can't use the proper syntax with an array for args, because FastQC cannot handle the quoted string ('...') as an input path name - - allKeys = {} - resultsMap = {} - zipSuffix = ".zip" - sep = "\t" - summaryFile = outputDir + "/summary.tsv" - - # collate summaries in one overview file: - print "Collecting summary statistics into: " + summaryFile - with open(summaryFile, "w") as fout: - for fastqcZip in glob.glob(outputDir + "/*"+zipSuffix): - curName = fastqcZip[len(outputDir+"/"):-len(zipSuffix)] - curMap = {} - #print curName - with zipfile.ZipFile(fastqcZip) as z: - with z.open(curName+"/summary.txt") as f: - for line in f: - tokens = line.split(sep) - curMap[tokens[1]] = tokens[0] - allKeys[tokens[1]] = True - resultsMap[curName] = curMap - - fout.write("Dataset" + sep + sep.join(allKeys.keys())+"\n") - for sample, curMap in resultsMap.items(): - fout.write(sample) - for k in allKeys: - fout.write(sep + curMap[k]) - fout.write("\n") - diff --git a/scripts/flagCheck.sh b/scripts/flagCheck.sh deleted file mode 100755 index 2468689b..00000000 --- a/scripts/flagCheck.sh +++ /dev/null @@ -1,23 +0,0 @@ -completed=`ls */*completed.flag 2> /dev/null | wc -l` -running=`ls */*running.flag 2> /dev/null | wc -l` -failed=`ls */*failed.flag 2> /dev/null | wc -l` -echo "completed: $completed" -echo "running: $running" -echo "failed: $failed" -ls */*.flag | xargs -n1 basename | sort | uniq -c - -if [ $failed -lt 30 ]; then -echo "List of failed flags:" -ls */*failed.flag 2> /dev/null -fi - -if [ $completed -lt 30 ]; then -echo "List of completed flags:" -ls */*completed.flag 2> /dev/null -fi - -if [ $running -lt 30 ]; then -echo "List of running flags:" -ls */*running.flag 2> /dev/null -fi - diff --git a/scripts/make_SummaryTable.py b/scripts/make_SummaryTable.py deleted file mode 100755 index 6ca15a6e..00000000 --- a/scripts/make_SummaryTable.py +++ /dev/null @@ -1,319 +0,0 @@ -#! 
/usr/bin/env python - -# This script loops through all the samples, -# creates a summary stats table -import csv -import os -from argparse import ArgumentParser -from pypiper import AttributeDict -import yaml - - -# Argument Parsing -# ####################################################################################### -parser = ArgumentParser(description='make_SummaryTable') -parser.add_argument('-c', '--config-file', dest='config_file', help="path to YAML config file", required=True, type=str) -parser.add_argument('--excel', dest='excel', action='store_true', help="generate extra XLS and XLSX sheet", default=False, required=False) -# Charles : On time the legacy/rigid mode will be removed -parser.add_argument('--rigid', dest='rigid', action='store_true', help="the legacy rigid mode that only takes in the hard-coded values", default=False, required=False) -args = parser.parse_args() - -with open(args.config_file, 'r') as config_file: - config_yaml = yaml.load(config_file) - config = AttributeDict(config_yaml, default=True) -paths = config.paths - - - -if not os.path.exists(paths.output_dir): - raise Exception(paths.output_dir + " : project directory does not exist!") - - -# FOR RIGID -# ####################################################################################### -fields_in = [] -fields_out = [] -if args.rigid: - # the hard-coded fields for the legacy/rigid mode - fields_in = ['sample_name','instrument_model','flowcell','lane','read_length','Read_type','organism','Genome'\ - ,'cell_type','Raw_reads','Trimmed_reads','Trimmed_rate','Aligned_reads','Aligned_rate'\ - ,'Multimap_reads','Multimap_rate','Unique_CpGs','Total_CpGs','meanCoverage',\ - 'bisulfiteConversionRate','globalMethylationMean',\ - 'K1_unmethylated_count','K1_unmethylated_meth','K3_methylated_count','K3_methylated_meth'] - fields_out = ['Sample','Instrument','Flowcell','Lane','Read Length','Read Type','Organism','Genome'\ - ,'Cell Type','Raw Reads','Trimmed Reads','Trimmed Rate','Aligned Reads','Aligned Rate'\ - ,'Multimap Reads','Multimap Rate','Unique CpGs','Total CpGs','Mean Coverage',\ - 'Bisulfite Conversion Rate',' Global Methylation Mean',\ - 'K1 Unmethylated Count','K1 Unmethylated Meth','K3 Methylated Count','K3 Methylated Meth'] - - -# Open samples CSV file -# ####################################################################################### -csv_file_path = os.path.join(os.path.dirname(args.config_file),config.metadata.sample_annotation) -print("\nOpening CSV file: " + csv_file_path) -if os.path.isfile(csv_file_path): - csv_file = open(os.path.join(os.path.dirname(args.config_file),config.metadata.sample_annotation), 'rb') - print("Found " + csv_file_path) -else: - raise Exception(csv_file_path + " : that file does not exist!") -csv_reader = csv.DictReader(csv_file) - - -# Looping over all samples -# ####################################################################################### -global_list = dict() -global_keys = dict() - -pipelines = [] -sample_count = 0 -column_count = 0 -print("\nStart iterating over samples") - -for row in csv_reader: - - sample_count += 1 - sample_name = row['sample_name'] - print("\n##### Processing sample #"+ str(sample_count) + " : " + sample_name + " #####") - - # wrap this all in a try block, so it can skip a few bad samples - # without breaking the whole thing - try: - - # Open sample TSV stat file - stats_file_dir = os.path.join(paths.output_dir,paths.results_subdir,sample_name) - stats_file_path = 
os.path.join(paths.output_dir,paths.results_subdir,sample_name,row['library']+'_stats.tsv') - if not os.path.isfile(stats_file_path): - for thefile in os.listdir(stats_file_dir): - if thefile.endswith("stats.tsv"): stats_file_path = os.path.join(stats_file_dir,thefile) - if os.path.isfile(stats_file_path): - stats_file = open(stats_file_path, 'rb') - print("Found: " + stats_file_path) - else: - raise Exception(stat_file_path + " : file does not exist!") - - - stats_dict = dict() - stats_dict_keys = dict() - - # Check if file has third column -> define pipelines based on that - # plus read info from file - - for line in stat_file: - - line_content = line.split('\t') - key = line_content[0] - value = line_content[1] - pip = "x" - if len(line_content) == 3: - pip = line_content[2].strip() - pipelines.append(pip) - if not pip in stats_dict: stats_dict[pip] = dict() - if not pip in stats_dict_keys: stats_dict_keys[pip] = [] - stats_dict[pip][key] = value.strip() - stats_dict_keys[pip].append(key) - - pipelines = list(set(pipelines)) - print "Pipelines: " + str(pipelines) - - - # stats_dict and stats_dict_keys are pipeline specific - for pip in pipelines: - if not pip in global_list: global_list[pip] = [] - if not pip in global_keys: global_keys[pip] = [] - - - - # if there are two pipelines make sure that certain values are present in both - missing_cols = ["Raw_reads", "Fastq_reads", "Trimmed_reads", "Trim_loss_rate"] - if len(pipelines) == 2: - for col in missing_cols: - if not col in stats_dict[pipelines[1]] and col in stats_dict[pipelines[0]]: stats_dict[pipelines[1]][col] = stats_dict[pipelines[0]][col] - if not col in stats_dict[pipelines[0]] and col in stats_dict[pipelines[1]]: stats_dict[pipelines[0]][col] = stats_dict[pipelines[1]][col] - for pip in pipelines: - stats_dict_keys[pip] = list(set(stats_dict_keys[pip] + missing_cols)) - - # Write to global list and keys - new_row = dict() - column_count = 0 - for pip in pipelines: - new_row = row.copy() - new_row.update(stats_dict[pip]) - global_list[pip].append(new_row) - global_keys[pip] = csv_reader.fieldnames + stats_dict_keys[pip] - - - except Exception as e: - - print("Sample " + sample_name + " failed. 
Error: " + str(e)) - -csv_file.close() -# print global_keys -# print global_list - -# Writing to Output Files -# ####################################################################################### -if not args.rigid: - - # Writing TSV file - # ####################################################################################### - - for pip in pipelines: - - pip_nam = "_" + pip - if pip_nam == "_x": pip_nam = "" - tsv_outfile_path = os.path.join(paths.output_dir,os.path.basename(paths.output_dir)+ pip_nam + '_stats_summary.tsv') - tsv_outfile = open(tsv_outfile_path, 'w') - - if global_list[pip] and global_keys[pip]: - - tsv_writer = csv.DictWriter(tsv_outfile, fieldnames=global_keys[pip], delimiter='\t') - tsv_writer.writeheader() - - for i,sample in enumerate(global_list[pip]): - tsv_writer.writerow(sample) - if args.excel: - for j,field in enumerate(global_keys[pip]): - if i == 0: xls_sheet.write(0, j, field) - xls_sheet.write(i+1, j, sample[field]) - - tsv_outfile.close() - - print("\nInput used: " + csv_file_path) - print("Results TSV file: " + tsv_outfile_path) - - - - - # Output XLS file - # ####################################################################################### - if args.excel: - - raise Exception("--excel not implemented") - - import xlwt - - for pip in pipelines: - - pip_nam = "_" + pip - if pip_nam == "_x": pip_nam = "" - - xls_book = xlwt.Workbook(encoding="utf-8") - xls_sheet_name = "Stats" + pip_nam - xls_sheet = xls_book.add_sheet(xls_sheet_name) - - # Where should this be written? Here or below? - # if args.rigid: - # for i,field in enumerate(fields_out): - # xls_sheet.write(0, i, field) - - import xlrd - import openpyxl - - # saving the XLS sheet - xls_filename = os.path.join(paths.output_dir,os.path.basename(paths.output_dir)+'_stats_summary.xls') - xls_book.save(xls_filename) - print("Results XLS file: " + xls_filename) - - # convert XLS sheet to XLSX format - xlsx_book_in = xlrd.open_workbook(xls_filename) - index = 0 - nrows = sample_count + 2 - ncols = 0 - if global_keys[pip]: ncols = len(global_keys[pip]) - else: ncols = column_count - ncols += 1 - xlsx_sheet_in = xlsx_book_in.sheet_by_index(0) - xlsx_book_out = openpyxl.Workbook() - xlsx_sheet = xlsx_book_out.active - xlsx_sheet.title = xls_sheet_name - for row in range(1, nrows): - for col in range(1, ncols): - xlsx_sheet.cell(row=row, column=col).value = xlsx_sheet_in.cell_value(row-1, col-1) - xlsx_filename = os.path.join(paths.output_dir,os.path.basename(paths.output_dir)+'_stats_summary.xlsx') - xlsx_book_out.save(xlsx_filename) - print("Results XLSX file: " + xlsx_filename) - - print("\n") - - - -# RIGID -else: - if args.excel: - raise Exception("--excel not implemented for option --rigid") - - for pip in pipelines: - - pip_nam = "_" + pip - if pip_nam == "_x": pip_nam = "" - # Open file to write to - tsv_outfile_path = os.path.join(paths.output_dir,os.path.basename(paths.output_dir)+ pip_nam + '_stats_summary.tsv') - tsv_outfile = open(tsv_outfile_path, 'w') - tsv_writer = csv.DictWriter(tsv_outfile, fieldnames=fields_out, delimiter='\t') - tsv_writer.writeheader() - - - - - # for each sample data (one element of the global list) - for sample_dict in global_list[pip]: - - new_row = dict() - # Write each field - for i in range(0,len(fields_in)): - - field = fields_in[i] - field_out = fields_out[i] - content = str('') - content_float = float(-1e10) - content_int = int(-1) - - # extract all the listed fields - # some data types might not have all the fields in stats_dict, then catch the 
KeyError - try: - if field == 'Trimmed_rate': - content_float = 100.0*float(sample_dict['Trimmed_reads'])/float(sample_dict['Raw_reads']) - elif field == 'Aligned_rate': - content_float = 100.0*float(sample_dict['Aligned_reads'])/float(sample_dict['Trimmed_reads']) - elif field == 'Multimap_rate': - content_float = 100.0*float(sample_dict['Multimap_reads'])/float(sample_dict['Trimmed_reads']) - elif field in sample_dict: - content = str(sample_dict[field].strip()) - else: - content = 'NA' - print("No field called: " + field) - except KeyError: - content = 'NA' - print("Data missing to calculate: " + field) - - # convert str to float or int if needed - got_comma = content.find('.') - try: - content_float = float(content) - except ValueError: - pass - if not got_comma: - content_int = int(content_float) - - # write the field for each row - if content_int > -1: - column_count += 1 - new_row[field_out] = content_int - if args.excel: xls_sheet.write(sample_count, i, content_int) - elif content_float > -1e10: - column_count += 1 - new_row[field_out] = content_float - if args.excel: xls_sheet.write(sample_count, i, content_float) - else: - column_count += 1 - new_row[field_out] = content - if args.excel: xls_sheet.write(sample_count, i, content) - - tsv_writer.writerow(new_row) - - tsv_outfile.close() - - - - - diff --git a/scripts/make_trackhubs.py b/scripts/make_trackhubs.py deleted file mode 100644 index 71f7cb0a..00000000 --- a/scripts/make_trackhubs.py +++ /dev/null @@ -1,543 +0,0 @@ -#! /usr/bin/env python -""" Create a trackhub for each sample. """ - -from argparse import ArgumentParser -import csv -import datetime -import getpass -import os -import subprocess -import yaml -from looper.looper import SAMPLE_EXECUTION_TOGGLE -from pypiper import AttributeDict - - -# Argument Parsing -# ####################################################################################### -parser = ArgumentParser(description='make_trackhubs') -parser.add_argument('-c', '--config-file', dest='config_file', help="path to YAML config file", required=True, type=str) -parser.add_argument('-f', dest='filter', action='store_false', required=False, default=True) -parser.add_argument('-v', '--visibility', dest='visibility', help='visibility mode (default: full)', required=False, default='full', type=str) -parser.add_argument('--copy', dest='copy', help='copy sepcified file types instead of creating symbolic links, example: --copy BAM-BB-BW-BED-TH', required=False, type=str) - -args = parser.parse_args() - -with open(args.config_file, 'r') as config_file: - config_yaml = yaml.load(config_file) - config = AttributeDict(config_yaml, default=True) - -trackhubs = config.trackhubs -paths = config.paths - -print(config) - -if not os.path.exists(paths.output_dir): - raise Exception(paths.output_dir + " : that project directory does not exist!") - -present_genomes = {} -subGroups_perGenome = {} -subGroups = { - "exp_category": {}, - "FACS_marker": {}, - "cell_type": {}, - "treatment": {}, - "treatment_length": {}, - "cell_count": {}, - "library": {}, - "data_type": {} -} -# add x- and y-dimension to subGroups even if they are not in the standard column selection: -subGroups[trackhubs.matrix_x] = {} -subGroups[trackhubs.matrix_y] = {} - - -csv_file_path = os.path.join(os.path.dirname(args.config_file), config.metadata.sample_annotation) -print "\nOpening CSV file: " + csv_file_path -if os.path.isfile(csv_file_path): - csv_file = open(os.path.join(os.path.dirname(args.config_file), config.metadata.sample_annotation), 'rb') # 
opens the csv file -else: - raise Exception(csv_file_path + " : that file does not exist!") - -try: - - csv_file_0 = open(os.path.join(os.path.dirname(args.config_file), config.metadata.sample_annotation), 'rb') - input_file_0 = csv.DictReader(csv_file_0) # creates the reader object - - pipeline = "" - genome = "" - for row in input_file_0: - if ("library" in row.keys()): - pipeline = str(row["library"]).upper() - if ("organism" in row.keys()): - genome = str(getattr(config.genomes, str(row["organism"]))) - print 'Pipeline: ' + pipeline - print 'Genome: ' + genome - print("Trackhub dir: " + trackhubs.trackhub_dir) - if pipeline != "": - pipeline += '_' - - paths.write_dir = "" - - if args.copy: - paths.write_dir = trackhubs.trackhub_dir - if not os.path.exists(paths.write_dir): - os.makedirs(paths.write_dir) - else: - paths.write_dir = paths.output_dir - if not os.path.islink(trackhubs.trackhub_dir): - os.symlink(os.path.relpath(paths.write_dir, os.path.dirname(trackhubs.trackhub_dir)), trackhubs.trackhub_dir) - print 'Linking to: ' + str(trackhubs.trackhub_dir) - else: - print 'Link already exists: ' + str(trackhubs.trackhub_dir) - print 'Writing files to: ' + paths.write_dir - - genomes_file = open(os.path.join(paths.write_dir, pipeline + 'genomes.txt'), 'w') - - track_out = os.path.join(paths.write_dir, genome) - if not os.path.exists(track_out): - os.makedirs(track_out) - print 'Writing tracks to: ' + track_out - else: - print 'Trackhubs already exists! Overwriting everything in ' + track_out - userID = os.getuid() - for root, dirs, files in os.walk(track_out, topdown=False): - for name in files: - ownerID = 0 - try: - ownerID = os.stat(os.path.join(root, name)).st_uid - except: - os.remove(os.path.join(root, name)) - if ownerID == userID: - try: - os.remove(os.path.join(root, name)) - except: - pass - for name in dirs: - ownerID = os.stat(os.path.join(root, name)).st_uid - if ownerID == userID: - try: - os.rmdir(os.path.join(root, name)) - except: - pass - - # write hub.txt - hub_file_name = pipeline + "hub.txt" - hub_file = open(os.path.join(paths.write_dir, hub_file_name), 'w') - hub_file.writelines("hub " + trackhubs.hub_name + "\n") - hub_file.writelines("shortLabel " + trackhubs.hub_name + "\n") - hub_file.writelines("longLabel " + trackhubs.hub_name + "\n") - hub_file.writelines("genomesFile " + pipeline + "genomes.txt\n") - hub_file.writelines("email " + trackhubs.email + "\n") - - # Write a HTML document. 
- html_out = str() - html_out_tab1 = str() - html_out_tab2 = str() - clean_title = os.path.basename(paths.output_dir).replace('_',' ') - # Write HTML header and title - html_out += '\n' - html_out += '\n' - html_out += '\n' - html_out += '\n' - html_out += '\n' - html_out += '\n' - html_out += '\n'.format(getpass.getuser()) - html_out += '\n'.format(datetime.datetime.now().isoformat()) - html_out += '\n'.format(clean_title) - html_out += '\n' - html_out += '{}\n'.format(clean_title) - html_out += '\n' - html_out += '\n' - - tableDict = dict() - - input_file = csv.DictReader(csv_file) - sample_count = 0 - - print '\nStart iterating over samples' - for row in input_file: # iterates the rows of the file in orders - - sample_count += 1 - - sample_name = row["sample_name"] - print '\nProcessing sample #' + str(sample_count) + " : " + sample_name - - tableDict[sample_name] = dict() - - if SAMPLE_EXECUTION_TOGGLE in row: - exec_flag = row[SAMPLE_EXECUTION_TOGGLE] - if exec_flag == "0" or exec_flag.lower() == "false": - print(sample_name + ": not selected") - continue - else: - print(sample_name + ": SELECTED") - - sample_path = os.path.join(paths.output_dir, paths.results_subdir, sample_name) - - present_subGroups = "\tsubGroups " - - # bsmap aligned bam files - bsmap_mapped_bam = os.path.join(sample_path, "bsmap_" + genome, sample_name + ".bam") - bsmap_mapped_bam_name = os.path.basename(bsmap_mapped_bam) - bsmap_mapped_bam_index = os.path.join(sample_path, "bsmap_" + genome, sample_name + ".bam.bai") - bsmap_mapped_bam_index_name = os.path.basename(bsmap_mapped_bam_index) - - # With the new meth bigbeds, RRBS pipeline should yield this file: - meth_bb_file = os.path.join(sample_path, "bigbed_" + genome, "RRBS_" + sample_name + ".bb") - meth_bb_name = os.path.basename(meth_bb_file) - - # bismark bigwig files - bismark_bw_file = os.path.join(sample_path, "bismark_" + genome, "extractor", sample_name + ".aln.dedup.filt.bw") - bismark_bw_name = os.path.basename(bismark_bw_file) - - # bigwigs are better actually - if not os.path.isfile(bismark_bw_file): - bismark_bw_file = os.path.join(sample_path, "bigwig_" + genome, "RRBS_" + sample_name + ".bw") - bismark_bw_name = os.path.basename(bismark_bw_file) - - # biseqMethcalling bed file - biseq_bed = os.path.join(sample_path, "biseq_" + genome, "RRBS_cpgMethylation_" + sample_name + ".bed") - biseq_bed_name = os.path.basename(biseq_bed) - - # tophat files - if args.filter: - tophat_bw_file = os.path.join(sample_path, "tophat_" + genome, sample_name + ".aln.filt_sorted.bw") - else: - tophat_bw_file = os.path.join(sample_path, "tophat_" + genome, sample_name + ".aln_sorted.bw") - tophat_bw_name = os.path.basename(tophat_bw_file) - - if os.path.isfile(tophat_bw_file) or os.path.isfile(bismark_bw_file) or os.path.isfile(meth_bb_file): - - track_out_file = os.path.join(track_out, pipeline + "trackDB.txt") - if track_out_file not in present_genomes.keys(): - # initialize a new genome - open(track_out_file, 'w').close() - genomes_file.writelines("genome " + genome.split('_')[0] + "\n") - genomes_file.writelines("trackDb " + os.path.join(genome, os.path.basename(track_out_file)) + "\n") - present_genomes[track_out_file] = [] - subGroups_perGenome[track_out_file] = subGroups - - # construct subGroups for each sample and initialize subgroups if not present - for key in subGroups_perGenome[track_out_file].keys(): - if key not in input_file.fieldnames: - continue - if not row[key] in ["NA", "", " "]: - present_subGroups += key + "=" + row[key] + " " - if not 
row[key] in subGroups_perGenome[track_out_file][key]: - subGroups_perGenome[track_out_file][key][row[key]] = row[key] - - # TODO NS: we should only have build these once; like so: - # Build short label - if trackhubs.short_label_column is not None: - shortLabel = row[trackhubs.short_label_column] - else: - shortLabel = "sl_" - if ("Library" in row.keys()): - shortLabel += row["library"][0] - if ("cell_type" in row.keys()): - shortLabel += "_" + row["cell_type"] - if ("cell_count" in row.keys()): - shortLabel += "_" + row["cell_count"] - - ########################################## - ### Aligned BAM files and index files - ########################################## - - if os.path.isfile(bsmap_mapped_bam): - - print " FOUND bsmap mapped file: " + bsmap_mapped_bam - - # copy or link the file to the hub directory - if args.copy and args.copy.find('BAM') > -1: - cmd = "cp " + bsmap_mapped_bam + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - cmd = "cp " + bsmap_mapped_bam_index + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(bsmap_mapped_bam, track_out), os.path.join(track_out, pipeline + bsmap_mapped_bam_name)) - os.symlink(os.path.relpath(bsmap_mapped_bam_index, track_out), os.path.join(track_out, pipeline + bsmap_mapped_bam_index_name)) - - # construct track for data file - track_text = "\n\ttrack " + bsmap_mapped_bam_name + "_Meth_Align" + "\n" - track_text += "\tparent DNA_Meth_Align on\n" - track_text += "\ttype bam\n" - track_text += present_subGroups + "data_type=Meth_Align" + "\n" - track_text += "\tshortLabel " + shortLabel + "\n" - track_text += "\tlongLabel " + sample_name + "_Meth_Align" + "\n" - track_text += "\tbigDataUrl " + pipeline + bsmap_mapped_bam_name + "\n" - - tableDict[sample_name]['BAM'] = dict([('label', 'BAM'), ('link', os.path.relpath(os.path.join(track_out, pipeline + bsmap_mapped_bam_name), track_out))]) - tableDict[sample_name]['BAI'] = dict([('label', 'BAI'), ('link', os.path.relpath(os.path.join(track_out, pipeline + bsmap_mapped_bam_index_name), track_out))]) - - present_genomes[track_out_file].append(track_text) - else: - print (" No bsmap mapped bam found: " + bsmap_mapped_bam_name) - - ########################################## - ### For BigBed files - ########################################## - - if os.path.isfile(meth_bb_file): - - print " FOUND BigBed file: " + meth_bb_file - - # copy or link the file to the hub directory - if args.copy and args.copy.find('BB') > -1: - cmd = "cp " + meth_bb_file + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(meth_bb_file, track_out), os.path.join(track_out, meth_bb_name)) - - # construct track for data file - track_text = "\n\ttrack " + meth_bb_name + "_Meth_BB" + "\n" - track_text += "\tparent DNA_Meth_BB on\n" - track_text += "\ttype bigBed\n" - track_text += present_subGroups + "data_type=Meth_BB" + "\n" - track_text += "\tshortLabel " + shortLabel + "\n" - track_text += "\tlongLabel " + sample_name + "_Meth_BB" + "\n" - track_text += "\tbigDataUrl " + pipeline + meth_bb_name + "\n" - - tableDict[sample_name]['BB'] = dict([('label', 'BB'), ('link', os.path.relpath(os.path.relpath(os.path.join(track_out, meth_bb_name), track_out)))]) - - present_genomes[track_out_file].append(track_text) - else: - print (" No Bigbed file found: " + meth_bb_file) - - ########################################## - ### For Methylation (bismark) BIGWIG files - ########################################## - - if 
os.path.isfile(bismark_bw_file): - print " FOUND bismark bw: " + bismark_bw_file - # copy or link the file to the hub directory - if args.copy and args.copy.find('BW') > -1: - cmd = "cp " + bismark_bw_file + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(bismark_bw_file, track_out), os.path.join(track_out, bismark_bw_name)) - # add data_type subgroup (not included in sampleAnnotation) - if "Meth" not in subGroups_perGenome[track_out_file]["data_type"]: - subGroups_perGenome[track_out_file]["data_type"]["Meth"] = "Meth" - # construct track for data file - track_text = "\n\ttrack " + bismark_bw_name + "_Meth" + "\n" - track_text += "\tparent " + trackhubs.parent_track_name + " on\n" - track_text += "\ttype bigWig\n" - track_text += present_subGroups + "data_type=Meth" + "\n" - track_text += "\tshortLabel " + shortLabel + "\n" - track_text += "\tlongLabel " + sample_name + "_Meth" + "\n" - track_text += "\tbigDataUrl " + bismark_bw_name + "\n" - track_text += "\tviewLimits 0:100" + "\n" - track_text += "\tviewLimitsMax 0:100" + "\n" - track_text += "\tmaxHeightPixels 100:30:10" + "\n" - - tableDict[sample_name]['BW'] = dict([('label', 'BW'), ('link', os.path.relpath(os.path.relpath(os.path.join(track_out, bismark_bw_name), track_out)))]) - - present_genomes[track_out_file].append(track_text) - else: - print (" No bismark bw found: " + bismark_bw_file) - - ########################################## - ### For biseq BED files - ########################################## - - if os.path.isfile(biseq_bed): - - print " FOUND biseq bed file: " + biseq_bed - - # copy or link the file to the hub directory - if args.copy and args.copy.find('BED') > -1: - cmd = "cp " + biseq_bed + " " + track_out - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(biseq_bed, track_out), os.path.join(track_out, biseq_bed_name)) - - tableDict[sample_name]['BED'] = dict([('label', 'BED'), ('link', os.path.relpath(os.path.join(track_out, biseq_bed_name), track_out))]) - - else: - print (" No biseq bed file found: " + biseq_bed) - - ########################################## - ### For RNA (tophat) files - ########################################## - - if os.path.isfile(tophat_bw_file): - print " FOUND tophat bw: " + tophat_bw_file - # copy or link the file to the hub directory - if args.copy and args.copy.find('TH') > -1: - cmd = "cp " + tophat_bw_file + " " + track_out + "\n" - cmd += "chmod o+r " + os.path.join(track_out, tophat_bw_name) - print(cmd) - subprocess.call(cmd, shell=True) - else: - os.symlink(os.path.relpath(tophat_bw_file, track_out), os.path.join(track_out, tophat_bw_name)) - # add data_type subgroup (not included in sampleAnnotation) - if "RNA" not in subGroups_perGenome[track_out_file]["data_type"]: - subGroups_perGenome[track_out_file]["data_type"]["RNA"] = "RNA" - # construct track for data file - track_text = "\n\ttrack " + tophat_bw_name + "_RNA" + "\n" - track_text += "\tparent " + trackhubs.parent_track_name + " on\n" - track_text += "\ttype bigWig\n" - track_text += present_subGroups + "data_type=RNA" + "\n" - track_text += "\tshortLabel " + shortLabel + "\n" - track_text += "\tlongLabel " + sample_name + "_RNA" + "\n" - track_text += "\tbigDataUrl " + tophat_bw_name + "\n" - track_text += "\tautoScale on" + "\n" - - tableDict[sample_name]['TH'] = dict([('label', 'BW'), ('link', os.path.relpath(os.path.join(track_out, tophat_bw_name), track_out))]) - - present_genomes[track_out_file].append(track_text) - else: - 
print (" No tophat bw found: " + tophat_bw_file) - - # write composit-header followed by the individual tracks to a genome specific trackDB.txt - composit_text = "" - for key in present_genomes.keys(): - # construct composite header - composit_text += "\ntrack " + str(trackhubs.parent_track_name) + "\n" - composit_text += "compositeTrack on" - count = 0 - dim_text = "dimensions dimX=" + str(trackhubs.matrix_x) + " dimY=" + str(trackhubs.matrix_y) - for subGroup in subGroups_perGenome[key].keys(): - if len(subGroups_perGenome[key][subGroup]) < 1: - continue - if not subGroup == str(trackhubs.matrix_x) and not subGroup == str(trackhubs.matrix_y): - dim_text += " dimA=" + subGroup - count += 1 - composit_text += "\nsubGroup" + str(count) + " " + subGroup + " " + subGroup + " " - for type in subGroups_perGenome[key][subGroup].keys(): - composit_text += type + "=" + subGroups_perGenome[key][subGroup][type] + " " - composit_text += "\nshortLabel " + str(trackhubs.parent_track_name) + "\n" - composit_text += "longLabel " + str(trackhubs.parent_track_name) + "\n" - composit_text += "type bigWig" + "\n" - composit_text += "color 0,60,120" + "\n" - composit_text += "spectrum on" + "\n" - composit_text += "visibility " + args.visibility + "\n" - composit_text += dim_text + "\n" - composit_text += "sortOrder " + str(trackhubs.sortOrder) + "\n" - - # write composite header - trackDB = open(key, 'a') - trackDB.writelines(composit_text) - # write individual tracks - for i in range(len(present_genomes[key])): - trackDB.writelines(present_genomes[key][i]) - super_text = "\n" - super_text += "track DNA_Meth_Align\n" - super_text += "shortLabel DNA_Meth_Align\n" - super_text += "longLabel DNA_Meth_Align\n" - super_text += "superTrack on\n" - super_text += "\n" - super_text += "track DNA_Meth_BB\n" - super_text += "shortLabel DNA_Meth_BB\n" - super_text += "longLabel DNA_Meth_BB\n" - super_text += "superTrack on\n" - - trackDB.writelines(super_text) - trackDB.close() - - report_name = pipeline + 'report.html' - - html_out += '\n' - html_out += '

{} Project\n'.format(clean_title) - html_out += '\n' - - today = datetime.datetime.now() - #html_out += 'Last updated on ' + str(today.day) +'/'+ str(today.month) +'/'+ str(today.year) + ' at ' + str(today.hour) +':'+ str(today.minute) +'\n' - html_out += '\n' - - html_out += 'Useful Links\n' - tsv_stats_name = os.path.basename(paths.output_dir)+'_stats_summary.tsv' - tsv_stats_path = os.path.relpath(os.path.join(paths.output_dir,tsv_stats_name),track_out) - xls_stats_name = os.path.basename(paths.output_dir)+'_stats_summary.xls' - xls_stats_path = os.path.relpath(os.path.join(paths.output_dir,xls_stats_name),track_out) - xlsx_stats_name = os.path.basename(paths.output_dir)+'_stats_summary.xlsx' - xlsx_stats_path = os.path.relpath(os.path.join(paths.output_dir,xlsx_stats_name),track_out) - - if os.path.isfile(os.path.join(paths.write_dir,tsv_stats_name)): - if os.path.isfile(os.path.join(paths.write_dir,xls_stats_name)): - if os.path.isfile(os.path.join(paths.write_dir,xlsx_stats_name)): - html_out += 'Stats summary table: {} {} {}\n'.format(tsv_stats_path,'TSV',xls_stats_path,'XLS', xlsx_stats_path,'XLSX') - else: - html_out += 'Stats summary table: {} {}\n'.format(tsv_stats_path,'TSV',xls_stats_path,'XLS') - else: - html_out += 'Stats summary table: {}\n'.format(tsv_stats_path,'TSV') - url = str(trackhubs.url).replace(':','%3A').replace('/','%2F') - paths.ucsc_browser_link = 'https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db='+genome.split('_')[0]+'&hubUrl='+url+'%2F'+hub_file_name - html_out += 'UCSC Genome Browser: {}\n'.format(paths.ucsc_browser_link,'Link') - html_out += '\n' - - html_file_name = os.path.join(track_out, report_name) - file_handle = open(name=html_file_name, mode='w') - file_handle.write(html_out) - - - html_out_tab = 'Data Files\n' - html_out_tab += '\n' - html_out_tab += '\n' - html_out_tab += '\n' - html_out_tab += '\n' - html_out_tab += '\n' - html_out_tab += '\n' - html_out_tab += '\n' - html_out_tab += '\n' - html_out_tab += '\n' - key_list = tableDict.keys() - key_list.sort() - counter = 0 - for key in key_list: - counter += 1 - value = tableDict[key] - html_out_tab += '\n' - html_out_tab += '\n'.format(str(counter)+'.') - html_out_tab += '\n'.format(key) - html_out_tab += '\n'.format(value['BAM']['link'],value['BAM']['label']) - html_out_tab += '\n'.format(value['BAI']['link'],value['BAI']['label']) - html_out_tab += '\n'.format(value['BB']['link'],value['BB']['label']) - html_out_tab += '\n'.format(value['BW']['link'],value['BW']['label']) - html_out_tab += '\n'.format(value['BED']['link'],value['BED']['label']) - html_out_tab += '\n' - html_out_tab += 'Sample NameAligned BAMBAM IndexBigBedBigWigBiseq Bed{}{}{}{}{}{}{}\n' - file_handle.write(html_out_tab) - - html_out = '\n' - html_out += 'This report was generated with software of the Biomedical Sequencing Facility: www.biomedical-sequencing.at\n' - html_out += 'Contact: bsf@cemm.oeaw.ac.at\n' - html_out += '\n' - html_out += 'Valid XHTML 1.0 Transitional\n' - html_out += 'Valid CSS!
' - html_out += '\n' - html_out += '\n' - html_out += '\n' - - file_handle.write(html_out) - file_handle.close() - - html_link_name = os.path.join(track_out, "index.html") - os.symlink(os.path.relpath(html_file_name,track_out),html_link_name) - - cmd = "cp /scratch/lab_bsf/projects/BSA_0000_RRBS_Global_Report/styles.css " + track_out - subprocess.call(cmd, shell=True) - cmd = "chmod -R go+rX " + paths.write_dir - subprocess.call(cmd, shell=True) - - hub_file_link = str(trackhubs.url) + "/" + hub_file_name - report_link = str(trackhubs.url) + "/" + genome + "/" - link_string = 'Report ' + report_link + '\n' - link_string += 'UCSCbrowser ' + paths.ucsc_browser_link + '\n' - print '\nDONE!' - print link_string - - link_file = open(name=os.path.join(paths.write_dir, pipeline + 'links.txt'), mode='w') - link_file.write(link_string) - link_file.close() - -finally: - csv_file.close() diff --git a/scripts/normalize_wig.R b/scripts/normalize_wig.R deleted file mode 100755 index 38b0f443..00000000 --- a/scripts/normalize_wig.R +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env Rscript - -library(data.table) -suppressPackageStartupMessages(library("optparse")) - -##scale=10000000 -##genome="mm10" -##results_dir="/scratch/lab_bock/shared/projects/geissmann/results_pipeline/results_pipeline/" -##stats_path="/scratch/lab_bock/shared/projects/geissmann/results_pipeline/results_pipeline/ALL_stats_summary.tsv" - -# specify our desired options in a list -option_list = list( - make_option(c("-r", "--results_dir"), type="character", help="Input Results folder (REQUIRED)"), - make_option(c("-g", "--genome"), type="character", help="Genome used for alignment (REQUIRED)"), - make_option(c("-s", "--stats"), type="character", help="Alignment stats table for all samples (REQUIRED)"), - make_option(c("-n", "--scale"), type="character", help="Normalization scale (REQUIRED)") - ) - -opt = parse_args(OptionParser(option_list=option_list)) -if (length(opt)<4) { - print_help(OptionParser(option_list=option_list)) -}else { - results_dir=opt$results_dir - genome=opt$genome - stats_path=opt$stats - scale=opt$scale -} - -print(results_dir) -print(genome) -print(stats_path) -print(scale) - -chroSizes_path=paste0("/data/groups/lab_bock/shared/resources/genomes/",genome,"/",genome,".chromSizes") - - - - -stats=fread(stats_path) -stats=stats[pipeline=="rnaTopHat"] -stats[,wigPath:=paste0(results_dir,"/",sampleName,"/tophat_",genome,"/",sampleName,".aln_sorted.wig"),] - -for (i in c(2:nrow(stats))){ - sampleName=stats[i]$sampleName - message(sampleName) - wigFileName=stats[i]$wigPath - mappedReads=stats[i]$Aligned_reads - if (file.exists(wigFileName)){ - system(paste0("sed 's/ \\+/\\t/g' ",wigFileName," > ", wigFileName,"_temp",sep="")) - wig=fread(paste0(wigFileName,"_temp"),header=FALSE) - wig[V1=="variableStep",V3:=paste0(V1," ",V2)] - wig[grep("variableStep",V3),V1:=NA] - wig[grep("variableStep",V3),V2:=NA] - wig[,V2:=round(as.numeric(V2)/mappedReads*scale,2),] - wig[,c("V1","V2"):=list(as.character(V1),as.character(V2)),] - wig[grep("variableStep",V3),c("V1","V2"):=list(V3,"")] - wig[,V3:=NULL,] - write.table(wig,sub(".wig","_norm.wig_temp",wigFileName),sep="\t",col.names=FALSE,row.names=FALSE,quote=FALSE) - system(paste0("sed 's/\t$//g' ",sub(".wig","_norm.wig_temp",wigFileName)," > ", sub(".wig","_norm.wig",wigFileName))) - system(paste("wigToBigWig",sub(".wig","_norm.wig",wigFileName),chroSizes_path,sub(".wig","_norm.bw",wigFileName),sep=" ")) - system(paste("rm ",sub(".wig",".wig_temp",wigFileName))) - system(paste("rm 
",sub(".wig","_norm.wig_temp",wigFileName))) - }else{ - message(paste0("File not found. Skipping: ",wigFileName)) - next} -} - diff --git a/scripts/normalize_wig_submit.sh b/scripts/normalize_wig_submit.sh deleted file mode 100755 index 374eb7b5..00000000 --- a/scripts/normalize_wig_submit.sh +++ /dev/null @@ -1,13 +0,0 @@ -#! /bin/bash - -scale=10000000 -genome="mm10" -results_dir="/scratch/lab_bock/shared/projects/geissmann/results_pipeline/results_pipeline/" -stats_path="/scratch/lab_bock/shared/projects/geissmann/results_pipeline/results_pipeline/ALL_stats_summary.tsv" - -logdir="$results_dir/log/" -mkdir -p $logdir - - - -sbatch --export=NONE --get-user-env=L --job-name=normalize_wig --ntasks=1 --cpus-per-task=1 --mem-per-cpu=8000 --partition=longq --time=2-00:00:00 -o ${logdir}/normalize_wig_%j.log normalize_wig.R -g $genome -n $scale -r $results_dir -s $stats_path \ No newline at end of file diff --git a/scripts/summarizePipelineStats.R b/scripts/summarizePipelineStats.R deleted file mode 100755 index 4f7f150f..00000000 --- a/scripts/summarizePipelineStats.R +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env Rscript -options(echo=FALSE); -library(data.table) -library(reshape2) #no longer necessary after data.table 1.9.5?? -suppressPackageStartupMessages(library("optparse")) - -# specify our desired options in a list -option_list = list( -make_option(c("-i", "--inputFolder"), type="character", help="Input Results folder (REQUIRED)")) - -opt = parse_args(OptionParser(option_list=option_list)) -if (is.null(opt$inputFolder)) { - print_help(OptionParser(option_list=option_list)); - inputFolder = "/fhgfs/groups/lab_bock/shared/COREseq/results_pipeline3" -# q(); -} else { - inputFolder=opt$inputFolder -} -message("input folder: ", inputFolder); -pipeDirs = list.dirs(inputFolder, recursive=FALSE) - -message("Read all *_stats.tsv files in the pipeline results folder") -results=list() -dir = pipeDirs[[1]]; -for (dir in pipeDirs) { - message(dir); - statFiles = list.files(dir, pattern="_stats", recursive=FALSE) - statFiles2 = list.files(dir, pattern="stats_", recursive=FALSE) - statFiles = c(statFiles, statFiles2) - for (statFile in statFiles) { - message(statFile); - pipeline = gsub("_stats.tsv", "", statFile) - pipeline = gsub("stats_", "", pipeline) - statPath = paste0(dir, "/", statFile); - # Not the best, but I had to put this in just in case - # there are empty lines in the stat file; this removes them - message(":") - system(paste0("sed -i '/^\\s*$/d' ", statPath)) - message(":") - a = fread(statPath) - setnames(a, c("key", "value")) - a[,key:=gsub(" ", "_", key)] # Change spaces to underscores - #Order keys as factors, to maintain order through later cast. - a[,key:=factor(key, levels=unique(key))] - #setkey(a, "key") - a[,sampleName:=basename(dir)] - a[,pipeline:=pipeline] - sampleName = basename(dir) - if (is.null(results[[pipeline]])) { results[[pipeline]] = list(); } - results[[pipeline]][[sampleName]] = a; - } -} -if (length(results) ==0) { - stop("No stats files found."); -} -results -#Combined, divided by pipeline -resultsDT = lapply(results, function(x) { do.call(rbind, x); }) - -# Select latest for identical statistics -resultsDT = lapply(resultsDT, function(x) { x[,list(value=value[length(value)]), by=c("key", "sampleName", "pipeline"), roll=+Inf] }) - -# Cast to wide format -resultsMat = lapply(resultsDT, dcast, formula= "... ~ key") -resultsMat = lapply(resultsMat, as.data.table) -# Convert number-only cols to numerics, so I can do some stats below. 
-numToNumeric = function(DT) { - return(DT[,lapply(.SD, function(x) { if(!any(grepl("[a-zA-Z:_\\-]", x))) { return(as.numeric(x)); } else { return(x)} })]) -} -#lapply(resultsMat, sapply, mode) -resultsMat = lapply(resultsMat, numToNumeric) - -################################################################################ -# Do any pipeline-specific calculations here -################################################################################ -nofail = function(x) { - tryCatch( {x}, error = function(e) { message("Pipeline-specific summary error: ", e); } ) -} - #WGBS - if ("WGBS" %in% names(resultsMat) ) { - nofail( { resultsMat$WGBS[, total_efficiency := (Deduplicated_reads)/Raw_reads] }) - nofail( { resultsMat$WGBS[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] }) - nofail( { resultsMat$WGBS[, alignment_rate := (Aligned_reads)/Trimmed_reads] }) - nofail( { resultsMat$WGBS[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads] }) - nofail( { resultsMat$WGBS[, filt_loss_rate := (Deduplicated_reads - Filtered_reads)/Deduplicated_reads] }) - } - - if ("RRBS" %in% names(resultsMat) ) { - nofail( { resultsMat$RRBS[, total_efficiency := (Aligned_reads)/Raw_reads] }) - nofail( { resultsMat$RRBS[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] }) - nofail( { resultsMat$RRBS[, alignment_rate := (Aligned_reads)/Trimmed_reads] }) - #nofail( { resultsMat$RRBS[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads] }) - #nofail( { resultsMat$RRBS[, filt_loss_rate := (Deduplicated_reads - Filtered_reads)/Deduplicated_reads] }) - } - - # Tophat -if ("rnaTopHat" %in% names(resultsMat) ) { - nofail( { resultsMat$rnaTopHat[, total_efficiency := Filtered_reads/Raw_reads] }) - nofail( { resultsMat$rnaTopHat[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] }) - nofail( { resultsMat$rnaTopHat[, alignment_rate := (Aligned_reads)/Trimmed_reads] }) - nofail( { resultsMat$rnaTopHat[, dupe_loss_rate := (Filtered_reads - Deduplicated_reads)/Filtered_reads] }) - nofail( { resultsMat$rnaTopHat[, filt_loss_rate := (Aligned_reads - Filtered_reads)/Aligned_reads] }) - } - - # Bitseq -if ("rnaBitSeq" %in% names(resultsMat) ) { - nofail( { resultsMat$rnaBitSeq[, total_efficiency := Filtered_reads/Raw_reads] }) - nofail( { resultsMat$rnaBitSeq[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] }) - nofail( { resultsMat$rnaBitSeq[, alignment_rate := (Aligned_reads)/Trimmed_reads] }) - nofail( { resultsMat$rnaBitSeq[, dupe_loss_rate := (Filtered_reads - Deduplicated_reads)/Filtered_reads] }) - nofail( { resultsMat$rnaBitSeq[, filt_loss_rate := (Aligned_reads - Filtered_reads)/Aligned_reads] }) - nofail( { resultsMat$rnaBitSeq[, ERCC_alignment_rate := (ERCC_aligned_reads)/Trimmed_reads] }) - } - -################################################################################ -# Write results -################################################################################ -commonCols = Reduce(intersect, lapply(resultsMat, colnames)); -commonList = lapply(resultsMat, function(x) { x[,commonCols, with=FALSE] }) -commonTable = do.call(rbind, commonList) - - -# Write individual result tables for each pipeline -pipelines = names(resultsMat) -for (p in pipelines) { - pipeStatFile = paste0(inputFolder, "/", p, "_stats_summary.tsv") - message("Writing pipeline stats table: ", pipeStatFile) - write.table(resultsMat[[p]], pipeStatFile, sep="\t",row.names=FALSE,quote=FALSE) -} -if (length(names(resultsMat)) > 1 ) { # only if there are multiple pipelines -# 
Produce an additional table with only common features -commonTableFile = paste0(inputFolder, "/ALL_stats_summary.tsv"); -message("Writing common table: ", commonTableFile); -write.table(commonTable, commonTableFile,sep="\t",row.names=FALSE,quote=FALSE) -} - diff --git a/scripts/summarizePipelineStats_complex.R b/scripts/summarizePipelineStats_complex.R deleted file mode 100755 index d0888b30..00000000 --- a/scripts/summarizePipelineStats_complex.R +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env Rscript -options(echo=FALSE); -library(data.table) -library(reshape2) #no longer necessary after data.table 1.9.5?? -suppressPackageStartupMessages(library("optparse")) - -# specify our desired options in a list -option_list = list( -make_option(c("-i", "--inputFolder"), type="character", help="Input Results folder (REQUIRED)")) - -opt = parse_args(OptionParser(option_list=option_list)) -if (is.null(opt$inputFolder)) { - print_help(OptionParser(option_list=option_list)); - inputFolder = "/fhgfs/groups/lab_bock/shared/COREseq/results_pipeline3" -# q(); -} else { - inputFolder=opt$inputFolder -} - -message("input folder: ", inputFolder); -pipeDirs = list.dirs(inputFolder, recursive=FALSE) - -message("Read all *_stats.txt files in the pipeline results folder") -results=list() -dir = pipeDirs[[1]]; -for (dir in pipeDirs) { - message(dir); - statFiles = list.files(dir, pattern="_stats.tsv") - statFiles2 = list.files(dir, pattern="stats_") - statFiles = c(statFiles, statFiles2) - for (statFile in statFiles) { - message(statFile); - pipeline = gsub("_stats.tsv", "", statFile) - pipeline = gsub("stats_", "", pipeline) - statPath = paste0(dir, "/", statFile); - a = fread(statPath) - setnames(a, c("key", "value")) - a[,key:=gsub(" ", "_", key)] # Change spaces to underscores - #Order keys as factors, to maintain order through later cast. - a[,key:=factor(key, levels=unique(key))] - #setkey(a, "key") - a[,sampleName:=basename(dir)] - a[,pipeline:=pipeline] - sampleName = basename(dir) - if (is.null(results[[pipeline]])) { results[[pipeline]] = list(); } - results[[pipeline]][[sampleName]] = a; - } -} -if (length(results) ==0) { - stop("No stats files found."); -} -results -#Combined, divided by pipeline -resultsDT = lapply(results, function(x) { do.call(rbind, x); }) - -# Select latest for identical statistics -resultsDT = lapply(resultsDT, function(x) { x[,list(value=value[length(value)]), by=c("key", "sampleName", "pipeline"), roll=+Inf] }) - -# Cast to wide format -resultsMat = lapply(resultsDT, dcast, formula= "... ~ key") -resultsMat = lapply(resultsMat, as.data.table) -# Convert number-only cols to numerics, so I can do some stats below. 
-numToNumeric = function(DT) { - return(DT[,lapply(.SD, function(x) { if(!any(grepl("[a-zA-Z:_\\-]", x))) { return(as.numeric(x)); } else { return(x)} })]) -} -resultsMat = lapply(resultsMat, numToNumeric) -#lapply(resultsMat, sapply, mode) - -################################################################################ -# Do any pipeline-specific calculations here -################################################################################ - -#WGBS - -if ("WGBS" %in% names(resultsMat)){ - resultsMat$WGBS[, total_efficiency := (Deduplicated_reads)/Raw_reads] - resultsMat$WGBS[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] - resultsMat$WGBS[, alignment_rate := (Aligned_reads)/Trimmed_reads] - resultsMat$WGBS[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads] - resultsMat$WGBS[, filt_loss_rate := (Deduplicated_reads - Filtered_reads)/Deduplicated_reads] -} - - -# Tophat -if ("rnaTopHat" %in% names(resultsMat)){ - if ("Filtered_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaTopHat[, total_efficiency := Filtered_reads/Raw_reads]} - resultsMat$rnaTopHat[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] - resultsMat$rnaTopHat[, alignment_rate := (Aligned_reads)/Trimmed_reads] - if ("Filtered_reads" %in% names(resultsMat$rnaTopHat)){ - if ("Deduplicated_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaTopHat[, dupe_loss_rate := (Filtered_reads - Deduplicated_reads)/Filtered_reads]} - resultsMat$rnaTopHat[, filt_loss_rate := (Aligned_reads - Filtered_reads)/Aligned_reads]} - else if ("Deduplicated_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaTopHat[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads]} -} -# Bitseq -if ("rnaBitSeq" %in% names(resultsMat)){ - if ("Filtered_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaBitSeq[, total_efficiency := Filtered_reads/Raw_reads]} - resultsMat$rnaBitSeq[, trim_loss_rate := (Raw_reads - Trimmed_reads)/Raw_reads] - resultsMat$rnaBitSeq[, alignment_rate := Aligned_reads/Trimmed_reads] - if ("Filtered_reads" %in% names(resultsMat$rnaTopHat)){ - resultsMat$rnaBitSeq[, dupe_loss_rate := (Filtered_reads - Deduplicated_reads)/Filtered_reads] - resultsMat$rnaBitSeq[, filt_loss_rate := (Aligned_reads - Filtered_reads)/Aligned_reads]} - else {resultsMat$rnaBitSeq[, dupe_loss_rate := (Aligned_reads - Deduplicated_reads)/Aligned_reads]} - resultsMat$rnaBitSeq[, ERCC_alignment_rate := (ERCC_aligned_reads)/Trimmed_reads] -} - -################################################################################ -# Write results -################################################################################ -commonCols = Reduce(intersect, lapply(resultsMat, colnames)); -commonList = lapply(resultsMat, function(x) { x[,commonCols, with=FALSE] }) -commonTable = do.call(rbind, commonList) - - -# Write individual result tables for each pipeline -pipelines = names(resultsMat) -for (p in pipelines) { - pipeStatFile = paste0(inputFolder, "/", p, "_stats_summary.tsv") - message("Writing pipeline stats table: ", pipeStatFile) - write.table(resultsMat[[p]], pipeStatFile, sep="\t",row.names=FALSE,quote=FALSE) -} - -# Produce an additional table with only common features -commonTableFile = paste0(inputFolder, "/ALL_stats_summary.tsv"); -message("Writing common table: ", commonTableFile); -write.table(commonTable, commonTableFile,sep="\t",row.names=FALSE,quote=FALSE) - - diff --git a/setup.cfg b/setup.cfg index e13f2591..5d8fdac6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 
+1,7 @@ +[aliases] +test = pytest + [pytest] # Only request extra info from failures and errors. addopts = -rfE -[aliases] -test = pytest - diff --git a/setup.py b/setup.py index c54d33a9..1e102fe0 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,36 @@ #! /usr/bin/env python import os +from setuptools import setup import sys # Additional keyword arguments for setup(). extra = {} +# Ordinary dependencies DEPENDENCIES = [] with open("requirements/requirements-all.txt", "r") as reqs_file: for line in reqs_file: if not line.strip(): continue - DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) + #DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) + DEPENDENCIES.append(line) +# numexpr for pandas try: - from setuptools import setup - if sys.version_info >= (3,): - extra["use_2to3"] = True - extra["install_requires"] = DEPENDENCIES + import numexpr except ImportError: - from distutils.core import setup - extra["requires"] = DEPENDENCIES + # No numexpr is OK for pandas. + pass +else: + # pandas 0.20.2 needs updated numexpr; the claim is 2.4.6, but that failed. + DEPENDENCIES.append("numexpr==2.6.2") + +# 2to3 +if sys.version_info >= (3, ): + extra["use_2to3"] = True +extra["install_requires"] = DEPENDENCIES # Additional files to include with package @@ -34,7 +43,9 @@ def get_static(name, condition=None): return [i for i in filter(lambda x: eval(condition), static)] # scripts to be added to the $PATH -scripts = get_static("scripts", condition="'.' in x") +# scripts = get_static("scripts", condition="'.' in x") +# scripts removed (TO remove this) +scripts = None with open("looper/_version.py", 'r') as versionfile: version = versionfile.readline().split()[-1].strip("\"'\n") @@ -64,9 +75,7 @@ def get_static(name, condition=None): package_data={'looper': ['submit_templates/*']}, include_package_data=True, test_suite="tests", - tests_require=["mock", "pytest"], - setup_requires=(["pytest-runner"] - if {"ptr", "test", "pytest"} & set(sys.argv) - else []), + tests_require=(["mock", "pytest"]), + setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []), **extra ) diff --git a/tests/conftest.py b/tests/conftest.py index 96cdf5f9..4765186b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ """ +import copy import logging import os import shutil @@ -16,21 +17,22 @@ from pandas.io.parsers import EmptyDataError import pytest +import yaml from looper import setup_looper_logger -from looper.models import PipelineInterface -from looper.loodels import Project +from looper.models import PipelineInterface, Project, SAMPLE_NAME_COLNAME -# TODO: needed for interactive mode, but may crush cmdl option for setup. _LOGGER = logging.getLogger("looper") +P_CONFIG_FILENAME = "project_config.yaml" + # {basedir} lines are formatted during file write; other braced entries remain. 
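The comment that closes the hunk above describes how the test helper treats the `PROJECT_CONFIG_LINES` template that follows: only lines containing `{basedir}` are interpolated when the temporary files are written, while the other braced tokens are left in place for later substitution. A minimal, hypothetical sketch of that selective formatting (the project's own logic lives in `_write_temp`, shown further down in this hunk):

```python
# Illustration only; "fill_basedir" is a made-up helper, not part of looper.
def fill_basedir(template_lines, basedir):
    """Interpolate {basedir} eagerly; leave every other {token} untouched."""
    filled = []
    for line in template_lines:
        if "{basedir}" in line:
            # Assumes such lines carry no other placeholders, as described
            # by the comment above.
            line = line.format(basedir=basedir)
        filled.append(line)
    return filled

print(fill_basedir(["  output_dir: {basedir}/out\n",
                    "  data_R1: \"{sample_name}_R1.fastq\"\n"], "/tmp/demo"))
# ['  output_dir: /tmp/demo/out\n', '  data_R1: "{sample_name}_R1.fastq"\n']
```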
PROJECT_CONFIG_LINES = """metadata: sample_annotation: samples.csv output_dir: test - pipelines_dir: pipelines + pipeline_interfaces: pipelines merge_table: merge.csv derived_columns: [{derived_column_names}] @@ -171,6 +173,18 @@ } COMPARISON_FUNCTIONS = ["__eq__", "__ne__", "__len__", "keys", "values", "items"] +COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"] +PROJECT_CONFIG_DATA = {"metadata": {"sample_annotation": "annotations.csv"}} + + + +def update_project_conf_data(extension): + """ Updated Project configuration data mapping based on file extension """ + updated = copy.deepcopy(PROJECT_CONFIG_DATA) + filename = updated["metadata"]["sample_annotation"] + base, _ = os.path.splitext(filename) + updated["metadata"]["sample_annotation"] = "{}.{}".format(base, extension) + return updated @@ -207,11 +221,47 @@ def conf_logs(request): + +@pytest.fixture(scope="function") +def sample_annotation_lines(): + return SAMPLE_ANNOTATION_LINES + + + +@pytest.fixture(scope="function") +def path_empty_project(request, tmpdir): + """ Provide path to Project config file with empty annotations. """ + + # Determine how to write the data and how to name a file. + if "delimiter" in request.fixturenames: + delimiter = request.getfixturevalue("delimiter") + extension = "txt" + else: + delimiter = "," + extension = "csv" + + # Update the Project configuration data. + conf_data = update_project_conf_data(extension) + + # Write the needed files. + anns_path = os.path.join( + tmpdir.strpath, conf_data["metadata"]["sample_annotation"]) + + with open(anns_path, 'w') as anns_file: + anns_file.write(delimiter.join(COLUMNS)) + conf_path = os.path.join(tmpdir.strpath, "proj-conf.yaml") + with open(conf_path, 'w') as conf_file: + yaml.dump(conf_data, conf_file) + + return conf_path + + + def interactive(prj_lines=PROJECT_CONFIG_LINES, iface_lines=PIPELINE_INTERFACE_CONFIG_LINES, merge_table_lines = MERGE_TABLE_LINES, sample_annotation_lines=SAMPLE_ANNOTATION_LINES, - project_kwargs=None): + loglevel=logging.DEBUG, project_kwargs=None): """ Create Project and PipelineInterface instances from default or given data. @@ -227,14 +277,24 @@ def interactive(prj_lines=PROJECT_CONFIG_LINES, table file :param collections.Iterable[str] sample_annotation_lines: lines for a sample annotations file + :param str | int loglevel: level at which to attend to log messages :param dict project_kwargs: keyword arguments for Project constructor :return Project, PipelineInterface: one Project and one PipelineInterface, """ + + # Establish logging for interactive session + import logging, sys + h = logging.StreamHandler(sys.stdout) + h.setLevel(loglevel) + logging.root.setLevel(loglevel) + logging.root.addHandler(h) + + # TODO: don't work with tempfiles once ctors tolerate Iterable. 
dirpath = tempfile.mkdtemp() path_conf_file = _write_temp( prj_lines, - dirpath=dirpath, fname="project_config.yaml") + dirpath=dirpath, fname=P_CONFIG_FILENAME) path_iface_file = _write_temp( iface_lines, dirpath=dirpath, fname="pipeline_interface.yaml") @@ -287,8 +347,8 @@ def _write_temp(lines, dirpath, fname): **{"derived_column_names": ", ".join(DERIVED_COLNAMES)} ) filepath = os.path.join(dirpath, fname) - _LOGGER.debug("Writing %d lines to file '%s'", len(lines), filepath) data_source_formatter = string.Formatter() + num_lines = 0 with open(filepath, 'w') as tmpf: for l in lines: if "{basedir}" in l: @@ -298,7 +358,67 @@ def _write_temp(lines, dirpath, fname): l = data_source_formatter.vformat( l, (), derived_columns_replacement) tmpf.write(l) - return tmpf.name + num_lines += 1 + _LOGGER.debug("Wrote %d line(s) to disk: '%s'", num_lines, filepath) + return filepath + + + +@pytest.fixture(scope="function") +def project_config_lines(): + """ Provide safer iteration over the lines for Project config file. """ + return PROJECT_CONFIG_LINES + + + +@pytest.fixture(scope="function") +def path_project_conf(tmpdir, project_config_lines): + """ + Write the Project configuration data. + + :param py.path.local.LocalPath tmpdir: temporary Path fixture + :param Iterable[str] project_config_lines: collection of lines for + Project configuration file + :return str: path to file with Project configuration data + """ + return _write_temp( + project_config_lines, tmpdir.strpath, P_CONFIG_FILENAME) + + + +@pytest.fixture(scope="function") +def proj_conf_data(path_project_conf): + """ + Read and parse raw Project configuration data. + + :param str path_project_conf: path to file with Project configuration data + :return Mapping: the data parsed from the configuration file written, + a Mapping form of the raw Project config text lines + """ + with open(path_project_conf, 'r') as conf_file: + return yaml.safe_load(conf_file) + + + +@pytest.fixture(scope="function") +def path_sample_anns(tmpdir, sample_annotation_lines): + """ + Write the sample annotations file and return the path to it. + + :param py.path.local.LocalPath tmpdir: temporary Path fixture + :param Iterable[str] sample_annotation_lines: collection of lines for + the sample annotations files + :return str: path to the sample annotations file that was written + """ + filepath = _write_temp( + sample_annotation_lines, tmpdir.strpath, ANNOTATIONS_FILENAME) + return filepath + + + +@pytest.fixture(scope="function") +def p_conf_fname(): + return P_CONFIG_FILENAME @@ -313,7 +433,7 @@ def write_project_files(request): """ dirpath = tempfile.mkdtemp() path_conf_file = _write_temp(PROJECT_CONFIG_LINES, - dirpath=dirpath, fname="project_config.yaml") + dirpath=dirpath, fname=P_CONFIG_FILENAME) path_merge_table_file = _write_temp( MERGE_TABLE_LINES, dirpath=dirpath, fname=MERGE_TABLE_FILENAME @@ -390,7 +510,7 @@ def request_class_attribute(req, attr): -def _create(request, data_type): +def _create(request, data_type, **kwargs): """ Create instance of desired type, using file in request class. 
@@ -403,7 +523,7 @@ def _create(request, data_type): _LOGGER.debug("Using %s as source of data to build %s", data_source, data_type.__class__.__name__) try: - return data_type(data_source) + return data_type(data_source, **kwargs) except EmptyDataError: with open(data_source, 'r') as datafile: _LOGGER.error("File contents:\n{}".format(datafile.readlines())) @@ -421,7 +541,9 @@ def proj(request): :return looper.models.Project: object created by parsing data in file pointed to by `request` class """ - return _create(request, Project) + p = _create(request, Project) + p.finalize_pipelines_directory() + return p diff --git a/tests/models/conftest.py b/tests/models/conftest.py index 410c6303..3882d052 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -1,8 +1,19 @@ -""" Models' tests' configuration. """ +""" Configuration for modules with independent tests of models. """ from collections import OrderedDict -import pytest +import copy +import os +import sys +if sys.version_info < (3, 3): + from collections import Iterable, Mapping +else: + from collections.abc import Iterable, Mapping + import pandas as pd +import pytest +import yaml + +from looper.models import DEFAULT_COMPUTE_RESOURCES_NAME, SAMPLE_NAME_COLNAME __author__ = "Vince Reuter" @@ -31,6 +42,146 @@ submission_command: sh """ +BASIC_PROTOMAP = {"ATAC": "ATACSeq.py"} + +# Compute resource bundles for pipeline interface configuration data +DEFAULT_RESOURCES = {"file_size": 0, "cores": 1, "mem": 8000, + "time": "0-01:00:00", "partition": "local"} +MIDSIZE_RESOURCES = {"file_size": 10, "cores": 8, "mem": 16000, + "time": "0-07:00:00", "partition": "serial"} +HUGE_RESOURCES = {"file_size": 30, "cores": 24, "mem": 64000, + "time": "30-00:00:00", "partition": "longq"} + + + +def pytest_generate_tests(metafunc): + """ Conditional customization of test cases in this directory. """ + try: + classname = metafunc.cls.__name__ + except AttributeError: + # Some functions don't belong to a class. + pass + else: + if classname == "ConstructorPathParsingTests": + # Provide test case with two PipelineInterface config bundles. + metafunc.parametrize( + argnames="config_bundles", + argvalues=[(atacseq_iface_without_resources(), + {"name": "sans-path"})]) + + + +@pytest.fixture(scope="function") +def atacseq_iface_without_resources(): + """ + Provide the ATAC-Seq pipeline interface as a fixture, without resources. + + Note that this represents the configuration data for the interface for a + single pipeline. In order to use this in the form that a PipelineInterface + expects, this needs to be the value to which a key is mapped within a + larger Mapping. + + :return Mapping: all of the pipeline interface configuration data for + ATAC-Seq, minus the resources section + """ + return { + "name": "ATACseq", + "looper_args": True, + "required_input_files": ["read1", "read2"], + "all_input_files": ["read1", "read2"], + "ngs_input_files": ["read1", "read2"], + "arguments": { + "--sample-name": "sample_name", + "--genome": "genome", + "--input": "read1", + "--input2": "read2", + "--single-or-paired": "read_type" + }, + "optional_arguments": { + "--frip-ref-peaks": "FRIP_ref", + "--prealignments": "prealignments", + "--genome-size": "macs_genome_size" + } + } + + + +@pytest.fixture(scope="function") +def atac_pipe_name(): + """ Oft-used as filename for pipeline module and PipelineInterface key. 
""" + return "ATACSeq.py" + + + +@pytest.fixture(scope="function") +def atacseq_iface_with_resources( + atacseq_iface_without_resources, resources): + """ + + :param dict atacseq_iface_without_resources: PipelineInterface config + data, minus a resources section + :param Mapping resources: resources section of PipelineInterface + configuration data + :return Mapping: pipeline interface data for ATAC-Seq pipeline, with all + of the base sections plus resources section + """ + iface_data = copy.deepcopy(atacseq_iface_without_resources) + iface_data["resources"] = copy.deepcopy(resources) + return iface_data + + + +@pytest.fixture(scope="function") +def atacseq_piface_data(atacseq_iface_with_resources, atac_pipe_name): + """ + Provide a test case with data for an ATACSeq PipelineInterface. + + :param str atac_pipe_name: name/key for the pipeline to which the + interface data pertains + :return dict: configuration data needed to create PipelineInterface + """ + return {atac_pipe_name: copy.deepcopy(atacseq_iface_with_resources)} + + + +@pytest.fixture(scope="function") +def basic_data_raw(): + return copy.deepcopy({ + "AttributeDict": {}, "ProtocolMapper": BASIC_PROTOMAP, + "Sample": {SAMPLE_NAME_COLNAME: "arbitrary-sample"}}) + + + +@pytest.fixture(scope="function") +def basic_instance_data(request, instance_raw_data): + """ + Transform the raw data for a basic model instance to comply with its ctor. + + :param pytest._pytest.fixtures.SubRequest request: test case requesting + the basic instance data + :param Mapping instance_raw_data: the raw data needed to create a + model instance + :return object: basic instance data in a form accepted by its constructor + """ + # Cleanup is free with _write_config, using request's temp folder. + transformation_by_class = { + "AttributeDict": lambda data: data, + "PipelineInterface": lambda data: + _write_config(data, request, "pipeline_interface.yaml"), + "ProtocolInterface": lambda data: + _write_config(data, request, "pipeline_interface.yaml"), + "ProtocolMapper": lambda data: data, + "Sample": lambda data: pd.Series(data)} + which_class = request.getfixturevalue("class_name") + return transformation_by_class[which_class](instance_raw_data) + + + +@pytest.fixture(scope="function") +def default_resources(): + """ Provide test case with default PipelineInterface resources section. """ + return copy.deepcopy(DEFAULT_RESOURCES) + @pytest.fixture(scope="function") @@ -42,6 +193,35 @@ def env_config_filepath(tmpdir): +@pytest.fixture(scope="function") +def huge_resources(): + """ Provide non-default resources spec. section for PipelineInterface. """ + return copy.deepcopy(HUGE_RESOURCES) + + + +@pytest.fixture(scope="function") +def instance_raw_data(request, basic_data_raw, atacseq_piface_data): + """ Supply the raw data for a basic model instance as a fixture. """ + which_class = request.getfixturevalue("class_name") + if which_class == "PipelineInterface": + return copy.deepcopy(atacseq_piface_data) + elif which_class == "ProtocolInterface": + return {"protocol_mapping": + copy.deepcopy(basic_data_raw["ProtocolMapper"]), + "pipelines": copy.deepcopy(atacseq_piface_data)} + else: + return copy.deepcopy(basic_data_raw[which_class]) + + + +@pytest.fixture(scope="function") +def midsize_resources(): + """ Provide non-default resources spec. section for PipelineInterface. """ + return copy.deepcopy(MIDSIZE_RESOURCES) + + + @pytest.fixture(scope="function") def minimal_project_conf_path(tmpdir): """ Write minimal sample annotations and project configuration. 
""" @@ -55,3 +235,88 @@ def minimal_project_conf_path(tmpdir): "metadata:\n sample_annotation: {}".format(anns_file) conf_file.write(config_lines) return conf_file.strpath + + + +@pytest.fixture(scope="function") +def path_proj_conf_file(tmpdir, proj_conf): + """ Write basic project configuration data and provide filepath. """ + conf_path = os.path.join(tmpdir.strpath, "project_config.yaml") + with open(conf_path, 'w') as conf: + yaml.safe_dump(proj_conf, conf) + return conf_path + + + +@pytest.fixture(scope="function") +def path_anns_file(request, tmpdir, sample_sheet): + """ Write basic annotations, optionally using a different delimiter. """ + filepath = os.path.join(tmpdir.strpath, "annotations.csv") + if "delimiter" in request.fixturenames: + delimiter = request.getfixturevalue("delimiter") + else: + delimiter = "," + with open(filepath, 'w') as anns_file: + sample_sheet.to_csv(anns_file, sep=delimiter, index=False) + return filepath + + + +@pytest.fixture(scope="function") +def piface_config_bundles(request, resources): + """ + Provide the ATAC-Seq pipeline interface as a fixture, including resources. + + Note that this represents the configuration data for the interface for a + single pipeline. In order to use this in the form that a PipelineInterface + expects, this needs to be the value to which a key is mapped within a + larger Mapping. + + :param pytest._pytest.fixtures.SubRequest request: hook into test case + requesting this fixture, which is queried for a resources value with + which to override the default if it's present. + :param Mapping resources: pipeline interface resource specification + :return Iterable[Mapping]: collection of bundles of pipeline interface + configuration bundles + """ + iface_config_datas = request.getfixturevalue("config_bundles") + if isinstance(iface_config_datas, Mapping): + data_bundles = iface_config_datas.values() + elif isinstance(iface_config_datas, Iterable): + data_bundles = iface_config_datas + else: + raise TypeError("Expected mapping or list collection of " + "PipelineInterface data: {} ({})".format( + iface_config_datas, type(iface_config_datas))) + resource_specification = request.getfixturevalue("resources") \ + if "resources" in request.fixturenames else resources + for config_bundle in data_bundles: + config_bundle.update(resource_specification) + return iface_config_datas + + + +@pytest.fixture(scope="function") +def resources(): + """ Basic PipelineInterface compute resources data. """ + return {DEFAULT_COMPUTE_RESOURCES_NAME: copy.deepcopy(DEFAULT_RESOURCES), + "huge": copy.copy(HUGE_RESOURCES)} + + + +def _write_config(data, request, filename): + """ + Write configuration data to file. + + :param str Sequence | Mapping data: data to write to file, YAML compliant + :param pytest._pytest.fixtures.SubRequest request: test case that + requested a fixture from which this function was called + :param str filename: name for the file to write + :return str: full path to the file written + """ + # We get cleanup for free by writing to file in requests temp folder. 
+ dirpath = request.getfixturevalue("tmpdir").strpath + filepath = os.path.join(dirpath, filename) + with open(filepath, 'w') as conf_file: + yaml.safe_dump(data, conf_file) + return filepath diff --git a/tests/models/independent/test_AttributeDict.py b/tests/models/independent/test_AttributeDict.py index 6d63430e..959447d5 100644 --- a/tests/models/independent/test_AttributeDict.py +++ b/tests/models/independent/test_AttributeDict.py @@ -162,8 +162,6 @@ class AttributeDictUpdateTests: """ - # TODO: ensure that we cover tests cases for both merged and non-merged. - _TOTALLY_ARBITRARY_VALUES = [ "abc", 123, (4, "text", ("nes", "ted")), list("-101") @@ -562,30 +560,6 @@ def test_all_defaults_no_metadata(self, tmpdir, proj, metadata_attribute): lines, _ = self._yaml_data(sample, filepath) assert all([metadata_attribute not in line for line in lines]) - - @pytest.mark.parametrize( - argnames="metadata_attribute", argvalues=ATTRDICT_METADATA.keys(), - ids=lambda attr_name: " metadata item = {} ".format(attr_name)) - def test_non_defaults_have_metadata( - self, tmpdir, proj, metadata_attribute): - """ Only non-default metadata elements are written to file. """ - for i, sample in enumerate(proj.samples): - filepath = os.path.join(tmpdir.strpath, "sample{}.yaml".format(i)) - - # Flip the value of an attribute in the project section. - newval = not ATTRDICT_METADATA[metadata_attribute] - lines, data = self._yaml_data( - sample, filepath, section_to_change="prj", - attr_to_change=metadata_attribute, newval=newval) - - # Is the test sensitive? - assert newval == data["prj"][metadata_attribute] - # How about specific? - num_meta_lines = sum(1 if any( - [meta_item in line for meta_item - in ATTRDICT_METADATA.keys()]) else 0 for line in lines) - assert 1 == num_meta_lines - @staticmethod def _yaml_data(sample, filepath, section_to_change=None, diff --git a/tests/models/independent/test_PipelineInterface.py b/tests/models/independent/test_PipelineInterface.py index 8da94069..c23350fa 100644 --- a/tests/models/independent/test_PipelineInterface.py +++ b/tests/models/independent/test_PipelineInterface.py @@ -1,14 +1,18 @@ """ Tests for PipelineInterface ADT. """ import copy +import inspect import itertools +import logging +import os import random +import mock import pytest import yaml from looper.models import \ - PipelineInterface, _InvalidResourceSpecificationException, \ + PipelineInterface, Sample, _InvalidResourceSpecificationException, \ _MissingPipelineConfigurationException, DEFAULT_COMPUTE_RESOURCES_NAME @@ -16,19 +20,13 @@ __email__ = "vreuter@virginia.edu" +_LOGGER = logging.getLogger(__name__) + + # Values with which to build pipeline interface keys and names PIPELINE_NAMES = ["ATACseq", "WGBS"] EXTENSIONS = [".py", ".sh", ".R"] -# Compute resource bundles for pipeline interface configuration data -DEFAULT_RESOURCES = {"file_size": 0, "cores": 1, "mem": 8000, - "time": "0-01:00:00", "partition": "local"} -MIDSIZE_RESOURCES = {"file_size": 10, "cores": 8, "mem": 16000, - "time": "0-07:00:00", "partition": "serial"} -HUGE_RESOURCES = {"file_size": 30, "cores": 24, "mem": 64000, - "time": "30-00:00:00", "partition": "longq"} -HUGE_RESOURCES_NAME = "huge" - def pytest_generate_tests(metafunc): @@ -36,6 +34,8 @@ def pytest_generate_tests(metafunc): try: parameters = metafunc.cls.PARAMETERS except AttributeError: + _LOGGER.debug("No indirect parameterization for test class: '{}'". 
+ format(metafunc.cls)) pass else: for name, values in parameters.items(): @@ -53,29 +53,6 @@ def basic_pipe_iface_data(request): -@pytest.fixture(scope="function") -def resources(): - """ Basic PipelineInterface compute resources data. """ - return {DEFAULT_COMPUTE_RESOURCES_NAME: copy.deepcopy(DEFAULT_RESOURCES), - "huge": copy.copy(HUGE_RESOURCES)} - - - -@pytest.mark.parametrize(argnames="from_file", argvalues=[False, True]) -def test_constructor_input_types(tmpdir, from_file, basic_pipe_iface_data): - """ PipelineInterface constructor handles Mapping or filepath. """ - if from_file: - pipe_iface_config = tmpdir.join("pipe-iface-conf.yaml").strpath - with open(tmpdir.join("pipe-iface-conf.yaml").strpath, 'w') as f: - yaml.safe_dump(basic_pipe_iface_data, f) - else: - pipe_iface_config = basic_pipe_iface_data - pi = PipelineInterface(pipe_iface_config) - assert basic_pipe_iface_data == pi.pipe_iface_config - assert pi.pipe_iface_file == (pipe_iface_config if from_file else None) - - - @pytest.fixture(scope="function") def pi_with_resources(request, basic_pipe_iface_data, resources): """ Add resource bundle data to each config section. """ @@ -95,10 +72,27 @@ def pi_with_resources(request, basic_pipe_iface_data, resources): +@pytest.mark.parametrize(argnames="from_file", argvalues=[False, True]) +def test_constructor_input_types(tmpdir, from_file, basic_pipe_iface_data): + """ PipelineInterface constructor handles Mapping or filepath. """ + if from_file: + pipe_iface_config = tmpdir.join("pipe-iface-conf.yaml").strpath + with open(tmpdir.join("pipe-iface-conf.yaml").strpath, 'w') as f: + yaml.safe_dump(basic_pipe_iface_data, f) + else: + pipe_iface_config = basic_pipe_iface_data + pi = PipelineInterface(pipe_iface_config) + assert basic_pipe_iface_data == pi.pipe_iface_config + assert pi.pipe_iface_file == (pipe_iface_config if from_file else None) + + + @pytest.mark.parametrize( argnames="funcname_and_kwargs", argvalues=[("choose_resource_package", {"file_size": 4}), - ("get_arg_string", {"sample": "arbitrary-sample-name"}), + ("get_arg_string", + {"sample": Sample( + {"sample_name": "arbitrary-sample-name"})}), ("get_attribute", {"attribute_key": "irrelevant-attr-name"}), ("get_pipeline_name", {}), @@ -115,9 +109,17 @@ def test_unconfigured_pipeline_exception( except KeyError: # Already no default resource package. pass + + # Each of the functions being tested should take pipeline_name arg, + # and we want to test behavior for the call on an unknown pipeline. 
funcname, kwargs = funcname_and_kwargs + func = getattr(pi, funcname) + required_parameters = inspect.getargspec(func).args + for parameter in ["pipeline_name", "pipeline"]: + if parameter in required_parameters and parameter not in kwargs: + kwargs[parameter] = "missing-pipeline" with pytest.raises(_MissingPipelineConfigurationException): - getattr(pi, funcname).__call__("missing-pipeline", **kwargs) + func.__call__(**kwargs) @@ -147,7 +149,8 @@ def test_get_pipeline_name_inferred(self): pipelines = [name + ext for name, ext in zip(pipeline_names, extensions)] pi_config_data = {pipeline: None for pipeline in pipelines} - pi = PipelineInterface(pi_config_data) + with mock.patch("looper.models.PipelineInterface._expand_paths"): + pi = PipelineInterface(pi_config_data) for expected_name, pipeline in zip(pipeline_names, pipelines): assert expected_name == pi.get_pipeline_name(pipeline) @@ -160,7 +163,7 @@ class PipelineInterfaceResourcePackageTests: def test_requires_default( - self, use_new_file_size, pi_with_resources): + self, use_new_file_size, pi_with_resources, huge_resources): """ If provided, resources specification needs 'default.' """ pi = pi_with_resources for name, pipeline in pi: @@ -172,7 +175,7 @@ def test_requires_default( assert "default" not in pipeline["resources"] with pytest.raises(_InvalidResourceSpecificationException): pi.choose_resource_package( - name, file_size=HUGE_RESOURCES["file_size"] + 1) + name, file_size=huge_resources["file_size"] + 1) def test_negative_file_size_request( @@ -204,11 +207,11 @@ def test_resources_not_required( (16, "midsize"), (64, "huge")]) def test_selects_proper_resource_package( self, use_new_file_size, pi_with_resources, - file_size, expected_package_name): + file_size, expected_package_name, midsize_resources): """ Minimal resource package sufficient for pipeline and file size. """ for pipe_data in pi_with_resources.pipelines: pipe_data["resources"].update( - {"midsize": copy.deepcopy(MIDSIZE_RESOURCES)}) + {"midsize": copy.deepcopy(midsize_resources)}) for pipe_name, pipe_data in pi_with_resources: observed_package = pi_with_resources.choose_resource_package( pipe_name, file_size) @@ -232,7 +235,8 @@ def test_negative_file_size_prohibited( def test_file_size_spec_not_required_for_default( - self, use_new_file_size, basic_pipe_iface_data): + self, use_new_file_size, basic_pipe_iface_data, + default_resources, huge_resources, midsize_resources): """ Default package implies minimum file size of zero. """ def clear_file_size(resource_package): @@ -244,7 +248,7 @@ def clear_file_size(resource_package): resources_data = dict(zip( ["default", "midsize", "huge"], [copy.deepcopy(data) for data in - [DEFAULT_RESOURCES, MIDSIZE_RESOURCES, HUGE_RESOURCES]])) + [default_resources, midsize_resources, huge_resources]])) for pack_name, pack_data in resources_data.items(): # Use file size spec name as appropriate; clean default package. if pack_name == "default": @@ -301,13 +305,14 @@ def test_default_package_new_name_zero_size( def test_file_size_spec_required_for_non_default_packages( - self, use_new_file_size, basic_pipe_iface_data): + self, use_new_file_size, basic_pipe_iface_data, + default_resources, huge_resources): """ Resource packages besides default require file size. """ # Establish the resource specification. 
resource_package_data = { - "default": copy.deepcopy(DEFAULT_RESOURCES), - "huge": copy.deepcopy(HUGE_RESOURCES)} + "default": copy.deepcopy(default_resources), + "huge": copy.deepcopy(huge_resources)} # Remove file size for non-default; set it for default. del resource_package_data["huge"]["file_size"] @@ -330,6 +335,119 @@ def test_file_size_spec_required_for_non_default_packages( +class ConstructorPathParsingTests: + """ The constructor is responsible for expanding pipeline path(s). """ + + ADD_PATH = [True, False] + PIPELINE_KEYS = ["ATACSeq.py", "no_path.py"] + RELATIVE_PATH_DATA = [ + ("./arbitrary-test-pipelines", + {}, + "./arbitrary-test-pipelines"), + ("path/to/$TEMP_PIPE_LOCS", + {"TEMP_PIPE_LOCS": "validation-value"}, + "path/to/validation-value")] + ABSOLUTE_PATHS = [ + os.path.join("~", "code_home", "bioinformatics"), + os.path.join("$TEMP_TEST_HOME", "subfolder"), + os.path.join("~", "$TEMPORARY_SUBFOLDER", "leaf")] + ABSPATH_ENVVARS = {"TEMP_TEST_HOME": "tmptest-home-folder", + "TEMPORARY_SUBFOLDER": "temp-subfolder"} + EXPECTED_PATHS_ABSOLUTE = [ + os.path.join(os.path.expanduser("~"), "code_home", + "bioinformatics"), + os.path.join("tmptest-home-folder", "subfolder"), + os.path.join(os.path.expanduser("~"), "temp-subfolder", "leaf")] + + + @pytest.fixture(scope="function") + def pipe_iface_data(self, piface_config_bundles): + return dict(zip(self.PIPELINE_KEYS, piface_config_bundles)) + + + @pytest.fixture(scope="function", autouse=True) + def apply_envvars(self, request): + """ Use environment variables temporarily. """ + + if "envvars" not in request.fixturenames: + # We're autousing, so check for the relevant fixture. + return + + original_envvars = {} + new_envvars = request.getfixturevalue("envvars") + + # Remember values that are replaced as variables are updated. + for name, value in new_envvars.items(): + try: + original_envvars[name] = os.environ[name] + except KeyError: + pass + os.environ[name] = value + + def restore(): + # Restore swapped variables and delete added ones. + for k, v in new_envvars.items(): + try: + os.environ[k] = original_envvars[k] + except KeyError: + del os.environ[k] + request.addfinalizer(restore) + + + def test_no_path(self, config_bundles, piface_config_bundles, + pipe_iface_data): + """ PipelineInterface config sections need not specify path. """ + pi = PipelineInterface(pipe_iface_data) + for pipe_key in self.PIPELINE_KEYS: + piface_config = pi[pipe_key] + # Specific negative test of interest. + assert "path" not in piface_config + # Positive control validation. + assert pipe_iface_data[pipe_key] == piface_config + + + @pytest.mark.parametrize( + argnames=["pipe_path", "envvars", "expected"], + argvalues=RELATIVE_PATH_DATA) + def test_relative_path( + self, config_bundles, piface_config_bundles, pipe_iface_data, + pipe_path, envvars, expected, apply_envvars): + """ + PipelineInterface construction expands pipeline path. + + Environment variable(s) expand(s), but the path remains relative + if specified as such, deferring the joining with pipelines location, + which makes the path absolute, until the path is actually used. 
+ + """ + for add_path, pipe_key in zip(self.ADD_PATH, self.PIPELINE_KEYS): + if add_path: + pipe_iface_data[pipe_key]["path"] = pipe_path + pi = PipelineInterface(pipe_iface_data) + for add_path, pipe_key in zip(self.ADD_PATH, self.PIPELINE_KEYS): + if add_path: + assert expected == pi[pipe_key]["path"] + else: + assert "path" not in pi[pipe_key] + + + @pytest.mark.parametrize( + argnames=["pipe_path", "envvars", "expected"], + argvalues=zip(ABSOLUTE_PATHS, + len(ABSOLUTE_PATHS) * [ABSPATH_ENVVARS], + EXPECTED_PATHS_ABSOLUTE)) + def test_path_expansion( + self, pipe_path, envvars, expected, + config_bundles, piface_config_bundles, pipe_iface_data): + """ User/environment variables are expanded. """ + for piface_data in pipe_iface_data.values(): + piface_data["path"] = pipe_path + pi = PipelineInterface(pipe_iface_data) + for _, piface_data in pi: + assert expected == piface_data["path"] + + + @pytest.mark.skip("Not implemented") class PipelineInterfaceArgstringTests: """ """ diff --git a/tests/models/independent/test_Project.py b/tests/models/independent/test_Project.py index e28c992c..3683a988 100644 --- a/tests/models/independent/test_Project.py +++ b/tests/models/independent/test_Project.py @@ -8,7 +8,7 @@ import yaml import looper from looper.models import \ - AttributeDict, Project, \ + AttributeDict, Project, Sample, \ _MissingMetadataException, SAMPLE_ANNOTATIONS_KEY @@ -23,7 +23,7 @@ def project_config_data(): "metadata": { SAMPLE_ANNOTATIONS_KEY: "sample-anns-filler.csv", "output_dir": "$HOME/sequencing/output", - "pipelines_dir": "${CODE}/pipelines"}, + "pipeline_interfaces": "${CODE}/pipelines"}, "data_sources": {"arbitrary": "placeholder/data/{filename}"}, "genomes": {"human": "hg19", "mouse": "mm10"}, "transcriptomes": {"human": "hg19_cdna", "mouse": "mm10_cdna"}} @@ -42,6 +42,105 @@ def pytest_generate_tests(metafunc): +class ProjectConstructorTests: + """ Tests of Project constructor, particularly behavioral details. """ + + + def test_no_samples(self, path_empty_project): + """ Lack of Samples is unproblematic. """ + p = Project(path_empty_project) + assert 0 == p.num_samples + assert [] == list(p.samples) + + + + @pytest.mark.parametrize( + argnames="spec_type", argvalues=["as_null", "missing"], + ids=lambda spec: "spec_type={}".format(spec)) + @pytest.mark.parametrize( + argnames="lazy", argvalues=[False, True], + ids=lambda lazy: "lazy={}".format(lazy)) + def test_no_merge_table_in_config( + self, tmpdir, spec_type, lazy, proj_conf_data, path_sample_anns): + """ Merge table attribute remains null if config lacks merge_table. """ + metadata = proj_conf_data["metadata"] + try: + assert "merge_table" in metadata + except AssertionError: + print("Project metadata section lacks 'merge_table'") + print("All config data: {}".format(proj_conf_data)) + print("Config metadata section: {}".format(metadata)) + raise + if spec_type == "as_null": + metadata["merge_table"] = None + elif spec_type == "missing": + del metadata["merge_table"] + else: + raise ValueError("Unknown way to specify no merge table: {}". + format(spec_type)) + path_config_file = os.path.join(tmpdir.strpath, "project_config.yaml") + with open(path_config_file, 'w') as conf_file: + yaml.safe_dump(proj_conf_data, conf_file) + p = Project(path_config_file, defer_sample_construction=lazy) + assert p.merge_table is None + + + @pytest.mark.skip("Not implemented") + def test_merge_table_construction( + self, tmpdir, project_config_data): + """ Merge table is constructed iff samples are constructed. 
""" + # TODO: implement + pass + + + def test_counting_samples_doesnt_create_samples( + self, sample_annotation_lines, + path_project_conf, path_sample_anns): + """ User can ask about sample count without creating samples. """ + # We're not parameterized in terms of Sample creation laziness here + # because a piece of the test's essence is Sample collection absence. + p = Project(path_project_conf, defer_sample_construction=True) + assert p._samples is None + expected_sample_count = sum(1 for _ in sample_annotation_lines) - 1 + assert expected_sample_count == p.num_samples + assert p._samples is None + + + @pytest.mark.parametrize(argnames="lazy", argvalues=[False, True]) + def test_sample_creation_laziness( + self, path_project_conf, path_sample_anns, lazy): + """ Project offers control over whether to create base Sample(s). """ + + p = Project(path_project_conf, defer_sample_construction=lazy) + + if lazy: + # Samples should remain null during lazy Project construction. + assert p._samples is None + + else: + # Eager Project construction builds Sample objects. + assert p._samples is not None + with open(path_sample_anns, 'r') as anns_file: + anns_file_lines = anns_file.readlines() + + # Sum excludes the header line. + num_samples_expected = sum(1 for l in anns_file_lines[1:] if l) + assert num_samples_expected == len(p._samples) + assert all([Sample == type(s) for s in p._samples]) + + + @pytest.mark.parametrize(argnames="lazy", argvalues=[False, True]) + def test_sample_name_availability( + self, path_project_conf, path_sample_anns, lazy): + """ Sample names always available on Project. """ + with open(path_sample_anns, 'r') as anns_file: + expected_sample_names = \ + [l.split(",")[0] for l in anns_file.readlines()[1:] if l] + p = Project(path_project_conf, defer_sample_construction=lazy) + assert expected_sample_names == list(p.sample_names) + + + class ProjectRequirementsTests: """ Tests for a Project's set of requirements. """ @@ -220,7 +319,7 @@ def _assert_null_compute_environment(project): @staticmethod def default_compute_settings(project): - settings_filepath = project.default_cmpenv_file + settings_filepath = project.default_compute_envfile with open(settings_filepath, 'r') as settings_data_file: settings = yaml.safe_load(settings_data_file) return {"environment": copy.deepcopy(settings), @@ -276,7 +375,7 @@ def create_project( # Write the config and build the Project. conf_file_path = _write_project_config( project_config_data, dirpath=dirpath) - with mock.patch("looper.models.Project.add_sample_sheet"): + with mock.patch("looper.models.check_sheet"): project = Project(conf_file_path, default_compute=default_env_path) return expected_derived_columns, project @@ -489,7 +588,7 @@ def observed_argstring_elements( conf_file_path = _write_project_config(confdata, dirpath=confpath) # Subvert requirement for sample annotations file. - with mock.patch("looper.models.Project.add_sample_sheet"): + with mock.patch("looper.models.check_sheet"): project = Project(conf_file_path, default_compute=envpath) argstring = project.get_arg_string(pipeline) @@ -545,6 +644,7 @@ def _parse_flags_and_options(command_elements): return parsed_command_elements + def _write_project_config(config_data, dirpath, filename="proj-conf.yaml"): """ Write the configuration file for a Project. 
@@ -574,8 +674,6 @@ def _env_paths_to_names(envs): """ reduced = {} for env_name, env_data in envs.items(): - # DEBUG - print(env_name) reduced[env_name] = _compute_paths_to_names(env_data) return reduced @@ -594,15 +692,5 @@ def _compute_paths_to_names(env): """ reduced = copy.deepcopy(env) for pathvar in ["submission_template"]: - - # DEBUG - try: - _, reduced[pathvar] = os.path.split(reduced[pathvar]) - except KeyError: - print("REDUCED: {}".format(reduced)) - print("ENV: {}".format(env)) - print("KEYS: {}".format(reduced.keys())) - print("ENV KEYS: {}".format(env.keys())) - raise - + _, reduced[pathvar] = os.path.split(reduced[pathvar]) return reduced diff --git a/tests/models/independent/test_ProtocolInterface.py b/tests/models/independent/test_ProtocolInterface.py new file mode 100644 index 00000000..4c673dbb --- /dev/null +++ b/tests/models/independent/test_ProtocolInterface.py @@ -0,0 +1,690 @@ +""" Tests for ProtocolInterface, for Project/PipelineInterface interaction. """ + +import inspect +import itertools +import logging +import os +import sys +if sys.version_info < (3, ): + import __builtin__ as builtins +else: + import builtins + +import mock +import pytest +import yaml + +from looper import models, DEV_LOGGING_FMT +from looper.models import ProtocolInterface, Sample + + +__author__ = "Vince Reuter" +__email__ = "vreuter@virginia.edu" + + +SUBTYPES_KEY = ProtocolInterface.SUBTYPE_MAPPING_SECTION +ATAC_PROTOCOL_NAME = "ATAC" +SAMPLE_IMPORT = "from looper.models import Sample" + + +class CustomExceptionA(Exception): + def __init__(self, *args): + super(CustomExceptionA, self).__init__(*args) + +class CustomExceptionB(Exception): + def __init__(self, *args): + super(CustomExceptionB, self).__init__(*args) + +CUSTOM_EXCEPTIONS = [CustomExceptionA, CustomExceptionB] + + +# Test case parameterization, but here for import locality and +# to reduce clutter in the pararmeterization declaration. +_, BUILTIN_EXCEPTIONS_WITHOUT_REQUIRED_ARGUMENTS = \ + list(map(list, zip(*inspect.getmembers( + builtins, lambda o: inspect.isclass(o) and + issubclass(o, BaseException) and + not issubclass(o, UnicodeError))))) + + +def pytest_generate_tests(metafunc): + """ Customization of this module's test cases. """ + if "subtypes_section_spec_type" in metafunc.fixturenames: + # Subtypes section can be raw string or mapping. + metafunc.parametrize(argnames="subtypes_section_spec_type", + argvalues=[str, dict]) + + + +@pytest.fixture(scope="function") +def path_config_file(request, tmpdir, atac_pipe_name): + """ + Write PipelineInterface configuration data to disk. + + Grab the data from the test case's appropriate fixture. 
Also check the + test case parameterization for pipeline path specification, adding it to + the configuration data before writing to disk if the path specification is + present + + :param pytest._pytest.fixtures.SubRequest request: test case requesting + this fixture + :param py.path.local.LocalPath tmpdir: temporary directory fixture + :param str atac_pipe_name: name/key for ATAC-Seq pipeline; this should + also be used by the requesting test case if a path is to be added; + separating the name from the folder path allows parameterization of + the test case in terms of folder path, with pipeline name appended + after the fact (that is, the name fixture can't be used in the ) + :return str: path to the configuration file written + """ + conf_data = request.getfixturevalue("atacseq_piface_data") + if "pipe_path" in request.fixturenames: + pipeline_dirpath = request.getfixturevalue("pipe_path") + pipe_path = os.path.join(pipeline_dirpath, atac_pipe_name) + # Pipeline key/name is mapped to the interface data; insert path in + # that Mapping, not at the top level, in which name/key is mapped to + # interface data bundle. + for iface_bundle in conf_data.values(): + iface_bundle["path"] = pipe_path + return _write_config_data(protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=conf_data, dirpath=tmpdir.strpath) + + + +class PipelinePathResolutionTests: + """ Project requests pipeline information via an interface key. """ + + + def test_no_path(self, atacseq_piface_data, + path_config_file, atac_pipe_name): + """ Without explicit path, pipeline is assumed parallel to config. """ + + piface = ProtocolInterface(path_config_file) + + # The pipeline is assumed to live alongside its configuration file. + config_dirpath = os.path.dirname(path_config_file) + expected_pipe_path = os.path.join(config_dirpath, atac_pipe_name) + + _, full_pipe_path, _ = \ + piface.finalize_pipeline_key_and_paths(atac_pipe_name) + assert expected_pipe_path == full_pipe_path + + + def test_relpath_with_dot_becomes_absolute( + self, tmpdir, atac_pipe_name, atacseq_piface_data): + """ Leading dot drops from relative path, and it's made absolute. """ + path_parts = ["relpath", "to", "pipelines", atac_pipe_name] + sans_dot_path = os.path.join(*path_parts) + pipe_path = os.path.join(".", sans_dot_path) + atacseq_piface_data[atac_pipe_name]["path"] = pipe_path + + exp_path = os.path.join(tmpdir.strpath, sans_dot_path) + + path_config_file = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(path_config_file) + _, obs_path, _ = piface.finalize_pipeline_key_and_paths(atac_pipe_name) + # Dot may remain in path, so assert equality of absolute paths. + assert os.path.abspath(exp_path) == os.path.abspath(obs_path) + + + @pytest.mark.parametrize( + argnames="pipe_path", argvalues=["relative/pipelines/path"]) + def test_non_dot_relpath_becomes_absolute( + self, atacseq_piface_data, path_config_file, + tmpdir, pipe_path, atac_pipe_name): + """ Relative pipeline path is made absolute when requested by key. 
""" + # TODO: constant-ify "path" and "ATACSeq.py", as well as possibly "pipelines" + # and "protocol_mapping" section names of PipelineInterface + exp_path = os.path.join( + tmpdir.strpath, pipe_path, atac_pipe_name) + piface = ProtocolInterface(path_config_file) + _, obs_path, _ = piface.finalize_pipeline_key_and_paths(atac_pipe_name) + assert exp_path == obs_path + + + @pytest.mark.parametrize( + argnames=["pipe_path", "expected_path_base"], + argvalues=[(os.path.join("$HOME", "code-base-home", "biopipes"), + os.path.join(os.path.expandvars("$HOME"), + "code-base-home", "biopipes")), + (os.path.join("~", "bioinformatics-pipelines"), + os.path.join(os.path.expanduser("~"), + "bioinformatics-pipelines"))]) + def test_absolute_path( + self, atacseq_piface_data, path_config_file, tmpdir, pipe_path, + expected_path_base, atac_pipe_name): + """ Absolute path regardless of variables works as pipeline path. """ + exp_path = os.path.join( + tmpdir.strpath, expected_path_base, atac_pipe_name) + piface = ProtocolInterface(path_config_file) + _, obs_path, _ = piface.finalize_pipeline_key_and_paths(atac_pipe_name) + assert exp_path == obs_path + + + @pytest.mark.xfail( + condition=models._LOGGER.getEffectiveLevel() < logging.WARN, + reason="Insufficient logging level to capture warning message: {}". + format(models._LOGGER.getEffectiveLevel())) + @pytest.mark.parametrize( + argnames="pipe_path", + argvalues=["nonexistent.py", "path/to/missing.py", + "/abs/path/to/mythical"]) + def test_warns_about_nonexistent_pipeline_script_path( + self, atacseq_piface_data, path_config_file, + tmpdir, pipe_path, atac_pipe_name): + """ Nonexistent, resolved pipeline script path generates warning. """ + name_log_file = "temp-test-log.txt" + path_log_file = os.path.join(tmpdir.strpath, name_log_file) + temp_hdlr = logging.FileHandler(path_log_file, mode='w') + fmt = logging.Formatter(DEV_LOGGING_FMT) + temp_hdlr.setFormatter(fmt) + temp_hdlr.setLevel(logging.WARN) + models._LOGGER.handlers.append(temp_hdlr) + pi = ProtocolInterface(path_config_file) + pi.finalize_pipeline_key_and_paths(atac_pipe_name) + with open(path_log_file, 'r') as logfile: + loglines = logfile.readlines() + assert 1 == len(loglines) + logmsg = loglines[0] + assert "WARN" in logmsg and pipe_path in logmsg + + + +class SampleSubtypeTests: + """ ProtocolInterface attempts import of pipeline-specific Sample. """ + + # Basic cases + # 1 -- unmapped pipeline + # 2 -- subtypes section is single string + # 3 -- subtypes section is mapping () + # 4 -- subtypes section is missing (use single Sample subclass if there is one, base Sample for 0 or > 1 Sample subtypes defined) + # 5 -- subtypes section is null --> ALWAYS USE BASE SAMPLE (backdoor user side mechanism for making this be so) + + # Import trouble cases + # No __main__ + # Argument parsing + # missing import(s) + + # Subcases + # 2 -- single string + # 2a -- named class isn't defined in the module + # 2b -- named class is in module but isn't defined + # + + PROTOCOL_NAME_VARIANTS = [ + "ATAC-Seq", "ATACSeq", "ATACseq", "ATAC-seq", "ATAC", + "ATACSEQ", "ATAC-SEQ", "atac", "atacseq", "atac-seq"] + + + @pytest.mark.parametrize( + argnames="pipe_key", + argvalues=["{}.py".format(proto) for proto + in PROTOCOL_NAME_VARIANTS]) + @pytest.mark.parametrize( + argnames="protocol", + argvalues=PROTOCOL_NAME_VARIANTS) + def test_pipeline_key_match_is_strict( + self, tmpdir, pipe_key, protocol, atac_pipe_name, + atacseq_iface_with_resources): + """ Request for Sample subtype for unmapped pipeline is KeyError. 
""" + + # Create the ProtocolInterface. + strict_pipe_key = atac_pipe_name + protocol_mapping = {protocol: strict_pipe_key} + confpath = _write_config_data( + protomap=protocol_mapping, dirpath=tmpdir.strpath, + conf_data={strict_pipe_key: atacseq_iface_with_resources}) + piface = ProtocolInterface(confpath) + + # The absolute pipeline path is the pipeline name, joined to the + # ProtocolInterface's pipelines location. This location is the + # location from which a Sample subtype import is attempted. + full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # TODO: update to pytest.raises(None) if/when 3.1 adoption. + # Match between pipeline key specified and the strict key used in + # the mapping --> no error while mismatch --> error. + if pipe_key == atac_pipe_name: + piface.fetch_sample_subtype( + protocol, pipe_key, full_pipe_path=full_pipe_path) + else: + with pytest.raises(KeyError): + piface.fetch_sample_subtype( + protocol, pipe_key, full_pipe_path=full_pipe_path) + + + @pytest.mark.parametrize( + argnames=["mapped_protocol", "requested_protocol"], + argvalues=itertools.combinations(PROTOCOL_NAME_VARIANTS, 2)) + def test_protocol_match_is_fuzzy( + self, tmpdir, mapped_protocol, atac_pipe_name, + requested_protocol, atacseq_piface_data): + """ Punctuation and case mismatches are tolerated in protocol name. """ + + # Needed to create the ProtocolInterface. + protomap = {mapped_protocol: atac_pipe_name} + # Needed to invoke the function under test. + full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # PipelineInterface data provided maps name to actual interface data + # Mapping, so modify the ATAC-Seq mapping within that. + # In this test, we're interested in the resolution of the protocol + # name, that with it we can grab the name of a class. Thus, we + # need only an arbitrary class name about which we can make the + # relevant assertion(s). + test_class_name = "TotallyArbitrary" + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = \ + test_class_name + + # Write out configuration data and create the ProtocolInterface. + conf_path = _write_config_data( + protomap=protomap, conf_data=atacseq_piface_data, + dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + + # Make the call under test, patching the function protected + # function that's called iff the protocol name match succeeds. + with mock.patch("looper.models._import_sample_subtype", + return_value=None) as mocked_import: + # Return value is irrelevant; the effect of the protocol name + # match/resolution is entirely observable via the argument to the + # protected import function. + piface.fetch_sample_subtype( + protocol=requested_protocol, + strict_pipe_key=atac_pipe_name, + full_pipe_path=full_pipe_path) + # When the protocol name match/resolution succeeds, the name of the + # Sample subtype class to which it was mapped is passed as an + # argument to the protected import function. + mocked_import.assert_called_with(full_pipe_path, test_class_name) + + + + @pytest.mark.parametrize( + argnames="error_type", + argvalues=CUSTOM_EXCEPTIONS + + BUILTIN_EXCEPTIONS_WITHOUT_REQUIRED_ARGUMENTS) + def test_problematic_import_builtin_exception( + self, tmpdir, error_type, atac_pipe_name, atacseq_piface_data): + """ Base Sample is used if builtin exception on pipeline import. 
""" + + # Values needed for object creation and function invocation + protocol = ATAC_PROTOCOL_NAME + protocol_mapping = {protocol: atac_pipe_name} + full_pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # Modify the data for the ProtocolInterface and create it. + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = \ + {protocol: "IrrelevantClassname"} + conf_path = _write_config_data( + protomap=protocol_mapping, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + + # We want to test the effect of an encounter with an exception during + # the import attempt, so patch the relevant function with a function + # to raise the parameterized exception type. + with mock.patch( + "looper.utils.import_from_source", + side_effect=error_type()): + subtype = piface.fetch_sample_subtype( + protocol=protocol, strict_pipe_key=atac_pipe_name, + full_pipe_path=full_pipe_path) + # When the import hits an exception, the base Sample type is used. + assert subtype is Sample + + + @pytest.mark.parametrize( + argnames="num_sample_subclasses", argvalues=[0, 1, 2], + ids=lambda n_samples: + " num_sample_subclasses = {} ".format(n_samples)) + @pytest.mark.parametrize( + argnames="decoy_class", argvalues=[False, True], + ids=lambda decoy: " decoy_class = {} ".format(decoy)) + def test_no_subtypes_section( + self, tmpdir, path_config_file, atac_pipe_name, + num_sample_subclasses, decoy_class): + """ DEPENDS ON PIPELINE MODULE CONTENT """ + + # Basic values to invoke the function under test + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + piface = ProtocolInterface(path_config_file) + + # How to define the Sample subtypes (and non-subtype) + sample_subclass_basename = "SampleSubclass" + sample_lines = [ + "class {basename}{index}(Sample):", + "\tdef __init__(*args, **kwargs):", + "\t\tsuper({basename}{index}, self).__init__(*args, **kwargs)"] + non_sample_class_lines = [ + "class NonSample(object):", "\tdef __init__(self):", + "\t\tsuper(NonSample, self).__init__()"] + + # We expect the subtype iff there's just one Sample subtype. + if num_sample_subclasses == 1: + exp_subtype_name = "{}0".format(sample_subclass_basename) + else: + exp_subtype_name = Sample.__name__ + + # Fill in the class definition template lines. + def populate_sample_lines(n_classes): + return [[sample_lines[0].format(basename=sample_subclass_basename, + index=class_index), + sample_lines[1], + sample_lines[2].format(basename=sample_subclass_basename, + index=class_index)] + for class_index in range(n_classes)] + + # Determine the groups of lines to permute. + class_lines_pool = populate_sample_lines(num_sample_subclasses) + if decoy_class: + class_lines_pool.append(non_sample_class_lines) + + # Subtype fetch is independent of class declaration order, + # so validate each permutation. + for lines_order in itertools.permutations(class_lines_pool): + # Write out class declarations and invoke the function under test. + _create_module(lines_by_class=lines_order, filepath=pipe_path) + subtype = piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + + # Make the assertion on subtype name, getting additional + # information about the module that we defined if there's failure. + try: + assert exp_subtype_name == subtype.__name__ + except AssertionError: + with open(pipe_path, 'r') as f: + print("PIPELINE MODULE LINES: {}". 
+ format("".join(f.readlines()))) + raise + + + @pytest.mark.parametrize( + argnames="subtype_name", argvalues=[Sample.__name__]) + def test_Sample_as_name( + self, tmpdir, subtype_name, atac_pipe_name, + subtypes_section_spec_type, atacseq_piface_data_with_subtypes): + """ A pipeline may redeclare Sample as a subtype name. """ + + # General values for the test + subtype_name = Sample.__name__ + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # Define the subtype in the pipeline module. + lines = ["from looper.models import Sample\n", + "class {}({}):\n".format(subtype_name, subtype_name), + "\tdef __init__(self, *args, **kwargs):\n", + "\t\tsuper({}, self).__init__(*args, **kwargs)\n". + format(subtype_name)] + with open(pipe_path, 'w') as pipe_module_file: + for l in lines: + pipe_module_file.write(l) + + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data_with_subtypes, + dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + subtype = piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + + # Establish that subclass relationship is improper. + assert issubclass(Sample, Sample) + # Our subtype derives from base Sample... + assert issubclass(subtype, Sample) + # ...but not vice-versa. + assert not issubclass(Sample, subtype) + # And we retained the name. + assert subtype.__name__ == Sample.__name__ + + + @pytest.mark.parametrize(argnames="subtype_name", argvalues=["NonSample"]) + @pytest.mark.parametrize( + argnames="test_type", argvalues=["return_sample", "class_found"]) + def test_subtype_is_not_Sample( + self, tmpdir, atac_pipe_name, subtype_name, test_type, + atacseq_piface_data_with_subtypes, subtypes_section_spec_type): + """ Subtype in interface but not in pipeline is exceptional. """ + + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # Write out pipeline module file with non-Sample class definition. + lines = _class_definition_lines(subtype_name, name_super_type="object") + with open(pipe_path, 'w') as pipe_module_file: + pipe_module_file.write("{}\n\n".format(SAMPLE_IMPORT)) + for l in lines: + pipe_module_file.write(l) + + # Create the ProtocolInterface and do the test call. + path_config_file = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data_with_subtypes, + dirpath=tmpdir.strpath) + piface = ProtocolInterface(path_config_file) + with pytest.raises(ValueError): + piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + + + @pytest.mark.parametrize(argnames="subtype_name", argvalues=["irrelevant"]) + @pytest.mark.parametrize(argnames="decoy_class", argvalues=[False, True], + ids=lambda decoy: " decoy = {} ".format(decoy)) + def test_subtype_not_implemented( + self, tmpdir, atac_pipe_name, subtype_name, decoy_class, + atacseq_piface_data_with_subtypes, subtypes_section_spec_type): + """ Subtype that doesn't extend Sample isn't used. """ + # Create the pipeline module. 
+ pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + lines = _class_definition_lines("Decoy", "object") \ + if decoy_class else [] + with open(pipe_path, 'w') as modfile: + modfile.write("{}\n\n".format(SAMPLE_IMPORT)) + for l in lines: + modfile.write(l) + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data_with_subtypes, + dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + with pytest.raises(ValueError): + piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + + + @pytest.mark.parametrize( + argnames="subtype_name", argvalues=["SubsampleA", "SubsampleB"]) + def test_matches_sample_subtype( + self, tmpdir, atac_pipe_name, subtype_name, atacseq_piface_data): + """ Fetch of subtype is specific even from among multiple subtypes. """ + + # Basic values + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + decoy_class = "Decoy" + decoy_proto = "DECOY" + + # Update the ProtocolInterface data and write it out. + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = { + ATAC_PROTOCOL_NAME: subtype_name, decoy_proto: decoy_class} + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name, + decoy_proto: atac_pipe_name}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + + # Create the collection of definition lines for each class. + legit_lines = _class_definition_lines(subtype_name, Sample.__name__) + decoy_lines = _class_definition_lines(decoy_class, Sample.__name__) + + for lines_order in itertools.permutations([legit_lines, decoy_lines]): + with open(pipe_path, 'w') as pipe_mod_file: + pipe_mod_file.write("{}\n\n".format(SAMPLE_IMPORT)) + for class_lines in lines_order: + for line in class_lines: + pipe_mod_file.write(line) + pipe_mod_file.write("\n\n") + + # We need the new pipeline module file in place before the + # ProtocolInterface is created. + piface = ProtocolInterface(conf_path) + subtype = piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + assert subtype_name == subtype.__name__ + + + @pytest.mark.parametrize( + argnames="spec_type", argvalues=["single", "nested"]) + def test_subtypes_list( + self, tmpdir, atac_pipe_name, atacseq_piface_data, spec_type): + """ As singleton or within mapping, only 1 subtype allowed. """ + + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + + # Define the classes, writing them in the pipeline module file. + subtype_names = ["ArbitraryA", "PlaceholderB"] + with open(pipe_path, 'w') as pipe_module_file: + pipe_module_file.write("{}\n\n".format(SAMPLE_IMPORT)) + for subtype_name in subtype_names: + # Have the classes be Sample subtypes. + for line in _class_definition_lines( + subtype_name, name_super_type=Sample.__name__): + pipe_module_file.write(line) + pipe_module_file.write("\n\n") + + # Update the ProtocolInterface data. + subtype_section = subtype_names if spec_type == "single" \ + else {ATAC_PROTOCOL_NAME: subtype_names} + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = subtype_section + + # Create the ProtocolInterface. + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + piface = ProtocolInterface(conf_path) + + # We don't really care about exception type, just that one arises. 
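+        # Whether given directly or nested under the protocol name, a list
+        # of multiple subtype names is ambiguous, so resolution should fail.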
+ with pytest.raises(Exception): + piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, + strict_pipe_key=atac_pipe_name, full_pipe_path=pipe_path) + + + @pytest.mark.parametrize( + argnames="target", argvalues=["Leaf", "Middle"]) + @pytest.mark.parametrize( + argnames="spec_type", argvalues=["single", "mapping"]) + def test_sample_grandchild( + self, tmpdir, spec_type, target, + atacseq_piface_data, atac_pipe_name): + """ The subtype to be used can be a grandchild of Sample. """ + + pipe_path = os.path.join(tmpdir.strpath, atac_pipe_name) + intermediate_sample_subtype = "Middle" + leaf_sample_subtype = "Leaf" + + intermediate_subtype_lines = _class_definition_lines( + intermediate_sample_subtype, Sample.__name__) + leaf_subtype_lines = _class_definition_lines( + leaf_sample_subtype, intermediate_sample_subtype) + with open(pipe_path, 'w') as pipe_mod_file: + pipe_mod_file.write("{}\n\n".format(SAMPLE_IMPORT)) + for l in intermediate_subtype_lines: + pipe_mod_file.write(l) + pipe_mod_file.write("\n\n") + for l in leaf_subtype_lines: + pipe_mod_file.write(l) + + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = \ + target if spec_type == "single" else \ + {ATAC_PROTOCOL_NAME: target} + conf_path = _write_config_data( + protomap={ATAC_PROTOCOL_NAME: atac_pipe_name}, + conf_data=atacseq_piface_data, dirpath=tmpdir.strpath) + + piface = ProtocolInterface(conf_path) + subtype = piface.fetch_sample_subtype( + protocol=ATAC_PROTOCOL_NAME, strict_pipe_key=atac_pipe_name, + full_pipe_path=pipe_path) + + assert target == subtype.__name__ + + + @pytest.fixture(scope="function") + def atacseq_piface_data_with_subtypes( + self, request, atacseq_piface_data, atac_pipe_name): + """ + Provide test case with ProtocolInterface data. + + :param pytest._pytest.fixtures.SubRequest request: test case + requesting the parameterization + :param Mapping atacseq_piface_data: the ProtocolInterface data + :param str atac_pipe_name: name for the pipeline + :return Mapping: same as input, but with Sample subtype specification + section mixed in + """ + + # Get the test case's parameterized values. + spec_type = request.getfixturevalue("subtypes_section_spec_type") + subtype_name = request.getfixturevalue("subtype_name") + + # Determine how to specify the subtype(s). + if spec_type is str: + section_value = subtype_name + elif spec_type is dict: + section_value = {ATAC_PROTOCOL_NAME: subtype_name} + else: + raise ValueError("Unexpected subtype section specification type: " + "{}".format(spec_type)) + + # Update and return the interface data. + atacseq_piface_data[atac_pipe_name][SUBTYPES_KEY] = section_value + return atacseq_piface_data + + + +def _class_definition_lines(name, name_super_type): + """ Create lines that define a class. """ + return ["class {t}({st}):\n".format(t=name, st=name_super_type), + "\tdef __init__(self, *args, **kwarggs):\n", + "\t\tsuper({t}, self).__init__(*args, **kwargs)".format( + t=name, st=name_super_type)] + + + +def _create_module(lines_by_class, filepath): + """ + Write out lines that will defined a module. 
+ + :param Sequence[str] lines_by_class: lines that define a class + :param str filepath: path to module file to create + :return str: path to the module file written + """ + lines = "\n\n".join( + [SAMPLE_IMPORT] + ["\n".join(class_lines) + for class_lines in lines_by_class]) + with open(filepath, 'w') as modfile: + modfile.write("{}\n".format(lines)) + return filepath + + + +def _write_config_data(protomap, conf_data, dirpath): + """ + Write ProtocolInterface data to (temp)file. + + :param Mapping protomap: mapping from protocol name to pipeline key/name + :param Mapping conf_data: mapping from pipeline key/name to configuration + data for a PipelineInterface + :param str dirpath: path to filesystem location in which to place the + file to write + :return str: path to the (temp)file written + """ + full_conf_data = {"protocol_mapping": protomap, "pipelines": conf_data} + filepath = os.path.join(dirpath, "pipeline_interface.yaml") + with open(filepath, 'w') as conf_file: + yaml.safe_dump(full_conf_data, conf_file) + return filepath diff --git a/tests/models/independent/test_Sample.py b/tests/models/independent/test_Sample.py index 6532ee38..8c5772b4 100644 --- a/tests/models/independent/test_Sample.py +++ b/tests/models/independent/test_Sample.py @@ -1,12 +1,13 @@ """ Tests for the Sample. """ import os +import tempfile import mock import numpy as np from pandas import Series import pytest import looper -from looper.models import Sample +from looper.models import Sample, SAMPLE_NAME_COLNAME __author__ = "Vince Reuter" @@ -17,46 +18,33 @@ class ParseSampleImplicationsTests: """ Tests for appending columns/fields to a Sample based on a mapping. """ - IMPLIER_NAME = "sample_name" + IMPLIER_NAME = SAMPLE_NAME_COLNAME IMPLIER_VALUES = ["a", "b"] SAMPLE_A_IMPLICATIONS = {"genome": "hg38", "phenome": "hg72"} SAMPLE_B_IMPLICATIONS = {"genome": "hg38"} - IMPLICATIONS = [SAMPLE_A_IMPLICATIONS, SAMPLE_B_IMPLICATIONS] - IMPLICATIONS_MAP = { - IMPLIER_NAME: IMPLICATIONS - } + IMPLICATIONS = {"a": SAMPLE_A_IMPLICATIONS, "b": SAMPLE_B_IMPLICATIONS} + IMPLICATIONS_MAP = {IMPLIER_NAME: IMPLICATIONS} - def test_project_lacks_implications(self, sample): + @pytest.mark.parametrize(argnames="implications", argvalues=[None, {}, []]) + def test_project_no_implications(self, sample, implications): """ With no implications mapping, sample is unmodified. """ before_inference = sample.__dict__ - with mock.patch.object(sample, "prj", create=True): - sample.infer_columns() + sample.infer_columns(implications) after_inference = sample.__dict__ assert before_inference == after_inference - def test_empty_implications(self, sample): - """ Empty implications mapping --> unmodified sample. """ - before_inference = sample.__dict__ - implications = mock.MagicMock(implied_columns={}) - with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns() - assert before_inference == sample.__dict__ - - def test_null_intersection_between_sample_and_implications(self, sample): """ Sample with none of implications' fields --> no change. 
""" before_inference = sample.__dict__ - implications = mock.MagicMock(implied_columns=self.IMPLICATIONS_MAP) - with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns() + sample.infer_columns(self.IMPLICATIONS_MAP) assert before_inference == sample.__dict__ @pytest.mark.parametrize( argnames=["implier_value", "implications"], - argvalues=zip(IMPLIER_VALUES, IMPLICATIONS), + argvalues=IMPLICATIONS.items(), ids=lambda implier_and_implications: "implier='{}', implications={}".format( implier_and_implications[0], str(implier_and_implications[1]))) @@ -70,11 +58,7 @@ def test_intersection_between_sample_and_implications( # Set the parameterized value for the implications source field. setattr(sample, self.IMPLIER_NAME, implier_value) - - # Perform column inference based on mocked implications. - implications = mock.MagicMock(implied_columns=self.IMPLICATIONS_MAP) - with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns() + sample.infer_columns(self.IMPLICATIONS_MAP) # Validate updates to sample based on column implications & inference. for implied_name, implied_value in implications.items(): @@ -84,29 +68,18 @@ def test_intersection_between_sample_and_implications( @pytest.mark.parametrize( argnames="unmapped_implier_value", argvalues=["totally-wacky-value", 62, None, np.nan]) - @pytest.mark.parametrize( - argnames="implications", argvalues=IMPLICATIONS, - ids=lambda implications: "implied={}".format(str(implications))) def test_sample_has_unmapped_value_for_implication( - self, sample, unmapped_implier_value, implications): + self, sample, unmapped_implier_value): """ Unknown value in implier field --> null inference. """ - # Negative control pre-/post-test. def no_implied_values(): assert all([not hasattr(sample, implied_field_name) - for implied_field_name in implications.keys()]) - + for implied_field_name in self.IMPLICATIONS.keys()]) no_implied_values() - - # Set the parameterized value for the implications source field. setattr(sample, self.IMPLIER_NAME, unmapped_implier_value) - - # Perform column inference based on mocked implications. - implications = mock.MagicMock(implied_columns=self.IMPLICATIONS_MAP) - with mock.patch.object(sample, "prj", create=True, new=implications): - sample.infer_columns() + sample.infer_columns(self.IMPLICATIONS_MAP) no_implied_values() @@ -128,7 +101,7 @@ def sample(self, request): data = request.getfixturevalue("data") else: data = {} - data.setdefault("sample_name", "test-sample") + data.setdefault(SAMPLE_NAME_COLNAME, "test-sample") # Mock the validation and return a new Sample. rubber_stamper = mock.MagicMock(return_value=[]) @@ -150,12 +123,11 @@ class SampleRequirementsTests: ids=lambda has_name: "has_name: {}".format(has_name)) def test_requires_sample_name(self, has_name, data_type): data = {} - sample_name_key = "sample_name" sample_name = "test-sample" if has_name: - data[sample_name_key] = sample_name + data[SAMPLE_NAME_COLNAME] = sample_name sample = Sample(data_type(data)) - assert sample_name == getattr(sample, sample_name_key) + assert sample_name == getattr(sample, SAMPLE_NAME_COLNAME) else: with pytest.raises(ValueError): Sample(data_type(data)) @@ -168,7 +140,7 @@ def test_requires_sample_name(self, has_name, data_type): @pytest.mark.parametrize(argnames="data_type", argvalues=[dict, Series]) def test_exception_type_matches_access_mode(data_type, accessor): """ Exception for attribute access failure reflects access mode. 
""" - data = {"sample_name": "placeholder"} + data = {SAMPLE_NAME_COLNAME: "placeholder"} sample = Sample(data_type(data)) if accessor == "attr": with pytest.raises(AttributeError): @@ -191,6 +163,7 @@ def test_exception_type_matches_access_mode(data_type, accessor): argnames="preexists", argvalues=[False, True], ids=lambda exists: "preexists={}".format(exists)) def test_make_sample_dirs(paths, preexists, tmpdir): + """ Existence guarantee Sample instance's folders is safe and valid. """ # Derive full paths and assure nonexistence before creation. fullpaths = [] @@ -202,7 +175,7 @@ def test_make_sample_dirs(paths, preexists, tmpdir): fullpaths.append(fullpath) # Make the sample and assure paths preexistence. - s = Sample({"sample_name": "placeholder"}) + s = Sample({SAMPLE_NAME_COLNAME: "placeholder"}) s.paths = fullpaths # Base the test's initial condition on the parameterization. diff --git a/tests/models/independent/test_SampleSheet.py b/tests/models/independent/test_SampleSheet.py deleted file mode 100644 index 27d56e78..00000000 --- a/tests/models/independent/test_SampleSheet.py +++ /dev/null @@ -1,16 +0,0 @@ -""" Tests for the SampleSheet model. """ - -import pandas as pd -import pytest -from looper.models import SampleSheet - - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - - -# TODO: implement a few of these. -@pytest.mark.skip("Not implemented") -class SampleSheetRoundtripTests: - pass diff --git a/tests/models/integration/test_Project_Sample_interaction.py b/tests/models/integration/test_Project_Sample_interaction.py index 0067ff82..12d0a7fb 100644 --- a/tests/models/integration/test_Project_Sample_interaction.py +++ b/tests/models/integration/test_Project_Sample_interaction.py @@ -1,11 +1,19 @@ """ Tests for interaction between a Project and a Sample. """ from collections import OrderedDict +import copy +import itertools import os +import random + import pandas as pd import pytest import yaml -from looper.models import Project, SAMPLE_ANNOTATIONS_KEY + +from looper.models import \ + Project, Sample, \ + SAMPLE_ANNOTATIONS_KEY, SAMPLE_NAME_COLNAME +from looper.utils import alpha_cased __author__ = "Vince Reuter" @@ -22,9 +30,171 @@ "input_dir": "dummy/sequencing/data", "tools_folder": "arbitrary-seq-tools-folder"} +NAME_ANNOTATIONS_FILE = "annotations.csv" +SAMPLE_NAMES = ["WGBS_mm10", "ATAC_mm10", "WGBS_rn6", "ATAC_rn6"] +COLUMNS = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"] +VALUES1 = [random.randint(-5, 5) for _ in range(len(SAMPLE_NAMES))] +VALUES2 = [random.randint(-5, 5) for _ in range(len(SAMPLE_NAMES))] +LIBRARIES = ["WGBS", "ATAC", "WGBS", "ATAC"] +DATA = list(zip(SAMPLE_NAMES, VALUES1, VALUES2, LIBRARIES)) +DATA_FOR_SAMPLES = [ + {SAMPLE_NAME_COLNAME: SAMPLE_NAMES}, + {"val1": VALUES1}, {"val2": VALUES2}, {"library": LIBRARIES}] +PROJECT_CONFIG_DATA = {"metadata": {"sample_annotation": NAME_ANNOTATIONS_FILE}} +PROTOCOLS = ["WGBS", "ATAC"] + + + +def pytest_generate_tests(metafunc): + """ Customization of test cases within this module. """ + if metafunc.cls == BuildSheetTests: + if "protocols" in metafunc.fixturenames: + # Apply the test case to each of the possible combinations of + # protocols, from none at all up to all of them. 
+ metafunc.parametrize( + argnames="protocols", + argvalues=list(itertools.chain.from_iterable( + itertools.combinations(PROTOCOLS, x) + for x in range(1 + len(PROTOCOLS)))), + ids=lambda protos: + " protocols = {} ".format(",".join(protos))) + if "delimiter" in metafunc.fixturenames: + metafunc.parametrize(argnames="delimiter", argvalues=[",", "\t"]) + + + +@pytest.fixture(scope="function") +def proj_conf(): + """ Provide the basic configuration data. """ + return copy.deepcopy(PROJECT_CONFIG_DATA) + + + +@pytest.fixture(scope="function") +def path_proj_conf_file(tmpdir, proj_conf): + """ Write basic project configuration data and provide filepath. """ + conf_path = os.path.join(tmpdir.strpath, "project_config.yaml") + with open(conf_path, 'w') as conf: + yaml.safe_dump(proj_conf, conf) + return conf_path + + + +@pytest.fixture(scope="function") +def path_anns_file(request, tmpdir, sample_sheet): + """ Write basic annotations, optionally using a different delimiter. """ + filepath = os.path.join(tmpdir.strpath, NAME_ANNOTATIONS_FILE) + if "delimiter" in request.fixturenames: + delimiter = request.getfixturevalue("delimiter") + else: + delimiter = "," + with open(filepath, 'w') as anns_file: + sample_sheet.to_csv(anns_file, sep=delimiter, index=False) + return filepath + + + +@pytest.fixture(scope="function") +def samples_rawdata(): + return copy.deepcopy(DATA) + + + +@pytest.fixture(scope="function") +def sample_sheet(samples_rawdata): + df = pd.DataFrame(samples_rawdata) + df.columns = [SAMPLE_NAME_COLNAME, "val1", "val2", "library"] + return df + + + +def test_samples_are_generic(path_anns_file, path_proj_conf_file): + """ Regardless of protocol, Samples for sheet are generic. """ + # Annotations filepath fixture is also writes that file, so + # it's needed even though that return value isn't used locally. + p = Project(path_proj_conf_file) + assert len(SAMPLE_NAMES) == p.num_samples + samples = list(p.samples) + assert p.num_samples == len(samples) + assert all([Sample is type(s) for s in samples]) + + + +class BuildSheetTests: + """ Tests for construction of sheet of Project's Samples. """ + + # Note: seemingly unused parameters may affect parameterization + # logic of other fixtures used by a test case; tread lightly. + + + def test_no_samples(self, protocols, delimiter, path_empty_project): + """ Lack of Samples is unproblematic for the sheet build. """ + # Regardless of protocol(s), the sheet should be empty. + p = Project(path_empty_project) + sheet = p.build_sheet(*protocols) + assert sheet.empty + + + @pytest.mark.parametrize( + argnames="which_sample_index", argvalues=range(len(SAMPLE_NAMES))) + def test_single_sample( + self, tmpdir, path_proj_conf_file, which_sample_index): + """ Single Sample is perfectly valid for Project and sheet. """ + + # Pull out the values for the current sample. + values = DATA[which_sample_index] + + # Write the annotations. + anns_path = os.path.join(tmpdir.strpath, NAME_ANNOTATIONS_FILE) + with open(anns_path, 'w') as anns_file: + anns_file.write("{}\n".format(",".join(COLUMNS))) + anns_file.write("{}\n".format(",".join([str(v) for v in values]))) + + # Build the sheet. + p = Project(path_proj_conf_file) + sheet = p.build_sheet() + + # It should be a single-row DataFrame. + assert isinstance(sheet, pd.DataFrame) + assert 1 == len(sheet) + assert 1 == p.num_samples + + # There will be additional values added from the Project, + # but the core data values will have remained the same. 
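+        # Values round-trip through the annotations CSV as text, so numeric
+        # columns may come back as strings; the int() fallback below
+        # accounts for that.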
+ sample = list(p.samples)[0] + for attr, exp_val in zip(COLUMNS, values): + obs_val = getattr(sample, attr) + try: + assert exp_val == obs_val + except AssertionError as e: + try: + assert exp_val == int(obs_val) + except AssertionError: + raise e + + + def test_multiple_samples( + self, protocols, path_anns_file, path_proj_conf_file): + """ Project also processes multiple Sample fine. """ + + p = Project(path_proj_conf_file) + + # Total sample count is constant. + assert len(SAMPLE_NAMES) == sum(1 for _ in p.samples) + + # But the sheet permits filtering to specific protocol(s). + exp_num_samples = len(SAMPLE_NAMES) if not protocols else \ + sum(sum(1 for l in LIBRARIES if l == p) for p in protocols) + sheet = p.build_sheet(*protocols) + assert exp_num_samples == len(sheet) + if protocols: + fuzzy_protos = {alpha_cased(p) for p in protocols} + for _, sample_data in sheet.iterrows(): + assert alpha_cased(sample_data.library) in fuzzy_protos + -class ProjectSampleInteractionTests: +class SampleFolderCreationTests: """ Tests for interaction between Project and Sample. """ CONFIG_DATA_PATHS_HOOK = "uses_paths_section" diff --git a/tests/models/test_models_smoke.py b/tests/models/test_models_smoke.py index 97dd411e..83c07879 100644 --- a/tests/models/test_models_smoke.py +++ b/tests/models/test_models_smoke.py @@ -1,36 +1,117 @@ """ Basic smoketests for models """ +import logging import pytest -from looper.models import AttributeDict +import looper +from looper.models import AttributeDict, Project + __author__ = "Vince Reuter" __email__ = "vreuter@virgnia.edu" +_LOGGER = logging.getLogger(__name__) + + def pytest_generate_tests(metafunc): """ Dynamic test case parameterization. """ - if metafunc.cls == AttributeDictRepresentationSmokeTests: - metafunc.parametrize(argnames="representation_method", - argvalues=["__repr__", "__str__"]) + if "funcname" in metafunc.fixturenames: + metafunc.parametrize( + argnames="funcname", argvalues=["__repr__", "__str__"]) @pytest.mark.usefixtures("write_project_files") -class AttributeDictRepresentationSmokeTests: +class AttributeDictRepresentationTests: """ Non-fail validation of AttributeDict representations. """ @pytest.mark.parametrize( argnames="data", argvalues=[[('CO', 145)], {'CO': {"US-50": [550, 62, 145]}}]) - def test_AttributeDict_representations( - self, data, representation_method): + def test_AttributeDict_representations_smoke( + self, data, funcname): """ Text representation of base AttributeDict doesn't fail. """ attrdict = AttributeDict(data) - getattr(attrdict, representation_method).__call__() + getattr(attrdict, funcname).__call__() - def test_Project_representations(self, proj, representation_method): + def test_Project_representations_smoke(self, proj, funcname): """ Representation of Project (AttributeDict subclass) is failsafe. """ - getattr(proj, representation_method).__call__() + getattr(proj, funcname).__call__() + + + def test_project_repr_name_inclusion(self, proj, funcname): + """ Test Project text representation. """ + func = getattr(proj, funcname) + result = func.__call__() + assert type(result) is str + classname = proj.__class__.__name__ + if funcname == "__str__": + assert classname in result + elif funcname == "__repr__": + assert classname not in result + else: + raise ValueError("Unexpected representation function: {}". + format(funcname)) + + + +class ModelCreationSmokeTests: + """ Smoketests for creation of various types of project-related models. """ + + # TODO: migrate these to pytest.raises(None) with 3.1. 
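+    # For now, simply constructing the Project serves as the assertion
+    # that no exception is raised.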
+ + def test_empty_project(self, path_empty_project): + """ It's unproblematic to create a Project that lacks samples. """ + Project(path_empty_project) + + + +class ModelRepresentationSmokeTests: + """ Tests for the text representation of important ADTs. """ + + # NOTE: similar parameterization, but Project construction needs + # to be handled with greater care when testing the actual call. + + @pytest.mark.parametrize( + argnames="class_name", argvalues=looper.models.__classes__) + def test_implements_repr_smoke(self, class_name): + """ Each important ADT must implement a representation method. """ + + funcname = "__repr__" + + # Attempt a control assertion, that a subclass that doesn't override + # the given method of its superclass, uses the superclass version of + # the function in question. + class ObjectSubclass(object): + def __init__(self): + super(ObjectSubclass, self).__init__() + assert getattr(ObjectSubclass, funcname) is getattr(object, funcname) + + # Make the actual assertion of interest. + adt = getattr(looper.models, class_name) + assert getattr(adt, funcname) != \ + getattr(adt.__bases__[0], funcname) + + + @pytest.mark.parametrize( + argnames="class_name", + argvalues=[cn for cn in looper.models.__classes__ + if cn != "Project"]) + def test_repr_smoke( + self, tmpdir, class_name, basic_instance_data, funcname): + """ Object representation method successfully returns string. """ + # Note that tmpdir is used when config file needs to be written. + cls = getattr(looper.models, class_name) + instance = cls(basic_instance_data) + func = getattr(instance, funcname) + result = func.__call__() + if funcname == "__str__": + assert class_name in result + elif funcname == "__repr__": + assert type(result) is str + else: + raise ValueError("Unexpected representation method: {}". + format(funcname)) diff --git a/tests/test_looper.py b/tests/test_looper.py index 14b8b331..ff3e24c2 100644 --- a/tests/test_looper.py +++ b/tests/test_looper.py @@ -14,13 +14,12 @@ import numpy.random as nprand import pytest -import yaml from looper.looper import aggregate_exec_skip_reasons import looper.models -from looper.models import AttributeDict, ATTRDICT_METADATA, COL_KEY_SUFFIX +from looper.models import COL_KEY_SUFFIX from .conftest import \ - DERIVED_COLNAMES, EXPECTED_MERGED_SAMPLE_FILES, FILE_BY_SAMPLE, \ + DERIVED_COLNAMES, EXPECTED_MERGED_SAMPLE_FILES, \ LOOPER_ARGS_BY_PIPELINE, MERGED_SAMPLE_INDICES, NGS_SAMPLE_INDICES, \ NUM_SAMPLES, PIPELINE_TO_REQD_INFILES_BY_SAMPLE @@ -32,15 +31,11 @@ @pytest.mark.usefixtures("write_project_files") class ProjectConstructorTest: - # TODO: docstrings and atomicity/encapsulation. - # TODO: conversion to pytest for consistency. - @pytest.mark.parametrize(argnames="attr_name", argvalues=["required_inputs", "all_input_attr"]) def test_sample_required_inputs_not_set(self, proj, attr_name): """ Samples' inputs are not set in `Project` ctor. """ - # TODO: update this to check for null if design is changed as may be. with pytest.raises(AttributeError): getattr(proj.samples[nprand.randint(len(proj.samples))], attr_name) @@ -67,11 +62,12 @@ def test_data_sources_derivation(self, proj, sample_index): merged_columns = filter( lambda col_key: (col_key != "col_modifier") and not col_key.endswith(COL_KEY_SUFFIX), - proj.samples[sample_index].merged_cols.keys() - ) + proj.samples[sample_index].merged_cols.keys()) # Order may be lost due to mapping. # We don't care about that here, or about duplicates. 
-        assert set(DERIVED_COLNAMES) == set(merged_columns)
+        expected = set(DERIVED_COLNAMES)
+        observed = set(merged_columns)
+        assert expected == observed
 
 
@@ -94,29 +90,13 @@ def test_unmerged_samples_lack_merged_cols(self, proj, sample_index):
         assert not proj.samples[sample_index].merged_cols
 
-    @pytest.mark.parametrize(argnames="sample_index",
-                             argvalues=range(NUM_SAMPLES))
-    def test_multiple_add_sample_sheet_calls_no_rederivation(self, proj,
-                                                             sample_index):
-        """ Don't rederive `derived_columns` for multiple calls. """
-        expected_files = FILE_BY_SAMPLE[sample_index]
-        def _observed(p):
-            return [os.path.basename(f)
-                    for f in p.samples[sample_index].file.split(" ")]
-        assert expected_files == _observed(proj)
-        proj.add_sample_sheet()
-        proj.add_sample_sheet()
-        assert expected_files == _observed(proj)
-        proj.add_sample_sheet()
-        assert expected_files == _observed(proj)
-
-
     def test_duplicate_derived_columns_still_derived(self, proj):
         sample_index = 2
         observed_nonmerged_col_basename = \
             os.path.basename(proj.samples[sample_index].nonmerged_col)
         assert "c.txt" == observed_nonmerged_col_basename
-        assert "" == proj.samples[sample_index].locate_data_source('file')
+        assert "" == proj.samples[sample_index].locate_data_source(
+            proj.data_sources, 'file')
 
 
@@ -139,7 +119,8 @@ def test_required_inputs(self, proj, pipe_iface, sample_index):
         observed_required_inputs = [os.path.basename(f)
                                     for f in sample.required_inputs]
         assert expected_required_inputs == observed_required_inputs
-        assert sample.confirm_required_inputs()
+        error_type, error_message = sample.determine_missing_requirements()
+        assert error_type is None and not error_message
 
 
@@ -154,7 +135,8 @@ def test_ngs_pipe_ngs_sample(self, proj, pipe_iface, sample_index):
                                  [sample_index][0])
         observed_required_input_basename = \
             os.path.basename(sample.required_inputs[0])
-        assert sample.confirm_required_inputs()
+        error_type, error_message = sample.determine_missing_requirements()
+        assert error_type is None and not error_message
         assert 1 == len(sample.required_inputs)
         assert expected_required_input_basename == \
             observed_required_input_basename
 
 
@@ -223,102 +205,6 @@ def test_looper_args_usage(self, pipe_iface, pipeline, expected):
 
 
-@pytest.mark.usefixtures("write_project_files")
-class SampleRoundtripTests:
-    """ Test equality of objects written to and from YAML files. """
-
-
-    def test_default_behavioral_metadata_retention(self, tmpdir, proj):
-        """ With default metadata, writing to file and restoring is OK. """
-        tempfolder = str(tmpdir)
-        sample_tempfiles = []
-        for sample in proj.samples:
-            path_sample_tempfile = os.path.join(tempfolder,
-                                                "{}.yaml".format(sample.name))
-            sample.to_yaml(path_sample_tempfile)
-            sample_tempfiles.append(path_sample_tempfile)
-        for original_sample, temp_sample_path in zip(proj.samples,
-                                                     sample_tempfiles):
-            with open(temp_sample_path, 'r') as sample_file:
-                restored_sample_data = yaml.load(sample_file)
-            ad = AttributeDict(restored_sample_data)
-            self._metadata_equality(original_sample.prj, ad)
-
-
-    def test_modified_behavioral_metadata_preservation(self, tmpdir, proj):
-        """ Behavior metadata modifications are preserved to/from disk. """
-        tempfolder = str(tmpdir)
-        sample_tempfiles = []
-        samples = proj.samples
-        assert 1 < len(samples), "Too few samples: {}".format(len(samples))
-
-        # TODO: note that this may fail if metadata
-        # modification prohibition is implemented.
-        samples[0].prj.__dict__["_force_nulls"] = True
-        samples[1].prj.__dict__["_attribute_identity"] = True
-
-        for sample in proj.samples[:2]:
-            path_sample_tempfile = os.path.join(tempfolder,
-                                                "{}.yaml".format(sample.name))
-            sample.to_yaml(path_sample_tempfile)
-            sample_tempfiles.append(path_sample_tempfile)
-
-        with open(sample_tempfiles[0], 'r') as f:
-            sample_0_data = yaml.load(f)
-        assert AttributeDict(sample_0_data).prj._force_nulls is True
-
-        with open(sample_tempfiles[1], 'r') as f:
-            sample_1_data = yaml.load(f)
-        sample_1_restored_attrdict = AttributeDict(sample_1_data)
-        assert sample_1_restored_attrdict.prj.does_not_exist == "does_not_exist"
-
-
-    def _check_nested_metadata(self, original, restored):
-        """
-        Check equality for metadata items, accounting for nesting within
-        instances of AttributeDict and its child classes.
-
-        :param AttributeDict original: original AttributeDict (or child) object
-        :param AttributeDict restored: instance restored from writing
-            original object to file, then reparsing and constructing
-            AttributeDict instance
-        :return bool: whether metadata items are equivalent between objects
-            at all nesting levels
-        """
-        for key, data in original.items():
-            if key not in restored:
-                return False
-            equal_level = self._metadata_equality(original, restored)
-            if not equal_level:
-                return False
-            if isinstance(original, AttributeDict):
-                return isinstance(restored, AttributeDict) and \
-                    self._check_nested_metadata(data, restored[key])
-            else:
-                return True
-
-
-    @staticmethod
-    def _metadata_equality(original, restored):
-        """
-        Check nested levels of metadata equality.
-
-        :param AttributeDict original: a raw AttributeDict or an
-            instance of a child class that was serialized and written to disk
-        :param AttributeDict restored: an AttributeDict instance created by
-            parsing the file associated with the original object
-        :return bool: whether all metadata keys/items have equal value
-            when comparing original object to restored version
-        """
-        for metadata_item in ATTRDICT_METADATA:
-            if metadata_item not in original or \
-                    metadata_item not in restored or \
-                    original[metadata_item] != restored[metadata_item]:
-                return False
-        return True
-
-
-
 class RunErrorReportTests:
     """ Tests for aggregation of submission failures. """